# Random Forest Model

In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import tensor
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

## Load Data

In [2]:
def load_data_and_filter_ids(file_path):
    """
    Loads data from a CSV file, checks for NaN values in 'step' column grouped by 'series_id',
    and returns a list of 'series_id' values that do not contain NaNs.
    
    :param file_path: Path to the CSV file.
    :return: List of series IDs without NaN values in the 'step' column.
    """
    # Load data from CSV
    train_events = pd.read_csv(file_path)

    # Group by 'series_id' and check for NaN values in 'step' column
    series_has_nan = train_events.groupby('series_id')['step'].apply(lambda x: x.isnull().any())

    # Get list of series IDs that do not contain NaN values
    train_ids = series_has_nan[~series_has_nan].index.tolist()

    return train_ids

# Usage example:
file_path = "../data/train_events.csv"
train_ids = load_data_and_filter_ids(file_path)

## Feature Engineering

In [3]:
def get_multi_light_series(series_ids):
    """
    Fetches and processes a dataset for the given series IDs.

    :param series_ids: List of series IDs to fetch.
    :return: Processed DataFrame with added features.
    """
    print(f'Fetching series IDs: {series_ids} \n')
    file_path = "../data/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet"
    multi_series = pd.read_parquet(file_path, filters=[('series_id', 'in', series_ids)])
    multi_series = multi_series.astype({'series_id': 'category', 'step': 'int16', 'awake': 'int16'})
    multi_series = add_features(multi_series)

    return multi_series

def add_features(df):
    """
    Adds various features to the DataFrame.

    :param df: DataFrame to which features are added.
    :return: DataFrame with added features.
    """
    df = add_time_features(df)
    df = add_interaction_features(df)
    df = add_rolling_features(df, periods=6)  # 1/2 minute

    return df

def add_time_features(df):
    """ Adds time-related features to the DataFrame. """
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    df["hour"] = df["timestamp"].dt.hour
    df["dayofweek"] = df["timestamp"].dt.dayofweek
    return df

def add_interaction_features(df):
    """ Adds interaction features to the DataFrame. """
    df["anglez_times_enmo"] = abs(df["anglez"]) * df["enmo"]
    return df

def add_rolling_features(df, periods):
    """ Adds rolling features to the DataFrame. """
    # Define operations to be applied
    operations = ["mean", "min", "max", "std"]
    columns = ["anglez", "enmo"]

    for column in columns:
        for operation in operations:
            df[f"{column}_{operation}"] = (
                df[column].rolling(periods, center=True).agg(operation).bfill().ffill().astype('float32')
            )

        # Differential features
        df[f"{column}_diff"] = (
            df.groupby('series_id', observed=True)[column].diff(periods=periods).bfill()
        )
        df[f"{column}_diff_rolling"] = (
            df[f"{column}_diff"].rolling(periods, center=True).mean().bfill().ffill().astype('float32')
        )

    return df


In [4]:
%time train_all = get_multi_light_series(train_ids[:8])
print(f'memory usage: {train_all.memory_usage().sum() / 1024**2: .2f} MB')

Fetching series IDs: ['08db4255286f', '0a96f4993bd7', '0cfc06c129cc', '1087d7b0ff2e', '10f8bc1f7b07', '18b61dd5aae8', '29c75c018220', '31011ade7c0a'] 



CPU times: total: 12.1 s
Wall time: 17.4 s
memory usage:  219.76 MB


In [5]:
train_all.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,hour,dayofweek,anglez_times_enmo,anglez_mean,...,anglez_max,anglez_std,anglez_diff,anglez_diff_rolling,enmo_mean,enmo_min,enmo_max,enmo_std,enmo_diff,enmo_diff_rolling
0,08db4255286f,0,2018-11-05 14:00:00+00:00,-30.845301,0.0447,1,14,0,1.378785,-33.749619,...,-30.845301,1.463509,0.331902,0.331902,0.055533,0.0443,0.0768,0.013588,0.0626,0.0626
1,08db4255286f,1,2018-11-05 14:00:05+00:00,-34.181801,0.0443,1,14,0,1.514254,-33.749619,...,-30.845301,1.463509,0.331902,0.331902,0.055533,0.0443,0.0768,0.013588,0.0626,0.0626
2,08db4255286f,2,2018-11-05 14:00:10+00:00,-33.877102,0.0483,1,14,0,1.636264,-33.749619,...,-30.845301,1.463509,0.331902,0.331902,0.055533,0.0443,0.0768,0.013588,0.0626,0.0626
3,08db4255286f,3,2018-11-05 14:00:15+00:00,-34.282101,0.068,1,14,0,2.331183,-33.749619,...,-30.845301,1.463509,0.331902,0.331902,0.055533,0.0443,0.0768,0.013588,0.0626,0.0626
4,08db4255286f,4,2018-11-05 14:00:20+00:00,-34.385799,0.0768,1,14,0,2.64083,-33.694302,...,-30.513399,1.595555,0.331902,0.331902,0.065967,0.0443,0.1073,0.023801,0.0626,0.0626


## Data Preprocessing

In [6]:
def scale_features_and_extract_target(df, feature_names, target_name):
    """
    Scales the features of the dataset and extracts the target variable.

    :param df: DataFrame containing the dataset.
    :param feature_names: List of feature names to be scaled.
    :param target_name: Name of the target variable.
    :return: Tuple of scaled features array and target variable array.
    """
    # Initialize the scaler
    scaler = StandardScaler()

    # Scale the features
    df_features_scaled = scaler.fit_transform(df[feature_names])

    # Extract the target variable
    df_target = df[target_name].values

    return df_features_scaled, df_target

# Example usage:
features = ["step", "hour", "dayofweek", "anglez_times_enmo",
            "anglez", "anglez_diff", "anglez_mean", "anglez_min", "anglez_max", "anglez_std", "anglez_diff_rolling",
            "enmo", "enmo_diff", "enmo_mean", "enmo_min", "enmo_max", "enmo_std", "enmo_diff_rolling"]
target = 'awake'

# Assuming 'train_all' is your DataFrame
df_train_X_scaled, df_train_y = scale_features_and_extract_target(train_all, features, target)

## Split Data

In [7]:
def prepare_data_and_split(df_features, df_target, split_ratio=0.8, convert_to_tensor=True):
    """
    Converts feature and target dataframes into PyTorch tensors and splits them into training and validation sets.

    :param df_features: DataFrame or array containing the feature data.
    :param df_target: DataFrame or array containing the target data.
    :param split_ratio: Float representing the proportion of the dataset to include in the train split.
    :param convert_to_tensor: Boolean 
    :return: Tuples of tensors (X_train, y_train), (X_val, y_val).
    """

    if convert_to_tensor:
        X = tensor(df_features, dtype=torch.float32)
        y = tensor(df_target, dtype=torch.long)
    else:
        X, y = df_features, df_target

    # Split the data
    split_index = int(len(X) * split_ratio)
    X_train, X_val = X[:split_index], X[split_index:]
    y_train, y_val = y[:split_index], y[split_index:]

    return (X_train, y_train), (X_val, y_val)

In [8]:
(X_train, y_train), (X_val, y_val) = prepare_data_and_split(df_train_X_scaled, df_train_y, convert_to_tensor=False)

# Checking the shapes
print("Train shapes (X, y):", X_train.shape, y_train.shape)
print("Validation shapes (X, y):", X_val.shape, y_val.shape)

Train shapes (X, y): (2275920, 18) (2275920,)
Validation shapes (X, y): (568980, 18) (568980,)


## Model

In [9]:
random_forest_classifier = RandomForestClassifier(n_estimators=50,
                                    min_samples_leaf=300,
                                    random_state=42,n_jobs=-1)


In [10]:
random_forest_classifier

## Training

In [11]:
random_forest_classifier.fit(X_train, y_train)

## Predictions

In [19]:
predictions = random_forest_classifier.predict(X_val)
predictions

array([0, 0, 0, ..., 1, 1, 1], dtype=int16)