In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
import random
import gc
from itertools import groupby

In [2]:
# Setting random seed for reproducibility
np.random.seed(0)

# Configuring pandas display options
pd.set_option('display.precision', 2)
pd.set_option('display.float_format', '{:.2f}'.format)

# Determining the default device based on availability
def_device = (
    'mps' if torch.backends.mps.is_available() 
    else 'cuda' if torch.cuda.is_available() 
    else 'cpu'
)

def_device

'cuda'

## Load Data

In [3]:
def load_data_and_filter_ids(file_path):
    """
    Loads data from a CSV file, checks for NaN values in 'step' column grouped by 'series_id',
    and returns a list of 'series_id' values that do not contain NaNs.
    
    :param file_path: Path to the CSV file.
    :return: List of series IDs without NaN values in the 'step' column.
    """
    # Load data from CSV
    train_events = pd.read_csv(file_path)

    # Group by 'series_id' and check for NaN values in 'step' column
    series_has_nan = train_events.groupby('series_id')['step'].apply(lambda x: x.isnull().any())

    # Get list of series IDs that do not contain NaN values
    train_ids = series_has_nan[~series_has_nan].index.tolist()

    return train_ids

# Usage example:
file_path = "../data/train_events.csv"
train_ids = load_data_and_filter_ids(file_path)

## Feature Engineering

In [4]:
def get_multi_light_series(series_ids):
    """
    Fetches and processes a dataset for the given series IDs.

    :param series_ids: List of series IDs to fetch.
    :return: Processed DataFrame with added features.
    """
    print(f'Fetching series IDs: {series_ids} \n')
    file_path = "../data/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet"
    multi_series = pd.read_parquet(file_path, filters=[('series_id', 'in', series_ids)])
    multi_series = multi_series.astype({'series_id': 'category', 'step': 'int16', 'awake': 'int16'})
    multi_series = add_features(multi_series)

    return multi_series

def add_features(df):
    """
    Adds various features to the DataFrame.

    :param df: DataFrame to which features are added.
    :return: DataFrame with added features.
    """
    df = add_time_features(df)
    df = add_interaction_features(df)
    df = add_rolling_features(df, periods=6)  # 1/2 minute

    return df

def add_time_features(df):
    """ Adds time-related features to the DataFrame. """
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    df["hour"] = df["timestamp"].dt.hour
    df["dayofweek"] = df["timestamp"].dt.dayofweek
    return df

def add_interaction_features(df):
    """ Adds interaction features to the DataFrame. """
    df["anglez_times_enmo"] = abs(df["anglez"]) * df["enmo"]
    return df

def add_rolling_features(df, periods):
    """ Adds rolling features to the DataFrame. """
    # Define operations to be applied
    operations = ["mean", "min", "max", "std"]
    columns = ["anglez", "enmo"]

    for column in columns:
        for operation in operations:
            df[f"{column}_{operation}"] = (
                df[column].rolling(periods, center=True).agg(operation).bfill().ffill().astype('float32')
            )

        # Differential features
        df[f"{column}_diff"] = (
            df.groupby('series_id', observed=True)[column].diff(periods=periods).bfill()
        )
        df[f"{column}_diff_rolling"] = (
            df[f"{column}_diff"].rolling(periods, center=True).mean().bfill().ffill().astype('float32')
        )

    return df


In [5]:
%time train_all = get_multi_light_series(train_ids[:8])
print(f'memory usage: {train_all.memory_usage().sum() / 1024**2: .2f} MB')

Fetching series IDs: ['08db4255286f', '0a96f4993bd7', '0cfc06c129cc', '1087d7b0ff2e', '10f8bc1f7b07', '18b61dd5aae8', '29c75c018220', '31011ade7c0a'] 



CPU times: total: 11.5 s
Wall time: 16.8 s
memory usage:  219.76 MB


In [6]:
train_all.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,hour,dayofweek,anglez_times_enmo,anglez_mean,...,anglez_max,anglez_std,anglez_diff,anglez_diff_rolling,enmo_mean,enmo_min,enmo_max,enmo_std,enmo_diff,enmo_diff_rolling
0,08db4255286f,0,2018-11-05 14:00:00+00:00,-30.85,0.04,1,14,0,1.38,-33.75,...,-30.85,1.46,0.33,0.33,0.06,0.04,0.08,0.01,0.06,0.06
1,08db4255286f,1,2018-11-05 14:00:05+00:00,-34.18,0.04,1,14,0,1.51,-33.75,...,-30.85,1.46,0.33,0.33,0.06,0.04,0.08,0.01,0.06,0.06
2,08db4255286f,2,2018-11-05 14:00:10+00:00,-33.88,0.05,1,14,0,1.64,-33.75,...,-30.85,1.46,0.33,0.33,0.06,0.04,0.08,0.01,0.06,0.06
3,08db4255286f,3,2018-11-05 14:00:15+00:00,-34.28,0.07,1,14,0,2.33,-33.75,...,-30.85,1.46,0.33,0.33,0.06,0.04,0.08,0.01,0.06,0.06
4,08db4255286f,4,2018-11-05 14:00:20+00:00,-34.39,0.08,1,14,0,2.64,-33.69,...,-30.51,1.6,0.33,0.33,0.07,0.04,0.11,0.02,0.06,0.06


## Data Preprocessing

In [7]:
def scale_features_and_extract_target(df, feature_names, target_name):
    """
    Scales the features of the dataset and extracts the target variable.

    :param df: DataFrame containing the dataset.
    :param feature_names: List of feature names to be scaled.
    :param target_name: Name of the target variable.
    :return: Tuple of scaled features array and target variable array.
    """
    # Initialize the scaler
    scaler = StandardScaler()

    # Scale the features
    df_features_scaled = scaler.fit_transform(df[feature_names])

    # Extract the target variable
    df_target = df[target_name].values

    return df_features_scaled, df_target

# Example usage:
features = ["step", "hour", "dayofweek", "anglez_times_enmo",
            "anglez", "anglez_diff", "anglez_mean", "anglez_min", "anglez_max", "anglez_std", "anglez_diff_rolling",
            "enmo", "enmo_diff", "enmo_mean", "enmo_min", "enmo_max", "enmo_std", "enmo_diff_rolling"]
target = 'awake'

# Assuming 'train_all' is your DataFrame
df_train_X_scaled, df_train_y = scale_features_and_extract_target(train_all, features, target)

## Split Data

In [8]:
def prepare_data_and_split(df_features, df_target, split_ratio=0.8):
    """
    Converts feature and target dataframes into PyTorch tensors and splits them into training and validation sets.

    :param df_features: DataFrame or array containing the feature data.
    :param df_target: DataFrame or array containing the target data.
    :param split_ratio: Float representing the proportion of the dataset to include in the train split.
    :return: Tuples of tensors (X_train, y_train), (X_val, y_val).
    """
    # Convert to PyTorch tensors
    X = tensor(df_features, dtype=torch.float32)
    y = tensor(df_target, dtype=torch.long)

    # Split the data
    split_index = int(len(X) * split_ratio)
    X_train, X_val = X[:split_index], X[split_index:]
    y_train, y_val = y[:split_index], y[split_index:]

    return (X_train, y_train), (X_val, y_val)

In [9]:

(X_train, y_train), (X_val, y_val) = prepare_data_and_split(df_train_X_scaled, df_train_y)

# Checking the shapes
print("Train shapes (X, y):", X_train.shape, y_train.shape)
print("Validation shapes (X, y):", X_val.shape, y_val.shape)

Train shapes (X, y): torch.Size([2275920, 18]) torch.Size([2275920])
Validation shapes (X, y): torch.Size([568980, 18]) torch.Size([568980])


## Data Loader

In [10]:
class TimeSeriesDataset(Dataset):
    """ Custom Dataset for handling time series data. """
    def __init__(self, X, y):
        self.X, self.y = X, y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

def create_dataloaders(train_dataset, val_dataset, batch_size, shuffle=False):
    """
    Creates DataLoader objects for training and validation datasets.

    :param train_dataset: Training dataset of type TimeSeriesDataset.
    :param val_dataset: Validation dataset of type TimeSeriesDataset.
    :param batch_size: Batch size for the DataLoader.
    :param shuffle: Boolean indicating whether to shuffle the dataset.
    :return: Tuple of DataLoader objects for training and validation datasets.
    """
    train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
    val_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=shuffle)
    return train_dl, val_dl

# Example usage:
batch_size = 12*60  # 1 hour
train_ds = TimeSeriesDataset(X_train, y_train)
val_ds = TimeSeriesDataset(X_val, y_val)

train_dl, val_dl = create_dataloaders(train_ds, val_ds, batch_size=batch_size, shuffle=False)


## Model

In [11]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        """
        Initialize the LSTM model.

        :param input_size: Number of input features.
        :param hidden_size: Number of features in the hidden state of the LSTM.
        :param num_layers: Number of recurrent layers.
        :param output_size: Number of output features (size of output tensor).
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
        # Activation function
        self.relu = nn.ReLU()

        # Fully connected layer
        self.fc = nn.Linear(hidden_size * 2, output_size)  # Output size is doubled for bidirectional LSTM
    
    def forward(self, x):
        """
        Forward pass of the LSTM.

        :param x: Input tensor.
        :return: Output tensor.
        """
        batch_size = x.size(0)

        # Initialize hidden and cell states
        h0 = torch.zeros(self.num_layers * 2, batch_size, self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers * 2, batch_size, self.hidden_size).to(x.device)

        # Reshape input to 3D tensor for LSTM
        x = x[:, None, :]  

        # LSTM output
        out, _ = self.lstm(x, (h0, c0))

        # Passing the output through the fully connected layer
        out = self.fc(self.relu(out[:, -1, :])) 

        return out

## Training

In [12]:
def accuracy(outputs, labels):
    """
    Computes the accuracy of the model.

    :param outputs: Model predictions.
    :param labels: Ground truth labels.
    :return: Accuracy as a float.
    """
    predictions = outputs.argmax(dim=1)
    correct = (predictions == labels)
    return correct.float().mean()

def train(epochs, model, loss_func, optimizer, train_loader, valid_loader, device):
    """
    Trains and evaluates the model.

    :param epochs: Number of epochs to train.
    :param model: The neural network model.
    :param loss_func: Loss function.
    :param optimizer: Optimizer.
    :param train_loader: DataLoader for training data.
    :param valid_loader: DataLoader for validation data.
    :param device: Device to run the model on.
    :return: Tuple of final loss and accuracy.
    """
    for epoch in range(epochs):
        # Training phase
        model.train()
        for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/ Training'):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_func(outputs, labels)
            loss.backward()
            optimizer.step()

        # Evaluation phase
        model.eval()
        total_loss, total_acc, count = 0., 0., 0
        with torch.no_grad():
            for inputs, labels in tqdm(valid_loader, desc=f'Epoch {epoch+1}/ Evaluation'):
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                count += len(inputs)
                total_loss += loss_func(outputs, labels).item() * len(inputs)
                total_acc += accuracy(outputs, labels).item() * len(inputs)

        print(f'Epoch: {epoch+1}, Loss: {total_loss/count:.2f}, Accuracy: {total_acc/count:.2f}')

    return total_loss / count, total_acc / count

def predict(model, x, device):
    """
    Makes predictions with the model.

    :param model: The neural network model.
    :param x: Input tensor.
    :param device: Device to run the model on.
    :return: Tuple of softmax scores and predicted values.
    """
    model.eval()
    with torch.no_grad():
        x = x.to(device)
        outputs = model(x)
        scores = F.log_softmax(outputs, -1).exp()
        predictions = scores.argmax(dim=1)

    return scores.cpu(), predictions.cpu()

In [13]:
# Architecture
input_size = len(features)
hidden_size = 32 # like 1 2 4 32 64
num_layers = 2
output_size = 2

model = LSTM(input_size, hidden_size, num_layers, output_size)
opt = torch.optim.Adam(model.parameters(), lr=0.001)
loss_func = F.cross_entropy

model.to(def_device)
model

LSTM(
  (lstm): LSTM(18, 32, num_layers=2, batch_first=True, bidirectional=True)
  (relu): ReLU()
  (fc): Linear(in_features=64, out_features=2, bias=True)
)

In [14]:
loss,acc = train(10, model, loss_func, opt, train_dl, val_dl, def_device)

Epoch 1/ Training: 100%|██████████| 3161/3161 [00:29<00:00, 106.85it/s]
Epoch 1/ Evaluation: 100%|██████████| 791/791 [00:07<00:00, 102.40it/s]


Epoch: 1, Loss: 0.18, Accuracy: 0.93


Epoch 2/ Training: 100%|██████████| 3161/3161 [00:29<00:00, 106.40it/s]
Epoch 2/ Evaluation: 100%|██████████| 791/791 [00:07<00:00, 102.60it/s]


Epoch: 2, Loss: 0.15, Accuracy: 0.94


Epoch 3/ Training: 100%|██████████| 3161/3161 [00:29<00:00, 106.23it/s]
Epoch 3/ Evaluation: 100%|██████████| 791/791 [00:07<00:00, 105.44it/s]


Epoch: 3, Loss: 0.14, Accuracy: 0.95


Epoch 4/ Training: 100%|██████████| 3161/3161 [00:29<00:00, 106.29it/s]
Epoch 4/ Evaluation: 100%|██████████| 791/791 [00:07<00:00, 104.92it/s]


Epoch: 4, Loss: 0.13, Accuracy: 0.95


Epoch 5/ Training: 100%|██████████| 3161/3161 [00:29<00:00, 106.64it/s]
Epoch 5/ Evaluation: 100%|██████████| 791/791 [00:07<00:00, 102.85it/s]


Epoch: 5, Loss: 0.13, Accuracy: 0.95


Epoch 6/ Training: 100%|██████████| 3161/3161 [00:29<00:00, 106.77it/s]
Epoch 6/ Evaluation: 100%|██████████| 791/791 [00:07<00:00, 103.52it/s]


Epoch: 6, Loss: 0.13, Accuracy: 0.95


Epoch 7/ Training: 100%|██████████| 3161/3161 [00:29<00:00, 106.85it/s]
Epoch 7/ Evaluation: 100%|██████████| 791/791 [00:07<00:00, 99.16it/s] 


Epoch: 7, Loss: 0.12, Accuracy: 0.96


Epoch 8/ Training: 100%|██████████| 3161/3161 [00:29<00:00, 107.42it/s]
Epoch 8/ Evaluation: 100%|██████████| 791/791 [00:08<00:00, 92.96it/s] 


Epoch: 8, Loss: 0.12, Accuracy: 0.96


Epoch 9/ Training: 100%|██████████| 3161/3161 [00:29<00:00, 108.22it/s]
Epoch 9/ Evaluation: 100%|██████████| 791/791 [00:08<00:00, 95.00it/s] 


Epoch: 9, Loss: 0.12, Accuracy: 0.96


Epoch 10/ Training: 100%|██████████| 3161/3161 [00:29<00:00, 108.51it/s]
Epoch 10/ Evaluation: 100%|██████████| 791/791 [00:08<00:00, 95.43it/s] 

Epoch: 10, Loss: 0.12, Accuracy: 0.96



