In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

# change current working directory to the root of the project
import os
os.chdir(os.path.dirname(os.getcwd()))

# Purpose
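- The purpose of this notebook is to train an LSTM model that classifies the activity label from windows of x/y/z sensor readings, with hyper-parameters tuned using Optuna.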

In [2]:
import warnings
import gc
from IPython.display import display
import joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import KFold
import optuna

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Flatten, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.backend import clear_session

from src.pipeline_transformers import ColumnDropperTransformer

# Load Data

In [3]:
# nrows=None reads every row; the same value is passed to all six reads
nrows = None

# feature tables for the train / validation / test splits
X_train, X_val, X_test = (
    pd.read_csv(csv_path, nrows=nrows)
    for csv_path in (
        "data/transformed/X_train.csv",
        "data/transformed/X_val.csv",
        "data/transformed/X_test.csv",
    )
)

# matching label tables
y_train, y_val, y_test = (
    pd.read_csv(csv_path, nrows=nrows)
    for csv_path in (
        "data/transformed/y_train.csv",
        "data/transformed/y_val.csv",
        "data/transformed/y_test.csv",
    )
)

# quick look at the last rows of the validation split
display(X_val.tail())
display(y_val.tail())

Unnamed: 0,user_id,timestamp,x,y,z
199293,25,8688572339000,10.53,14.9,14.9
199294,25,8688622327000,3.17,15.7,15.7
199295,25,8688672346000,-1.04,15.79,15.79
199296,25,8688722333000,6.13,10.23,10.23
199297,25,8688772321000,0.08,9.23,9.23


Unnamed: 0,activity
199293,Jogging
199294,Jogging
199295,Jogging
199296,Jogging
199297,Jogging


# Define Parameters

In [4]:
# number of consecutive readings that make up one sequence
timesteps = 240
# offset between the starts of two consecutive sequences
steps = 40
# feature channels per reading
n_features = 3
# number of distinct activity labels present in the training split
n_classes = len(y_train["activity"].unique())

# Preprocess

In [5]:
# normalize the data and convert back to dataframe
# NOTE: every column of X is passed through the scaler, so the saved scaler
# expects the same column layout at inference time.
# NOTE(review): this cell overwrites its own inputs; re-running it without
# reloading the data re-fits the scaler/encoder on already transformed values.
columns = X_train.columns

scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=columns)
X_val = pd.DataFrame(scaler.transform(X_val), columns=columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=columns)

# label encode the target variable
# LabelEncoder works on 1-D input; passing the "activity" column instead of the
# one-column DataFrame avoids the column-vector conversion warnings.
encoder = LabelEncoder()
y_train = pd.DataFrame(encoder.fit_transform(y_train["activity"]), columns=["activity"])
y_val = pd.DataFrame(encoder.transform(y_val["activity"]), columns=["activity"])
y_test = pd.DataFrame(encoder.transform(y_test["activity"]), columns=["activity"])

# save the scaler and encoder using joblib
joblib.dump(scaler, "models/scaler.pkl")
joblib.dump(encoder, "models/encoder.pkl")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


['models/encoder.pkl']

In [6]:
# class names learned by the label encoder; the position in this array is the encoded integer label
encoder.classes_

array(['Downstairs', 'Jogging', 'Sitting', 'Standing', 'Upstairs',
       'Walking'], dtype=object)

In [7]:
def preprocess_data(X: pd.DataFrame, y: pd.DataFrame, n_timesteps: int = 80, steps: int = 40):
    """
    Cut the readings into fixed-length, possibly overlapping windows for the LSTM model.

    X and y are joined on their index, rows are grouped by
    ('user_id', 'activity'), and inside each group a window of
    ``n_timesteps`` rows is taken every ``steps`` rows. Windows that would
    run past the end of a group are dropped, so every window holds readings
    of a single user and a single activity.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain the columns 'user_id', 'x', 'y' and 'z'.
    y : pd.DataFrame
        Must contain the column 'activity' and share its index with ``X``.
    n_timesteps : int
        Number of rows in each window.
    steps : int
        Offset between the starts of consecutive windows.

    Returns
    -------
    tuple of np.ndarray
        Windows of shape (n_windows, n_timesteps, 3) and the matching
        activity labels of shape (n_windows,). When no group is long enough
        for a single window, empty arrays with those shapes are returned.

    Raises
    ------
    ValueError
        If ``n_timesteps`` or ``steps`` is smaller than 1.
    """
    if n_timesteps < 1 or steps < 1:
        raise ValueError("n_timesteps and steps must both be at least 1")

    feature_columns = ['x', 'y', 'z']
    n_features = len(feature_columns)

    # merge X and y on index
    data = X.merge(y, left_index=True, right_index=True)

    # select windows of n_timesteps rows with a stride of steps for each user and activity
    chunks = []
    activity = []
    for _, group in data.groupby(['user_id', 'activity']):
        readings = group[feature_columns].values
        label = group['activity'].values[0]
        # only visit start offsets that still leave room for a full window
        for start in range(0, len(group) - n_timesteps + 1, steps):
            window = readings[start:start + n_timesteps]
            chunks.append(window.reshape(1, n_timesteps, n_features))
            activity.append(label)

    # np.concatenate rejects an empty list, so handle "no full window" explicitly
    if not chunks:
        return np.empty((0, n_timesteps, n_features)), np.array(activity)

    X = np.concatenate(chunks, axis=0)
    y = np.array(activity)

    return X, y

In [8]:
# Turn each split into (n_windows, timesteps, 3) feature arrays and 1-D label arrays.
# NOTE(review): the DataFrames are overwritten by NumPy arrays here, so this cell
# cannot be re-run without first re-running the loading and scaling cells.
X_train, y_train = preprocess_data(X_train, y_train, timesteps, steps)
X_val, y_val = preprocess_data(X_val, y_val, timesteps, steps)
X_test, y_test = preprocess_data(X_test, y_test, timesteps, steps)

In [9]:
# report the array shapes of every split, one per line
print(
    f'{X_train.shape=}',
    f'{y_train.shape=}',
    f'{X_val.shape=}',
    f'{y_val.shape=}',
    f'{X_test.shape=}',
    f'{y_test.shape=}',
    sep='\n',
)

X_train.shape=(15203, 240, 3)
y_train.shape=(15203,)
X_val.shape=(4834, 240, 3)
y_val.shape=(4834,)
X_test.shape=(6433, 240, 3)
y_test.shape=(6433,)


# Hyper-parameter Tuning

In [12]:
def objective(trial: optuna.trial.Trial) -> float:
    '''Train a bidirectional LSTM with trial-suggested hyperparameters and return its validation accuracy.

    The hyperparameters "units", "dropout" and "batch_size" are sampled from
    the trial. "units" is used for both the LSTM layer and the hidden Dense
    layer. The model is trained on the notebook-level X_train / y_train and
    validated on X_val / y_val; timesteps, n_features and n_classes are also
    read from the notebook namespace.

    Returns the sparse categorical accuracy on the validation set, measured
    with the weights the model has when training ends.
    '''
    clear_session()

    # sample every hyperparameter exactly once, up front
    units = trial.suggest_int("units", 128, 512, log=True)
    # suggest_float replaces the deprecated suggest_uniform
    dropout = trial.suggest_float("dropout", 0.0, 0.5)
    batch_size = trial.suggest_int("batch_size", 32, 128, log=True)

    # define early stopping
    early_stop = EarlyStopping(monitor='val_loss', patience=5)

    # define model architecture
    model = Sequential()

    model.add(
        Bidirectional(
            LSTM(
                units=units,
                input_shape=(timesteps, n_features))))

    model.add(Dropout(rate=dropout))

    model.add(Dense(units=units, activation='relu'))

    model.add(Dense(n_classes, activation='softmax'))

    # compile model
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer='adam',
        metrics=["sparse_categorical_accuracy"])

    # fit model
    model.fit(
        X_train,
        y_train,
        epochs=20,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        shuffle=True,
        callbacks=[early_stop],
        verbose=False,
    )

    # evaluate returns [loss, sparse_categorical_accuracy]; the study maximizes the accuracy
    score = model.evaluate(X_val, y_val, verbose=0)

    return score[1]

In [13]:
# silence all Python warnings from here on so the trial log stays readable
warnings.filterwarnings('ignore')

# optuna study
# direction is "maximize" because objective() returns the validation accuracy
study = optuna.create_study(direction="maximize", study_name="lstm")
# every trial trains a full model, so only a handful of trials are run
study.optimize(objective, n_trials=5)

[32m[I 2023-04-04 09:23:44,855][0m A new study created in memory with name: lstm[0m
[32m[I 2023-04-04 09:29:25,789][0m Trial 0 finished with value: 0.6026065349578857 and parameters: {'units': 229, 'dropout': 0.3616264072373436, 'batch_size': 110}. Best is trial 0 with value: 0.6026065349578857.[0m
[32m[I 2023-04-04 09:43:11,703][0m Trial 1 finished with value: 0.7966487407684326 and parameters: {'units': 361, 'dropout': 0.012458431667391856, 'batch_size': 100}. Best is trial 1 with value: 0.7966487407684326.[0m
[32m[I 2023-04-04 09:51:29,354][0m Trial 2 finished with value: 0.566818356513977 and parameters: {'units': 327, 'dropout': 0.32309595017588744, 'batch_size': 68}. Best is trial 1 with value: 0.7966487407684326.[0m
[32m[I 2023-04-04 10:00:01,836][0m Trial 3 finished with value: 0.4077368676662445 and parameters: {'units': 332, 'dropout': 0.22441525582058403, 'batch_size': 67}. Best is trial 1 with value: 0.7966487407684326.[0m
[32m[I 2023-04-04 10:11:09,542][0m

In [14]:
# print best parameters
# dict of the hyperparameters sampled in the best-scoring trial
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'units': 361, 'dropout': 0.012458431667391856, 'batch_size': 100}


# Fit model with best parameters

In [19]:
# pull the tuned hyperparameters out of the best trial's parameter dict
units, dropout, batch_size = (
    best_params[key] for key in ('units', 'dropout', 'batch_size')
)

In [20]:
# reset the Keras backend state before building the final model
clear_session()

# early stopping on the validation loss, with a patience of 10 epochs
early_stop = EarlyStopping(patience=10, monitor='val_loss')

# architecture: bidirectional LSTM -> dropout -> hidden dense -> softmax output
best_model = Sequential([
    Bidirectional(LSTM(units=units, input_shape=(timesteps, n_features))),
    Dropout(rate=dropout),
    Dense(units=units, activation='relu'),
    Dense(n_classes, activation='softmax'),
])

# integer labels, hence the sparse loss and metric
best_model.compile(
    optimizer='adam',
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

# train on the training windows and validate on the validation windows
fit_options = dict(
    epochs=20,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
    shuffle=True,
    callbacks=[early_stop],
    verbose=False,
)
best_model.fit(X_train, y_train, **fit_options)

In [None]:
# per-epoch training history of the final fit, one column per recorded quantity
losses = pd.DataFrame(best_model.history.history)
# training loss against validation loss
losses[['loss', 'val_loss']].plot()

In [None]:
# The model is compiled with metrics=["sparse_categorical_accuracy"], so the
# history is recorded under that name and its "val_" counterpart; the keys
# 'acc' / 'val_acc' do not exist and selecting them raises a KeyError.
accuracy = pd.DataFrame(best_model.history.history)
accuracy[['sparse_categorical_accuracy', 'val_sparse_categorical_accuracy']].plot()

In [None]:
# Evaluate the model accuracy on the test set.
# score is [loss, sparse_categorical_accuracy], matching the loss and metric given to compile()
score = best_model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

# Save Predictions & model

In [None]:
# save the model using joblib
# NOTE(review): joblib pickles the Keras model object; whether that round-trips
# depends on the installed TensorFlow/Keras version - confirm the file can be
# loaded again, or consider Keras' own model.save() format.
joblib.dump(best_model, "models/lstm_model.pkl")

In [None]:
# class probabilities for every test window, one column per class
class_probabilities = best_model.predict(X_test)

# keep the most probable class index per window and write it out as a single-column CSV
y_pred = pd.DataFrame(
    np.argmax(class_probabilities, axis=1),
    columns=["predicted_activity"],
)
y_pred.to_csv("predictions/predictions_lstm.csv", index=False)

In [None]:
# https://link.springer.com/article/10.1007/s11370-021-00358-7 (CNN + LSTM)
# https://ieeexplore.ieee.org/document/7881728 (CNN)
# https://medium.com/@tanmaychauhan111/human-activity-recognition-using-lstm-cnn-8ccb1a42cb81
# use McNemar's test to compare models