
# Task 2 - LSTM for time series classification

In [0]:
import numpy as np
np.random.seed(1234)  # fix the seed of NumPy's global random number generator

from tensorflow.random import set_seed
set_seed(4321)  # fix TensorFlow's global random seed

import pandas as pd
import re
import os
import pickle

import matplotlib.pyplot as plt
import seaborn as sns


In [0]:
# plotting theme used throughout the notebook: dark grid background, fonts scaled by 1.3
sns.set(style="darkgrid", font_scale=1.3)

## Data import & EDA

In [0]:
# mount Google Drive at /content/gdrive to read the files stored there (Google Colab only)
from google.colab import drive
drive.mount('/content/gdrive')

# OR upload the files manually instead
# files.upload()

# OR use local data -> edit DATA_DIR in the configuration cell

In [0]:
# setup paths to data (and outputs)
# NOTE: DATA_DIR is a relative path (resolved against the current working directory),
# whereas OUTPUT_DIR is an absolute path below /content
ROOT_DIR = 'gdrive/My Drive/Colab Notebooks/Colab data/DMV_1_2'
DATA_DIR = f'{ROOT_DIR}/input'
OUTPUT_DIR = f'/content/{ROOT_DIR}/output'

# specify file name pattern: "<prefix><numeric sequence id>.csv"
FILE_NAME_PREFIX = 'Ambient_Living_'
# the dot is escaped and the pattern is anchored at the end, so that only genuine
# "<prefix><digits>.csv" names match (an unescaped "." matches any character and
# names with trailing text, e.g. "<prefix>1.csv.bak", would be accepted as well)
data_file_pattern = re.escape(FILE_NAME_PREFIX) + r'(?P<seq_id>\d+)\.csv$'

# retrieve matching file names (re.match anchors the pattern at the start of the name)
data_files = [file_name for file_name in os.listdir(DATA_DIR)
              if re.match(data_file_pattern, file_name) is not None]
N_FILES = len(data_files)
N_FILES

In [0]:
# extract the numeric sequence id from every matching file name, in ascending order
seq_ids = [int(re.match(data_file_pattern, file_name).group('seq_id')) for file_name in data_files]
seq_ids.sort()
print(seq_ids)

In [0]:
def load_file(file_code, strip=False):
    """Read one CSV file of the data set into a DataFrame.

    The file is looked up at ``{DATA_DIR}/{FILE_NAME_PREFIX}{file_code}.csv``.

    Parameters
    ----------
    file_code : int or str
        The part of the file name between the common prefix and ``.csv``
        (a sequence id, or e.g. ``'target'``).
    strip : bool, default False
        If True, surrounding whitespace is removed from the column names.

    Returns
    -------
    pandas.DataFrame
        The content of the file.
    """
    csv_path = f'{DATA_DIR}/{FILE_NAME_PREFIX}{file_code}.csv'
    frame = pd.read_csv(csv_path)

    if not strip:
        return frame

    frame.columns = [name.strip() for name in frame.columns]
    return frame

### Data example

In [0]:
# use the sequence with the lowest id as an illustrative example
seq_id = seq_ids[0]
data_example = load_file(seq_id)
data_example.head()

In [0]:
# summary statistics of the example sequence
data_example.describe()

In [0]:
# line plot of the example sequence (one solid line per column)
fig, ax = plt.subplots(figsize=(12, 4))
sns.lineplot(data=data_example, dashes=False, ax=ax)
_ = ax.set_title(f"Sequence #{seq_id}")

### Targets - class labels

In [0]:
# load targets (class labels) and keep only the rows of the available (uploaded) sequences;
# sequence id k is taken from row position k-1 of the target file
# (assumes the target file is ordered by sequence id starting at 1 -- TODO confirm)
target_rows = np.array(seq_ids) - 1
targets = load_file('target', strip=True).iloc[target_rows]
targets.head()

In [0]:
# class balance: which class labels occur and how many sequences each of them has
targets['class_label'].value_counts()

In [0]:
# rescale the labels with (label + 1) / 2 and cast to integers, e.g. -1 -> 0 and +1 -> 1
# (assumes the raw labels are -1/+1 -- confirm with the value counts above)
raw_labels = targets['class_label'].to_numpy()
all_labels = ((raw_labels + 1) / 2).astype("int32")

### All data files

In [0]:
# read every available sequence into its own DataFrame (same order as seq_ids)
all_data = list(map(load_file, seq_ids))


In [0]:
# per-sequence overview indexed by sequence id: class label, length (rows) and feature count (columns)
descr = (
    targets
    .set_index('#sequence_ID')[['class_label']]
    .assign(length=[d.shape[0] for d in all_data],
            n_features=[d.shape[1] for d in all_data])
)
descr.head()

In [0]:
# summary of all columns but the first one (i.e. without the class label),
# reported at the 20/40/60/80th percentiles
descr[descr.columns[1:]].describe(percentiles=np.arange(0.2, 1, 0.2))

## Pre-processing
### Adjust sequence lengths to the global 80th quantile

In [0]:
# common sequence length = 80th quantile of the observed lengths, truncated to an integer
q_cut = 0.8
length_at_q_cut = descr['length'].quantile(q_cut)
q_cut_value = int(length_at_q_cut)
q_cut_value

In [0]:
# stacked histogram of the sequence lengths per class, with the cut-off length marked
fig, ax = plt.subplots(figsize=(12, 4))

w = 5  # bin width [samples]
ln = descr['length']
labels_set = descr['class_label'].unique()
lengths_classes = [ln[descr['class_label'] == label] for label in labels_set]

# bin edges start at a multiple of w and cover the whole range of lengths
first_edge = w * (min(ln) // w)
bin_edges = range(first_edge, max(ln) + w, w)
legend_entries = [f'class {label} (total: {len(lengths)})'
                  for label, lengths in zip(labels_set, lengths_classes)]

hist = ax.hist(lengths_classes, bins=bin_edges, label=legend_entries, stacked=True, rwidth=0.9)

ax.axvline(q_cut_value, color='crimson', linestyle='-.', lw=2,
           label=f'{100*q_cut:.0f}th quantile ({q_cut_value})')

ax.legend(fancybox=True, framealpha=0.5)
ax.set_xlabel("Length [samples]")
ax.set_ylabel("Count")
plt.show()

In [0]:
# zero-filled array for the pre-processed data: (number of sequences, q_cut_value, number of features)
n_sequences = len(all_data)
n_features = all_data[0].shape[1]
all_data_prep = np.zeros((n_sequences, q_cut_value, n_features))

# copy at most q_cut_value leading rows of every sequence; shorter sequences leave
# the remaining rows untouched, i.e. they end up zero-padded
for i, sequence in enumerate(all_data):
    cropped = sequence.iloc[:q_cut_value, :].to_numpy()
    all_data_prep[i, :cropped.shape[0], :] = cropped

In [0]:
# the first sequence again, now cropped / zero-padded to the common length
fig, ax = plt.subplots(figsize=(12, 4))
sns.lineplot(data=all_data_prep[0], dashes=False, ax=ax)
_ = ax.set_title(f"Sequence #{seq_id} (adjusted length)")
_ = ax.legend(data_example.columns)  # the array carries no column names -> reuse those of the example

## Train-test split
Split the data into training and test sets (in an 80/20 proportion). Use a stratified split to preserve the proportion of class labels and fix the random state for reproducibility.

The (global) training set will be further split 10 times into (particular) training and validation sets during the 10-fold cross-validated grid search.

The test set (holdout set) will be used to evaluate some of the cross-validated models.

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# stratified, shuffled 80/20 hold-out split with a fixed random state
data_train, data_test, labels_train, labels_test = train_test_split(
    all_data_prep, all_labels,
    test_size=0.2, random_state=0, shuffle=True, stratify=all_labels,
)


In [0]:
# sanity check: shapes of the training / test data and of the matching label arrays
print(data_train.shape, data_test.shape, labels_train.shape, labels_test.shape)

## LSTM

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import accuracy_score


In [0]:
# a function defining the model to be evaluated (after being wrapped by KerasClassifier)
def make_lstm(my_lstm_units, my_lr, **kwargs):
    """Build and compile a single-layer LSTM binary classifier.

    The network is one LSTM layer followed by a single sigmoid output unit, compiled
    with binary cross-entropy loss, the Adam optimizer and the accuracy metric.
    The input shape is taken from the global ``all_data_prep`` (all dimensions but the first).

    Parameters
    ----------
    my_lstm_units : int
        Number of units of the LSTM layer.
    my_lr : float
        Learning rate of the Adam optimizer.
    **kwargs
        Ignored; lets callers pass a full parameter dictionary (e.g. including
        ``batch_size``) without filtering it first.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The compiled (untrained) model.
    """
    lstm_model = Sequential()
    lstm_model.add(layers.LSTM(my_lstm_units, input_shape=all_data_prep.shape[1:]))
    lstm_model.add(layers.Dense(1, activation='sigmoid'))

    # `lr` is only a deprecated alias (removed in later Keras versions) -> use `learning_rate`
    adam = Adam(learning_rate=my_lr)
    lstm_model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    return lstm_model

# keyword arguments for training, the same for all models
fit_params = dict(epochs=100, verbose=0)


In [0]:
# define a scoring function for the cross validation - class prediction accuracy
def scorer(estimator, X, y):
    """Return the classification accuracy of ``estimator`` on ``(X, y)``.

    The output of ``estimator.predict(X)`` is thresholded at 0.5, so the function
    accepts estimators predicting probabilities as well as 0/1 class labels.

    Parameters
    ----------
    estimator : object with a ``predict`` method
    X : array-like
        Input data passed to ``estimator.predict``.
    y : array-like
        True 0/1 class labels.

    Returns
    -------
    float
        Fraction of correctly classified samples.
    """
    # flatten a possible (n, 1) column of outputs so that it matches the 1-D labels
    predicted = (np.asarray(estimator.predict(X)) > 0.5).astype("int32").ravel()
    # accuracy_score expects (y_true, y_pred), in this order
    return accuracy_score(y, predicted)


## Grid search

In [0]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV, KFold

In [5]:
def batch(k, min_batch=16):
    """Return a batch size that is a multiple of ``min_batch``.

    The result is the largest multiple of ``min_batch`` for which ``k`` batches
    still fit into ``N_FILES`` samples; if not even one such multiple fits,
    ``min_batch`` itself is returned.
    """
    n_multiples = N_FILES // (k * min_batch)
    if not n_multiples:
        n_multiples = 1
    return min_batch * n_multiples

# candidate batch sizes: the number of files, and the values of batch() for k=3 and k=10
batch_sizes = [N_FILES, batch(3), batch(10)]
batch_sizes

[216, 64, 16]

In [0]:
# hyper-parameter grid for the search: learning rate, batch size and number of LSTM units
param_grid = dict(
    my_lr=[0.0001, 0.001, 0.01],
    batch_size=batch_sizes,
    my_lstm_units=[64, 128, 256],
)

In [0]:
# grid search with 10-fold cross-validation: every candidate is a model built by make_lstm,
# trained with the shared fit_params and scored by the accuracy scorer
lstm_classifier = KerasClassifier(make_lstm, **fit_params)
cv_folds = KFold(n_splits=10, shuffle=True, random_state=1)
gcv = GridSearchCV(lstm_classifier, param_grid=param_grid, cv=cv_folds,
                   scoring=scorer, n_jobs=-1)
gcv

In [0]:
# run the grid search on the training set only (the test set is not passed in here)
gcv.fit(data_train, labels_train)

In [0]:
# one row per parameter combination; `codes` keeps the names of the parameter columns only
param_table = pd.DataFrame(gcv.cv_results_['params'])
codes = param_table.columns

# add the mean fit time and the mean CV score, then rank the combinations by score
# (reset_index keeps the original row number in an extra 'index' column)
gcv_results = (
    param_table
    .assign(mean_fit_time=gcv.cv_results_['mean_fit_time'],
            mean_test_score=gcv.cv_results_['mean_test_score'])
    .sort_values('mean_test_score', ascending=False)
    .reset_index()
)

gcv_results


In [0]:
# store the ranked results; the output directory is created first, because
# to_pickle does not create missing directories
os.makedirs(OUTPUT_DIR, exist_ok=True)
gcv_results.to_pickle(f'{OUTPUT_DIR}/gcv_results.pkl')

In [0]:
# distribution of the mean CV score, one panel per hyper-parameter, stacked by parameter value
fig, axes = plt.subplots(3, 1, figsize=(10,8), sharex='all', sharey='all')
for ax, col in zip(axes, codes):
    vals = gcv_results[col]
    # an ordered list (not a set) keeps the data groups and their legend labels aligned
    levels = sorted(vals.unique())
    scores_per_level = [gcv_results.loc[vals == level, 'mean_test_score'] for level in levels]
    hist = ax.hist(scores_per_level, bins=10, label=[str(level) for level in levels],
                   stacked=True, rwidth=0.9)
    ax.legend(title=col, loc=2)
    ax.set_ylabel('count')
_ = axes[-1].set_xlabel('mean_test_score')

In [0]:
# mean CV score against each hyper-parameter (one panel per parameter)
fig, axes = plt.subplots(3, 1, sharex='all', figsize=(12, 10))
for ax, code in zip(axes, codes):
    sns.scatterplot(data=gcv_results, x='mean_test_score', y=code, ax=ax, hue='mean_test_score',
                    palette='hot', legend=False, edgecolor='k', s=100)
    # the learning rates span several orders of magnitude -> log axis; the panel is
    # selected by parameter name rather than by its position among the columns
    if code == 'my_lr':
        ax.set_yscale('log')

In [0]:
from mpl_toolkits.mplot3d import Axes3D


# 3D scatter of the parameter grid; colour and marker size encode the mean CV score.
# Both the plotting context and the 'whitegrid' style are applied through context managers,
# so the previous settings are restored on exit, even if the cell raises.
with sns.plotting_context('paper', font_scale=1.3), sns.axes_style('whitegrid'):
    fig = plt.figure(figsize=(12,6), facecolor=(1, 1, 1))

    ax = fig.add_subplot(111, projection='3d')

    mts = gcv_results['mean_test_score']

    # one axis per hyper-parameter, in the order of `codes`
    p = ax.scatter(*(gcv_results[code] for code in codes),
                   c=mts, marker='o', cmap='hot', edgecolor='k',
                   s=300*(mts-0.95*mts.min()))

    cbar = fig.colorbar(p)
    cbar.set_label('mean test score', rotation=90)

    ax.set_xlabel(codes[0])
    ax.set_ylabel(codes[1])
    ax.set_zlabel(codes[2])


## Validation on the test set

In [0]:
def evaluate_model(model_id):
    """Re-train one grid-search candidate on the training set and check it on the test set.

    The parameters are read from row ``model_id`` of ``gcv_results`` (sorted by
    mean CV score, so 0 is the best-ranked combination). The model is trained on
    ``data_train`` with ``data_test`` as validation data, and the training history
    is plotted with the test accuracy in the title.

    Parameters
    ----------
    model_id : int
        Row label of the parameter combination in ``gcv_results``.

    Returns
    -------
    The trained Keras model.
    """
    model_params = dict(gcv_results.loc[model_id, codes])

    # the values read back from the row may be floats -> restore the integer parameters
    for p in ['my_lstm_units', 'batch_size']:
        model_params[p] = int(model_params[p])

    # per-call copy: the model's batch size must not leak into the shared fit_params
    model_fit_params = {**fit_params, 'batch_size': model_params['batch_size']}

    model = make_lstm(**model_params)
    model.fit(data_train, labels_train, validation_data=(data_test, labels_test), **model_fit_params)

    fig, ax = plt.subplots(figsize=(10,4))
    # draw on the axes created above explicitly instead of relying on the current axes
    sns.lineplot(data=pd.DataFrame(model.history.history), dashes=False, ax=ax)
    ax.set_title(f"Model {model_id}: {model_params} -> test acc.: {scorer(model, data_test, labels_test):.2f}")
    ax.set_xlabel("Epoch")
    return model


In [0]:
# re-train and evaluate the 5 best-ranked parameter combinations of the grid search
for model_rank in range(5):
    evaluate_model(model_rank)