# ECE 542 competition

Follow this link for more details: https://research.ece.ncsu.edu/aros/paper-tase2020-lowerlimb/

## I. Data Processing

In [None]:
""" **Description**
        ECE 542 Competition is the competition project for the NCSU ECE 542 Neural
        Networks course in the 2021 Fall semester.

    **License**
        © 2021 - 2021 Khoa Do, Duy Nguyen, Larry Turner. All rights reserved.

    **Author**
        Khoa Do.
"""

import torch
import numpy as np

# Ask PyTorch once whether a CUDA device can be used and keep the answer in a
# module-level flag.
flag_cuda = torch.cuda.is_available()

# Report which compute backend was selected.
print('Using GPU' if flag_cuda else 'Using CPU')

### I.2. Data Loader

In [None]:
import re
import pandas
import glob
import numpy

In [None]:
class Data():

    """ Data.  Load and aggregate data into a collection of data frames.

    A recording is a set of CSV files sharing one path prefix:
    ``<prefix>__x.csv`` (six signal columns) and ``<prefix>__x_time.csv``
    (one time stamp per sample); training recordings additionally have
    ``<prefix>__y.csv`` (labels) and ``<prefix>__y_time.csv`` (label times).

    Every loaded frame has the columns ``file_id, series_id, subject_id,
    time, ax, ay, az, gx, gy, gz, label``.

    Attributes
        TEST : directory searched for test recordings.
        TRAINING : directory searched for training recordings.
        frequency : rounded reciprocal of the mean time step of the first
            training recording; 0 until ``run`` has loaded one.
        test : one data frame per test recording, ``label`` set to -1.
        training : one data frame per training recording.
    """

    # Column names given to the six values stored in each ``__x.csv`` row.
    _SIGNALS = ('ax', 'ay', 'az', 'gx', 'gy', 'gz')

    def __init__(self):

        self.TEST = '../data/test'
        self.TRAINING = '../data/training'

        self.frequency = 0
        self.test = []
        self.training = []

    def _recordings(self, folder):
        """ Return the sorted, de-duplicated recording prefixes found in folder.
        """

        # Split on the LAST '__' so a '__' inside a directory name cannot
        # truncate the prefix.
        return sorted(set(name.rsplit('__', 1)[0] for name in glob.glob(f'{folder}/*__*.csv')))

    def _load_samples(self, path, index):
        """ Read the samples of one recording and prepend time and id columns.

        The subject and series identifiers are the two numbers found in the
        file name part of ``path``; ``index`` is stored as ``file_id``.
        """

        import os

        x = pandas.read_csv(f'{path}__x.csv', names=self._SIGNALS)
        t = pandas.read_csv(f'{path}__x_time.csv', names=('time', ))
        x.insert(0, 'time', t['time'])

        # Parse only the base name: digits in parent directories would
        # otherwise yield extra matches and break the two-value unpacking.
        subject_id__, series_id__ = [
            int(s) for s in re.findall(r'-?\d+\.?\d*', os.path.basename(path))]
        x.insert(0, 'subject_id', subject_id__)
        x.insert(0, 'series_id', series_id__)
        x.insert(0, 'file_id', index)
        return x

    @staticmethod
    def _align_labels(x_time, y_time, y_label):
        """ Return one label per sample time by walking both time axes once.

        The label is taken BEFORE the label cursor advances, so the sample
        that first reaches the next label time still receives the previous
        label (this reproduces the original alignment exactly).
        """

        labels = numpy.empty(len(x_time), dtype=y_label.dtype)
        last = len(y_time) - 1
        ii = 0
        for jj in range(len(x_time)):
            labels[jj] = y_label[ii]
            while (ii < last) and (x_time[jj] >= y_time[ii + 1]):
                ii += 1
        return labels

    def run(self):
        """ Load every training and test recording found on disk.

        Previously loaded frames are discarded first, so calling ``run``
        again reloads instead of appending duplicates.
        """

        self.training = []
        self.test = []

        # Training.
        for index__, path in enumerate(self._recordings(self.TRAINING)):
            print(f"Processs file: {path}")
            x = self._load_samples(path, index__)

            # The sampling rate is estimated once, from the first recording.
            if numpy.isclose(self.frequency, 0.0):
                self.frequency = numpy.round(
                    1.0 / numpy.mean(numpy.diff(x.time.values)))

            y = pandas.read_csv(f'{path}__y.csv', names=('label', ))
            t = pandas.read_csv(f'{path}__y_time.csv', names=('time', ))
            # Labels are computed in a plain array and attached in one step,
            # which avoids chained assignment on the frame.
            labels = self._align_labels(
                x['time'].to_numpy(), t['time'].to_numpy(), y['label'].to_numpy())
            x.insert(x.shape[1], 'label', labels)
            self.training.append(x)

        # Test.
        for index__, path in enumerate(self._recordings(self.TEST)):
            print(f"Processs file: {path}")
            x = self._load_samples(path, index__)
            # Test recordings have no ground truth; -1 marks "unknown".
            x.insert(x.shape[1], 'label', -1)
            self.test.append(x)

In [None]:
# Create the loader and read every training/test recording from disk.
dataObj = Data()
dataObj.run()

In [None]:
# Preview the first rows of the first training recording.
dataObj.training[0].head()

In [None]:
# Preview the first rows of the first test recording.
dataObj.test[0].head()

### [*] Run the code below to save/load the data as CSV files

Because Google Colab sessions are unstable, it is safer to save the preprocessed data to files and load them again later for the following steps.

In [None]:
# Write all recordings of a split into ONE csv file: the first frame creates
# the file and emits the header row, every later frame is appended below it.
path_data_training = "../data/output/data-training.csv"
path_data_test = "../data/output/data-test.csv"

for frames, target in ((dataObj.training, path_data_training),
                       (dataObj.test, path_data_test)):
    for position, frame in enumerate(frames):
        first = position == 0
        frame.to_csv(target, mode='w' if first else 'a', header=first, index=False)

In [None]:
# Reload the saved recordings and split each table back into one frame per
# file_id, every frame ordered by time.
path_data_training = "../data/output/data-training.csv"
path_data_test = "../data/output/data-test.csv"

dataObjBackUp = Data()
data_training = pandas.read_csv(path_data_training)
# sorted(): set iteration order is unspecified, and later cells index the
# frames by list position, so the position must follow file_id.
for file_id in sorted(set(data_training["file_id"].values)):
    data_training_part = data_training[data_training.file_id == file_id].sort_values(by=['time'])
    dataObjBackUp.training.append(data_training_part)

data_test = pandas.read_csv(path_data_test)
for file_id in sorted(set(data_test["file_id"].values)):
    data_test_part = data_test[data_test.file_id == file_id].sort_values(by=['time'])
    dataObjBackUp.test.append(data_test_part)

In [None]:
# Preview the first reloaded training recording.
dataObjBackUp.training[0].head()

In [None]:
# Preview the first reloaded test recording.
dataObjBackUp.test[0].head()

### I.3. Apply Lowpass filter

In [None]:
from diamondback import IirFilter
from sklearn.ensemble import RandomForestClassifier
from typing import Dict, List, Tuple, Union
import glob
import numpy
import os
import pandas
import random

In [None]:
class Filter() :

    """ Filter.  Filter and transform data frames.

    Every signal column (all columns except file_id, subject_id, series_id,
    time and label) is mean-removed, scaled by a per-column gain and passed
    through one shared IIR filter; the filtered signal is then shifted left
    by the filter delay and padded at the end with its last value.

    Attributes
        FREQUENCY : frequency argument handed to the IIR filter.
        frequency : not read by ``run``.
        test : filtered copies of the test frames.
        training : filtered copies of the training frames.
    """

    def __init__(self):
        self.FREQUENCY = 0.7
        self.frequency : float = 40
        self.test : List[pandas.DataFrame] = []
        self.training : List[pandas.DataFrame] = []

    @staticmethod
    def _filter_frames(iir, frames, title, gain, delay, tag):
        """ Return filtered copies of frames; the inputs are left untouched.

        The same ``iir`` instance is used for every column of every frame, in
        frame order and column order.
        NOTE(review): if the filter object keeps state between calls, the
        result depends on this order - confirm before reordering.
        """

        result = []
        for i, x in enumerate(frames):
            print(f"Process {tag} file_id {i}")
            y = x.copy( )
            for ii in title:
                v = iir.filter((y[ii] - numpy.mean(y[ii])) * gain[ii])
                # Drop the first `delay` samples and repeat the final value so
                # the column keeps its original length.
                y[ii] = numpy.concatenate((v[delay:], [v[-1]] * delay))
            result.append(y)
        return result

    def run(self, Data) -> None :

        """ Filter ``Data.training`` and ``Data.test`` into ``self.training`` / ``self.test``.

        Data : object exposing ``training`` and ``test`` lists of data frames
            that share the column layout of ``Data.training[0]``.
        """
        # run in the google colab
        iir = IirFilter(style = 'Butterworth', frequency = self.FREQUENCY, order = 2)

        # run in local
        # iir = IirFilter.Factory.instance(IirFilter, 'Butterworth', frequency = self.FREQUENCY, order = 2)

        # presumably iir.delay(16)[0] is the filter's delay profile; its
        # rounded mean is used as a whole-sample shift - TODO confirm.
        delay = int(numpy.round(numpy.mean(iir.delay(16)[0])))
        title = [u for u in Data.training[0].columns if (u not in ('file_id', 'subject_id', 'series_id', 'time', 'label'))]
        # The first three signal columns are scaled by 0.2, the next three by 1.0.
        gain = dict(zip(title, [0.2] * 3 + [1.0] * 3))

        # Training first, then test: the call order of the original loops is kept.
        self.training = self._filter_frames(iir, Data.training, title, gain, delay, 'Training')
        self.test = self._filter_frames(iir, Data.test, title, gain, delay, 'Testing')

In [None]:
# Filter the freshly loaded recordings; pass dataObjBackUp instead to filter
# the recordings reloaded from CSV.
filterObj = Filter()
filterObj.run(dataObj)
#filterObj.run(dataObjBackUp)

In [None]:
# Preview the first filtered training recording.
filterObj.training[0].head()

In [None]:
# Preview the first filtered test recording.
filterObj.test[0].head()

### [*] Run the code below to save/load the filtered data to/from CSV files

In [None]:
# Write all filtered recordings of a split into ONE csv file: the first frame
# creates the file with the header row, later frames are appended below it.
path_filter_training = "../data/output/filter-training.csv"
path_filter_test = "../data/output/filter-test.csv"

for frames, target in ((filterObj.training, path_filter_training),
                       (filterObj.test, path_filter_test)):
    for position, frame in enumerate(frames):
        first = position == 0
        frame.to_csv(target, mode='w' if first else 'a', header=first, index=False)

In [None]:
# Reload the filtered recordings and split each table back into one frame per
# file_id, every frame ordered by time.
path_filter_training = "../data/output/filter-training.csv"
path_filter_test = "../data/output/filter-test.csv"

filterObjBackUp = Filter()
filter_training = pandas.read_csv(path_filter_training)
# sorted(): set iteration order is unspecified; list position must follow file_id.
for file_id in sorted(set(filter_training["file_id"].values)):
    filter_training_part = filter_training[filter_training.file_id == file_id].sort_values(by=['time'])
    filterObjBackUp.training.append(filter_training_part)

filter_test = pandas.read_csv(path_filter_test)
# Iterate the ids of the TEST table: looping over the training ids appended an
# empty frame for every file_id that only exists in the training set.
for file_id in sorted(set(filter_test["file_id"].values)):
    filter_test_part = filter_test[filter_test.file_id == file_id].sort_values(by=['time'])
    filterObjBackUp.test.append(filter_test_part)

## II. Extract features from data

In [None]:
class Features() :

    """ Features.  Extract features as windowed properties, moments, and norms, into a data frame.

    Each recording is cut into consecutive windows of ``WINDOW`` rows.  One
    output row is produced per window: the identifiers, the time and label of
    the window's first row, and the mean of every signal column.

    Attributes
        FEATURES : number of feature columns generated per signal column
            (column names are ``<signal><k>``); only the mean is computed.
        WINDOW : number of input rows summarised by one output row.
        test : one feature frame per test recording.
        training : one feature frame per training recording.
    """

    def __init__(self):

        self.FEATURES = 1
        self.WINDOW = 4

        self.test : List[pandas.DataFrame] = []
        self.training : List[pandas.DataFrame] = []

    def _extract(self, x, title):
        """ Return the windowed feature frame of one recording.

        x : data frame of a single recording (exactly one distinct file_id).
        title : names of the signal columns to average.
        """

        file_id_list = set(x["file_id"].values[:])
        assert len(file_id_list) == 1, "[Error] The File ID is not unique"

        file_id__ = x["file_id"].values[0]
        subject_id__ = x["subject_id"].values[0]
        series_id__ = x["series_id"].values[0]
        print("------------")
        print(f"file_id: {file_id__}, subject_id: {subject_id__}, series_id: {series_id__}")

        columns = [ 'file_id', 'subject_id', 'series_id', 'time' ] + [ f'{u}{ii}' for u in title for ii in range( 0, self.FEATURES ) ] + [ 'label' ]

        # round() may round UP, in which case the final window is shorter than
        # WINDOW; row slicing simply truncates it, so no exception handling is
        # required.
        actual_len = round(len(x) / self.WINDOW) * self.WINDOW

        # Rows are collected in a list and turned into a frame once; growing a
        # frame row by row re-allocates it on every append.
        rows = []
        for ii in range(0, actual_len, self.WINDOW):
            z = x[ ii : ii + self.WINDOW ]
            # NOTE(review): this only checks that the window is non-empty; it
            # does not require a single distinct label inside the window.
            assert len(set(z["label"].values[:])), "[Error] Re-sampling is not OK"

            feature = [file_id__, subject_id__, series_id__, z["time"].values[0]]
            feature += [np.mean(np.array(z[u])) for u in title]
            feature += [z["label"].values[0]]
            rows.append(feature)

        y = pandas.DataFrame(rows, columns = columns)
        return y.astype({'file_id': 'int64',
                         'subject_id': 'int64',
                         'series_id': 'int64',
                         'label' : 'int64'})

    def run(self, Filter):

        """ Build ``self.training`` and ``self.test`` from ``Filter.training`` / ``Filter.test``.

        Both result lists are rebuilt from scratch, so running twice does not
        duplicate recordings.
        """

        title = [ u for u in Filter.training[0].columns if (u not in ('file_id', 'subject_id', 'series_id', 'time', 'label'))]

        # Training.
        self.training = [self._extract(x, title) for x in Filter.training]

        # Test.
        self.test = [self._extract(x, title) for x in Filter.test]
        

In [None]:
# Down-sample the filtered recordings into windowed features.
featureObj = Features()
featureObj.run(filterObj)

In [None]:
# Compare the row counts of the recording at list position 8 before and after
# feature extraction.
print(f"[*] Number of sample original data: {len(filterObj.training[8])}")
print(f"[*] Number of sample down sampling data: {len(featureObj.training[8])}")

### [*] Run the code below to save/load the features (down-sampled data) to/from CSV files

In [None]:
# Write all feature frames of a split into ONE csv file: the first frame
# creates the file with the header row, later frames are appended below it.
path_features_training = "../data/output/features-training.csv"
path_features_test = "../data/output/features-test.csv"

for frames, target in ((featureObj.training, path_features_training),
                       (featureObj.test, path_features_test)):
    for position, frame in enumerate(frames):
        first = position == 0
        frame.to_csv(target, mode='w' if first else 'a', header=first, index=False)

In [None]:
# Reload the saved features and split each table back into one frame per
# file_id, every frame ordered by time.
path_features_training = "../data/output/features-training.csv"
path_features_test = "../data/output/features-test.csv"

featureObjBackUp = Features()
feature_training = pandas.read_csv(path_features_training)
# sorted(): set iteration order is unspecified; list position must follow file_id.
for file_id in sorted(set(feature_training["file_id"].values)):
    feature_training_part = feature_training[feature_training.file_id == file_id].sort_values(by=['time'])
    featureObjBackUp.training.append(feature_training_part)

feature_test = pandas.read_csv(path_features_test)
# Iterate the ids of the TEST table: looping over the training ids appended an
# empty frame for every file_id that only exists in the training set.
for file_id in sorted(set(feature_test["file_id"].values)):
    feature_test_part = feature_test[feature_test.file_id == file_id].sort_values(by=['time'])
    featureObjBackUp.test.append(feature_test_part)

# III. Model

In [None]:
%matplotlib inline

In [None]:
from multiprocessing import cpu_count
from pathlib import Path

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import _LRScheduler

In [None]:
# Seed both random number generators used in this section: NumPy's global
# generator and PyTorch's (weight initialisation, dropout, loader shuffling).
# Seeding only NumPy leaves the PyTorch side different on every run.
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)
#torch.cuda.set_device(0)  # if you have more than one CUDA device

### III.1. Data Preprocessing

#### Load data

In [None]:
# Load the saved features and split each table into one frame per file_id,
# every frame ordered by time.
path_features_training = "../data/output/features-training.csv"
path_features_test = "../data/output/features-test.csv"

feature_training = pandas.read_csv(path_features_training)
TRAIN = []
# sorted(): set iteration order is unspecified; list position must follow file_id.
for file_id in sorted(set(feature_training["file_id"].values)):
    feature_training_part = feature_training[feature_training.file_id == file_id].sort_values(by=['time'])
    TRAIN.append(feature_training_part)

feature_test = pandas.read_csv(path_features_test)
TEST = []
# Iterate the ids of the TEST table: looping over the training ids appended an
# empty frame for every file_id that only exists in the training set.
for file_id in sorted(set(feature_test["file_id"].values)):
    feature_test_part = feature_test[feature_test.file_id == file_id].sort_values(by=['time'])
    TEST.append(feature_test_part)

In [None]:
# Preview the first training feature frame.
TRAIN[0].head()

In [None]:
# Preview the first test feature frame.
TEST[0].head()

### Examine the class label imbalance

In [None]:
# Number of consecutive feature rows that form one model input sequence.
SIZE_WINDOW = 16

In [None]:
# Model inputs: every column that is neither bookkeeping nor the target.
non_feature_columns = ('index', 'file_id', 'subject_id', 'series_id', 'time', 'measurements', 'label')
x_titles = [column for column in TRAIN[0].keys() if column not in non_feature_columns]
x_train = [recording[x_titles] for recording in TRAIN]
x_train[0].head()

In [None]:
# Targets: the label column of each recording without its first SIZE_WINDOW
# rows, so that every remaining label has SIZE_WINDOW rows in front of it.
y_titles = [column for column in TRAIN[0].keys() if column == 'label']
y_train = [recording[y_titles].iloc[SIZE_WINDOW:] for recording in TRAIN]
y_train[0].head()

In [None]:
def create_dataset(x_train, y_train, valid_size = 0.2):
    """Build train/validation ``TensorDataset``s of sliding windows.

    For every recording, the sample at row ``k`` (``k >= SIZE_WINDOW``) is the
    window of rows ``[k - SIZE_WINDOW, k)`` paired with the label of row ``k``.

    Args:
        x_train: list of feature frames, one per recording.
        y_train: list of frames with a ``label`` column, one per recording;
            each must be ``SIZE_WINDOW`` rows shorter than its feature frame.
        valid_size: fraction of the samples held out for validation.

    Returns:
        ``(train_ds, valid_ds, enc)`` where ``enc`` is the fitted
        ``LabelEncoder`` that maps the original labels to class indices.

    Raises:
        AssertionError: if a feature frame and its label frame do not match
            in length.
    """
    windows = []
    labels = []
    for features, targets in zip(x_train, y_train):
        assert len(features) == len(targets) + SIZE_WINDOW, "The x_train and y_train is not match"

        # Slice the raw array instead of the frame. The window shape follows
        # SIZE_WINDOW and the actual number of feature columns rather than a
        # hard-coded (16, 6).
        values = features.to_numpy()
        windows.extend(values[stop - SIZE_WINDOW: stop] for stop in range(SIZE_WINDOW, len(values)))
        labels.extend(targets["label"].values)

    # A single ndarray converts to a tensor far faster than a list of arrays.
    x_dataset = np.stack(windows)

    enc = LabelEncoder()
    y_dataset = enc.fit_transform(labels)

    assert len(x_dataset) == len(y_dataset), "The x_dataset and y_dataset is not match"
    X_train, X_valid, y_train, y_valid = train_test_split(x_dataset, y_dataset, test_size=valid_size)
    X_train, X_valid = [torch.tensor(arr, dtype=torch.float32) for arr in (X_train, X_valid)]
    y_train, y_valid = [torch.tensor(arr, dtype=torch.long) for arr in (y_train, y_valid)]
    train_ds = TensorDataset(X_train, y_train)
    valid_ds = TensorDataset(X_valid, y_valid)
    return train_ds, valid_ds, enc

def create_loaders(train_ds, valid_ds, bs=512, jobs=0):
    """Return ``(train_dl, valid_dl)`` data loaders for the two datasets.

    Both loaders use batch size ``bs`` and ``jobs`` worker processes; only the
    training loader shuffles its samples.
    """
    shared = dict(batch_size=bs, num_workers=jobs)
    training_loader = DataLoader(train_ds, shuffle=True, **shared)
    validation_loader = DataLoader(valid_ds, shuffle=False, **shared)
    return training_loader, validation_loader


def accuracy(output, target):
    """Return the fraction of rows whose highest-scoring class equals target."""
    predicted = output.argmax(dim=1)
    hits = torch.eq(predicted, target)
    return hits.float().mean().item()

In [None]:
# Build the windowed train/validation datasets and keep the label encoder.
train_ds, valid_ds, enc = create_dataset(x_train, y_train)

In [None]:
# Inspect the first (window, label) pair of the training dataset.
train_ds[0]

In [None]:
# Wrap the datasets in data loaders, using as many worker processes as
# cpu_count() reports.
bs = 128
print(f'Creating data loaders with batch size: {bs}')
trn_dl, val_dl = create_loaders(train_ds, valid_ds, bs, jobs=cpu_count())

### III.2 LSTM

#### Cyclic Learning Rate

Recent papers by L. Smith show that cyclic learning rate schedulers have a very positive influence on a model's convergence speed. In the following cells, we implement a simple cosine scheduler for our model.

In [None]:
class CyclicLR(_LRScheduler):
    """Scheduler that delegates the learning rate to a user-supplied function.

    ``schedule(step, base_lr)`` is evaluated for every parameter group each
    time the scheduler steps and must return that group's learning rate.
    """

    def __init__(self, optimizer, schedule, last_epoch=-1):
        assert callable(schedule)
        # Must be stored before the base initialiser runs, because that
        # initialiser already asks for the first learning rates.
        self.schedule = schedule
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        step = self.last_epoch
        rates = []
        for base_rate in self.base_lrs:
            rates.append(self.schedule(step, base_rate))
        return rates

In [None]:
def cosine(t_max, eta_min=0):
    """Return a cosine schedule ``f(epoch, base_lr)`` that restarts every t_max steps.

    Within one cycle the value falls from ``base_lr`` towards ``eta_min``
    along half a cosine wave; at step ``t_max`` it jumps back to ``base_lr``.
    """

    def scheduler(epoch, base_lr):
        phase = np.pi * (epoch % t_max) / t_max
        span = base_lr - eta_min
        return eta_min + span * (1 + np.cos(phase)) / 2

    return scheduler

In [None]:
# Preview the schedule: four restart cycles of length n for a base rate of 1.
n = 100
sched = cosine(n)
lrs = [sched(t, 1) for t in range(n * 4)]
plt.plot(lrs)

#### The LSTM Model

Our classifier consists of several LSTM cells (hidden under the hood of `nn.LSTM`) and one `nn.Linear` layer. Note that we use `batch_first=True` so that the first dimension of our tensors is interpreted as the batch size and the next one as the time dimension.

In [None]:
## HOLD ON TO RUNNING THIS CELL

import torch
import numpy as np

# True when PyTorch reports a usable CUDA device.
flag_cuda = torch.cuda.is_available()

In [None]:
class LSTMClassifier(nn.Module):
    """Very simple implementation of LSTM-based time-series classifier.

    The input is a batch-first tensor ``(batch, time, input_size)``.  The LSTM
    output of the last time step goes through dropout, tanh and a linear
    layer, giving ``(batch, output_size)`` unnormalised class scores.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the LSTM hidden state.
        layers: number of stacked LSTM layers.
        output_size: number of classes.
    """

    def __init__(self, input_size, hidden_size, layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.layers = layers
        self.lstm = nn.LSTM(input_size, hidden_size, layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.15)
        self.actv = nn.Tanh()

    def forward(self, x):
        """Return the class scores for a batch of sequences."""
        h0, c0 = self.init_hidden(x)
        lstm_out, _ = self.lstm(x, (h0, c0))
        # Only the last time step is used for classification.
        return self.fc(self.actv(self.dropout(lstm_out[:, -1, :])))

    def init_hidden(self, x):
        """Return zero ``[h0, c0]`` states sized for the batch in ``x``.

        The states are created directly on the device of ``x``, so the model
        works on CPU and GPU without consulting a global CUDA flag.
        """
        shape = (self.layers, x.size(0), self.hidden_size)
        return [torch.zeros(shape, device=x.device, dtype=x.dtype) for _ in range(2)]

#### Training Loop

Finally, we are ready to bring everything together and train the model.

In [None]:
# Model and optimisation settings.
input_dim = 6
hidden_dim = 256
layer_dim = 3
output_dim = 4
seq_dim = 16

lr = 0.0001
n_epochs = 150
iterations_per_epoch = len(trn_dl)
best_acc = 0
patience, trials = 300, 0

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LSTMClassifier(input_dim, hidden_dim, layer_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=lr)
# The scheduler is stepped once per batch, so one cosine cycle spans two epochs.
sched = CyclicLR(opt, cosine(t_max=iterations_per_epoch * 2, eta_min=lr/100))

# torch.save does not create missing directories.
model_dir = Path('../data/output/model')
model_dir.mkdir(parents=True, exist_ok=True)

print('Start model training')


train_loss_list = []
valid_loss_list = []
for epoch in range(1, n_epochs + 1):

    train_loss = 0.0
    valid_loss = 0.0

    model.train()
    for x_batch, y_batch in trn_dl:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        out = model(x_batch)
        loss = criterion(out, y_batch)
        opt.zero_grad()
        loss.backward()
        opt.step()
        sched.step()

        train_loss += loss.item() * x_batch.size(0)
    # Report the mean per-sample loss: the training and validation sets differ
    # in size, so raw sums cannot be compared on one plot.
    train_loss /= len(trn_dl.dataset)
    train_loss_list.append(train_loss)

    model.eval()
    correct, total = 0, 0
    # No gradients are needed for validation.
    with torch.no_grad():
        for x_val, y_val in val_dl:
            x_val, y_val = x_val.to(device), y_val.to(device)
            out = model(x_val)
            preds = out.argmax(dim=1)
            total += y_val.size(0)
            correct += (preds == y_val).sum().item()

            loss = criterion(out, y_val)
            valid_loss += loss.item() * x_val.size(0)
    valid_loss /= len(val_dl.dataset)
    valid_loss_list.append(valid_loss)

    acc = correct / total

    # Printing training/validation statistics
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
      epoch, train_loss, valid_loss))

    if acc > best_acc:
        trials = 0
        best_acc = acc
        torch.save(model.state_dict(), f'../data/output/model/best_{best_acc:2.2%}.pth')
        print(f'Best model saved with accuracy: {best_acc:2.2%}')
    else:
        trials += 1
        if trials >= patience:
            print(f'Early stopping on epoch {epoch}')
            break
print("-----------------------\nTraining is done!!")


# Store the loss histories, one value per line.
path_training_loss = "../data/output/training_loss_modify.csv"
np.savetxt(path_training_loss,
           [str(value) for value in train_loss_list],
           fmt='%s')
path_valid_loss = "../data/output/valid_loss_modify.csv"
np.savetxt(path_valid_loss,
           [str(value) for value in valid_loss_list],
           fmt='%s')

In [None]:
from numpy import genfromtxt
from matplotlib import pyplot as plt

path_training_loss = "../data/output/training_loss_modify.csv"
path_valid_loss = "../data/output/valid_loss_modify.csv"

# One loss value per line; atleast_1d keeps a single-epoch history plottable.
train_loss_load = np.atleast_1d(genfromtxt(path_training_loss, delimiter ="\n"))
valid_loss_load = np.atleast_1d(genfromtxt(path_valid_loss, delimiter ="\n"))

# Plotting the learning curves.  The x axis is sized from the stored history,
# which is shorter than n_epochs whenever training stopped early.
epochs_list = range(1, len(train_loss_load) + 1)
plt.plot(epochs_list, train_loss_load, epochs_list, valid_loss_load)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(['Training','Validation'])
plt.title("Performance of LSTM model. Training loss vs. Validation loss")
# plt.show()

plt.savefig('learning_curve.png')

### Model Evaluation

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt


# Evaluate the stored checkpoint on the validation set.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LSTMClassifier(input_dim, hidden_dim, layer_dim, output_dim).to(device)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load("../data/output/model/best_99.39%.pth", map_location=device))
model.eval()

nb_classes = 4
confussion_matrix = torch.zeros(nb_classes, nb_classes)
# Start from zero: these names are also used by the training loop and would
# otherwise carry its last-epoch counts into this accuracy.
correct, total = 0, 0
with torch.no_grad():
    for x_val, y_val in val_dl:
        x_val, y_val = x_val.to(device), y_val.to(device)
        out = model(x_val)
        preds = out.argmax(dim=1)
        # get accuracy
        total += y_val.size(0)
        correct += (preds == y_val).sum().item()

        # get confussion matrix (rows: true class, columns: predicted class)
        for t, p in zip(y_val.view(-1).tolist(), preds.view(-1).tolist()):
            confussion_matrix[t, p] += 1

# correct / total is a fraction; format it as a percentage.
print(f"Accuracy: {correct / total:.2%}")
print(f"Confussion Matrix")
plt.figure(figsize = (10,7))
sns_plot = sns.heatmap(confussion_matrix.numpy(), annot=True, cmap =sns.cm.rocket_r,linecolor='white', linewidths=1)
plt.savefig("confussion_matrix.png", dpi=400)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import _LRScheduler
import pandas

In [None]:
# Seed both NumPy's global generator and PyTorch's generator so this section
# behaves the same on every run.
seed =  1
np.random.seed(seed)
torch.manual_seed(seed)
#torch.cuda.set_device(0)  # if you have more than one CUDA device

In [None]:
# Load the saved test features and split them into one frame per file_id,
# every frame ordered by time.
path_features_test = "../data/output/features-test.csv"

feature_test = pandas.read_csv(path_features_test)
TEST = []
# sorted(): set iteration order is unspecified, and later cells pick the
# recordings by list position.
for file_id in sorted(set(feature_test["file_id"].values)):
    feature_test_part = feature_test[feature_test.file_id == file_id].sort_values(by=['time'])
    TEST.append(feature_test_part)

In [None]:
# Number of consecutive feature rows that form one model input sequence.
SIZE_WINDOW = 16

In [None]:
class LSTMClassifier(nn.Module):
    """Very simple implementation of LSTM-based time-series classifier.

    The input is a batch-first tensor ``(batch, time, input_size)``.  The LSTM
    output of the last time step goes through dropout, tanh and a linear
    layer, giving ``(batch, output_size)`` unnormalised class scores.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the LSTM hidden state.
        layers: number of stacked LSTM layers.
        output_size: number of classes.
    """

    def __init__(self, input_size, hidden_size, layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.layers = layers
        self.lstm = nn.LSTM(input_size, hidden_size, layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.15)
        self.actv = nn.Tanh()

    def forward(self, x):
        """Return the class scores for a batch of sequences."""
        h0, c0 = self.init_hidden(x)
        lstm_out, _ = self.lstm(x, (h0, c0))
        # Only the last time step is used for classification.
        return self.fc(self.actv(self.dropout(lstm_out[:, -1, :])))

    def init_hidden(self, x):
        """Return zero ``[h0, c0]`` states sized for the batch in ``x``.

        The states are created directly on the device of ``x``; no global CUDA
        flag is needed, so this section also runs when executed on its own.
        """
        shape = (self.layers, x.size(0), self.hidden_size)
        return [torch.zeros(shape, device=x.device, dtype=x.dtype) for _ in range(2)]

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt

# Network dimensions; they must match the ones used to train the checkpoint.
input_dim = 6
hidden_dim = 256
layer_dim = 3
output_dim = 4
seq_dim = 16


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LSTMClassifier(input_dim, hidden_dim, layer_dim, output_dim).to(device)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load("../data/output/model/best_99.39%.pth", map_location=device)) ### Change this appropriately to the path to your best model
model.eval()

In [None]:
from collections import Counter

# Post-processing settings.
FILTER_OUTPUT = True            # smooth the predictions before writing them
SLIDING_WINDOW = 21             # nominal width of the voting window
ACCEPT_THRESHOLD_ZEROS = 55     # percent of votes needed to switch a label to 0
ACCEPT_THRESHOLD_OTHERS = 50    # percent of votes needed to switch to any other label


def post_process(label_file__, NUM_INTERATION = 1):
    """Smooth a predicted label sequence with a sliding majority vote.

    For every position from ``SLIDING_WINDOW // 2`` onwards, the most frequent
    label among the neighbouring positions is determined.  The label at that
    position is replaced by the winner when the winner's share, in whole
    percent of ``SLIDING_WINDOW``, exceeds the threshold for that label.
    Replacements are written into the working copy immediately, so later
    windows already see them.  The input sequence is not modified.

    NOTE(review): the window is ``[i - SLIDING_WINDOW // 2, i + SLIDING_WINDOW // 2)``,
    i.e. 20 positions for the value 21, while the share is divided by 21, and
    windows near the end of the sequence are shorter still - confirm whether
    this is intended.

    Args:
        label_file__: sequence of labels supporting ``copy()`` and slicing.
        NUM_INTERATION: number of smoothing passes.

    Returns:
        The smoothed copy of the sequence.
    """
    smoothed = label_file__.copy()
    half = SLIDING_WINDOW // 2
    for _ in range(NUM_INTERATION):
        for position in range(half, len(smoothed)):
            votes = Counter(smoothed[position - half : position + half])
            # max() keeps the first label among equally frequent ones.
            winner = max(votes, key=votes.get)
            if smoothed[position] == winner:
                continue
            share = int(votes[winner] / SLIDING_WINDOW * 100)
            # Switching to label 0 needs a larger majority than switching to
            # any other label.
            needed = ACCEPT_THRESHOLD_ZEROS if winner == 0 else ACCEPT_THRESHOLD_OTHERS
            if share > needed:
                smoothed[position] = winner

    return smoothed

In [None]:
# NOTE: earlier draft of the inference cell that follows, kept for reference
# and disabled by wrapping it in a string literal.  The delimiters were four
# quotes, which left an unterminated string and made the cell a SyntaxError.
"""
import csv

x_titles = [ii for ii in TEST[0].keys() if ii not in ['index', 'file_id', 'subject_id', 'series_id', 'time', 'measurements', 'label']]

label_files = []
for test__ in TEST:
    
    subject = test__['subject_id'].values[0]
    path_file_csv = "../data/output/subject_{:03d}_01__y.csv".format(subject) # change this to the output folder as you will
    print(path_file_csv)
    
    x_test__ = test__[x_titles]
    #print(x_test__.head())

    label_list = []
    for id_measurement in range(len(x_test__)):
        if id_measurement < SIZE_WINDOW:
            label = 0
        else:
            x_test_sample__ = x_test__.iloc[id_measurement - SIZE_WINDOW: id_measurement].to_numpy().reshape(16, 6)
            x_test_sample_tensor = torch.tensor(x_test_sample__, dtype=torch.float32).unsqueeze(0).cuda()
            # x_test_sample_tensor = torch.tensor(x_test_sample__, dtype=torch.float32).unsqueeze(0)
            output = model(x_test_sample_tensor)
            label = F.log_softmax(output, dim=1).argmax(dim=1).item()
        label_list.append(label)
    label_files.append(label_list)
    
    
    with open(path_file_csv, 'w') as myfile:
        wr = csv.writer(myfile, delimiter=',')
        wr.writerow(label_list)
"""

In [None]:
import csv

# Predict one label per feature row of every test recording and write the
# result as a single-column CSV file per subject.
x_titles = [ii for ii in TEST[0].keys() if ii not in ['index', 'file_id', 'subject_id', 'series_id', 'time', 'measurements', 'label']]

# Run on whichever device the loaded model lives on.
device = next(model.parameters()).device

label_files = []
for test__ in TEST:

    subject = test__['subject_id'].values[0]

    x_values = test__[x_titles].to_numpy()
    n_rows = len(x_values)

    label_list = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for id_measurement in range(n_rows):

            # The first SIZE_WINDOW rows have no complete history window; they
            # and the last SIZE_WINDOW + 1 rows are labelled 0 without
            # consulting the model.
            if id_measurement < SIZE_WINDOW or id_measurement > n_rows - 2 - SIZE_WINDOW:
                label = 0
            else:
                # The window shape follows SIZE_WINDOW and the number of
                # feature columns instead of a hard-coded (16, 6).
                window = x_values[id_measurement - SIZE_WINDOW: id_measurement]
                batch = torch.tensor(window, dtype=torch.float32).unsqueeze(0).to(device)
                label = model(batch).argmax(dim=1).item()
            label_list.append(label)

    label_list__ = post_process(label_list) if FILTER_OUTPUT else label_list
    path_file_csv = "../data/output/subject_{:03d}_01__y.csv".format(subject)
    print(path_file_csv)
    # newline='' is what the csv module expects; without it Windows writes an
    # extra blank line after every row.
    with open(path_file_csv, 'w', newline='') as myfile:
        wr = csv.writer(myfile, delimiter=',')
        wr.writerows([label] for label in label_list__)
    label_files.append(label_list__)

In [None]:
# Predicted labels of the test recording at list position 0 (presumably
# subject 9 - verify against the printed file names).
subject9 = label_files[0]
print(len(subject9))

In [None]:
# Plot the predicted label over the sample index.  The axis is sized from the
# data, so the cell keeps working when the sequence length changes.
x = np.arange(len(subject9))
plt.figure()
plt.plot(x, subject9)
plt.show()

In [None]:
# Predicted labels of the test recording at list position 1 (presumably
# subject 10 - verify against the printed file names).
subject10 = label_files[1]
print(len(subject10))

In [None]:
# Plot the predicted label over the sample index.  The axis is sized from the
# data, so the cell keeps working when the sequence length changes.
x = np.arange(len(subject10))
plt.figure()
plt.plot(x, subject10)
plt.show()

In [None]:
# Predicted labels of the test recording at list position 2 (presumably
# subject 11 - verify against the printed file names).
subject11 = label_files[2]
print(len(subject11))

In [None]:
# Plot the predicted label over the sample index.  The axis is sized from the
# data, so the cell keeps working when the sequence length changes.
x = np.arange(len(subject11))
plt.figure()
plt.plot(x, subject11)
plt.show()

In [None]:
# Predicted labels of the test recording at list position 3 (presumably
# subject 12 - verify against the printed file names).
subject12 = label_files[3]
print(len(subject12))

In [None]:
# Plot the predicted label over the sample index.  The axis is sized from the
# data, so the cell keeps working when the sequence length changes.
x = np.arange(len(subject12))
plt.figure()
plt.plot(x, subject12)
plt.show()