### Synthetic Minority Oversampling TechniquE (SMOTE)
This notebook has code that tries to use SMOTE (from `imblearn` library) to improve the performance of the classifier

In [9]:
%matplotlib inline
import pandas as pd
import datetime
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import numpy as np
from tsfresh import extract_features
from tsfresh import extract_relevant_features
import sklearn
import sklearn.naive_bayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import normalize
from imblearn.over_sampling import SMOTE
from collections import Counter

#### Extracting features using `tsfresh`

In [3]:
data = pd.read_csv('../data/data_only_tsfresh_compatible.csv', names = ['x_acc', 'y_acc', 'z_acc', 'id'])
labels = pd.read_csv('../data/labels_only.csv', names = ['Blocking', 'Dodging', 'Inactive', 'Moving', 'Sprinting'])

In [4]:
extracted_features = extract_features(data, column_id = "id", column_sort = None, column_kind = None, column_value = None)
print(extracted_features.shape)

Feature Extraction: 100%|██████████| 10/10 [06:13<00:00, 33.01s/it]


(1068, 2382)


In [5]:
label_arr = labels.values
label_arr = np.argmax(label_arr, axis = 1)

y_features = np.zeros(extracted_features.shape[0])
for i in range(len(label_arr)) : 
    if i % 150 == 0 : 
        y_features[i // 150] = label_arr[i]
        
# Also converting into Pandas Series for use in extracting relevant features using tsfresh
y = pd.Series(y_features, dtype = int)

In [None]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(extracted_features)
features_filtered = select_features(extracted_features, y)

In [20]:
x_features = np.asarray(extracted_features)
print(x_features.shape)
x_features_relevant = np.asarray(features_filtered)
print(x_features_relevant.shape)
# Gives the number of examples per label
print(Counter(y_features))

(1068, 2382)
(1068, 693)
Counter({3.0: 411, 2.0: 213, 4.0: 196, 0.0: 129, 1.0: 119})


#### Shuffle and split into train/test datasets, and normalize the datasets

In [24]:
x_train, x_test, y_f, y_test = train_test_split(x_features, y_features)
print(x_train.shape)
print(x_test.shape)
print(y_f.shape)
print(y_test.shape)
x_f = normalize(x_train)
x_test_norm = normalize(x_test)
print(Counter(y_f))

(801, 2382)
(267, 2382)
(801,)
(267,)
Counter({3.0: 300, 2.0: 158, 4.0: 145, 0.0: 106, 1.0: 92})


#### Use Synthetic Minority Oversampling to equalize all classes

In [25]:
sm = SMOTE(random_state = 33)
x_train_norm, y_train = sm.fit_resample(x_f, y_f)
print(Counter(y_train))

Counter({3.0: 300, 4.0: 300, 0.0: 300, 2.0: 300, 1.0: 300})


In [31]:
svm_lin = RandomForestClassifier(n_estimators = 300, class_weight = 'balanced')
svm_lin.fit(x_train_norm, y_train)
y_pred = svm_lin.predict(x_test_norm)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
y_pred = svm_lin.predict(x_train_norm)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[ 0  3  7  9  4]
 [ 0  2  3 14  8]
 [ 3  2 26 16  8]
 [ 6 10 17 56 22]
 [ 2  6  4 12 27]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        23
         1.0       0.09      0.07      0.08        27
         2.0       0.46      0.47      0.46        55
         3.0       0.52      0.50      0.51       111
         4.0       0.39      0.53      0.45        51

    accuracy                           0.42       267
   macro avg       0.29      0.32      0.30       267
weighted avg       0.40      0.42      0.40       267

[[300   0   0   0   0]
 [  0 299   0   1   0]
 [  0   0 298   2   0]
 [  0   1   1 298   0]
 [  0   1   1   1 297]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       300
         1.0       0.99      1.00      1.00       300
         2.0       0.99      0.99      0.99       300
         3.0       0.99      0.99      0.99       300
         4.0       1.00      0.99

Even using SMOTE did not improve the test accuracy, neither did it significantly reduce the bias due to class unbalance. So, now another option is to repeat the above feature extraction and class balancing on a running window based preprocessing of the data rather than the raw data.
### TODO - HIGH-PRIORITY