In [1]:
%matplotlib inline
import pandas as pd
import datetime
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tsfresh import extract_features
from tsfresh import extract_relevant_features
import sklearn
import sklearn.naive_bayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import normalize

### Feature Extraction

In [2]:
data = pd.read_csv('../data/data_only_tsfresh_compatible.csv', names = ['x_acc', 'y_acc', 'z_acc', 'id'])
labels = pd.read_csv('../data/labels_only.csv', names = ['Blocking', 'Dodging', 'Inactive', 'Moving', 'Sprinting'])

In [3]:
extracted_features = extract_features(data, column_id = "id", column_sort = None, column_kind = None, column_value = None)

Feature Extraction: 100%|██████████| 10/10 [06:04<00:00, 31.46s/it]


In [9]:
print(extracted_features.shape)

(1068, 2382)


In [10]:
label_arr = labels.values
label_arr = np.argmax(label_arr, axis = 1)

We need to convert these labels also such that there is one label per example (right now there are 150 per example due to the way data was annotated). Also we need to convert the `extracted_features` dataframe into a NumPy array. This is required before attempting any sort of classification, of course.

#### Reducing the labels to the correct format

In [11]:
y_features = np.zeros(extracted_features.shape[0])
for i in range(len(label_arr)) : 
    if i % 150 == 0 : 
        y_features[i // 150] = label_arr[i]
        
# Also converting into Pandas Series for use in extracting relevant features using tsfresh
y = pd.Series(y_features, dtype = int)

In [None]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

extracted_features = impute(extracted_features)
features_filtered = select_features(extracted_features, y)

In [13]:
print(features_filtered.shape)

(1068, 693)


In [14]:
x_features = np.asarray(extracted_features)
print(x_features.shape)
x_features_relevant = np.asarray(features_filtered)
print(x_features_relevant.shape)

(1068, 2382)
(1068, 693)


### Separating the examples into training, validation and testing sets

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x_features_relevant, y_features)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(801, 693)
(267, 693)
(801,)
(267,)


#### Normalizing the features

In [17]:
x_train_norm = normalize(x_train)
x_test_norm = normalize(x_test)

# Making the type of the labels `int` explicitly
y_test = y_test.astype(int)
y_train = y_train.astype(int)

In [18]:
class_wt = {0 : 2, 1 : 2, 2 : 2, 3 : 1, 4 : 1}
svm_lin = RandomForestClassifier(n_estimators = 100, class_weight = class_wt)
svm_lin.fit(x_train_norm, y_train)
y_pred = svm_lin.predict(x_test_norm)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
y_pred = svm_lin.predict(x_train_norm)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[ 0  2  9 21  4]
 [ 0  2  1 16  5]
 [ 0  0 15 29  1]
 [ 0  1  8 87 14]
 [ 0  2  1 31 18]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        36
           1       0.29      0.08      0.13        24
           2       0.44      0.33      0.38        45
           3       0.47      0.79      0.59       110
           4       0.43      0.35      0.38        52

    accuracy                           0.46       267
   macro avg       0.33      0.31      0.30       267
weighted avg       0.38      0.46      0.39       267

[[ 93   0   0   0   0]
 [  0  95   0   0   0]
 [  0   0 168   0   0]
 [  0   3   4 293   1]
 [  0   2   2   1 139]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        93
           1       0.95      1.00      0.97        95
           2       0.97      1.00      0.98       168
           3       1.00      0.97      0.98       301
           4       0.99      0.97

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


There is some improvement in accuracy using both all features and only relevant features. But there is still some bias due to the unbalanced classes. So, need to re-try this using the balanced training set. 

### Balancing classes using **Undersampling** before training
#### Reducing some examples from class 3 (since it has most examples) and also some examples from class 2 and 4

In [48]:
new_features = list()
new_labels = list()
for features, label in zip(x_train_norm, y_train) : 
    if (label == 3) :
        if np.random.random() > 0.55 : 
            new_features.append(features)
            new_labels.append(label)
    elif (label == 2 or label == 4) :
        if np.random.random() < 0.75 : 
            new_features.append(features)
            new_labels.append(label)
    else : 
        new_features.append(features)
        new_labels.append(label)

x_train_norm = new_features
y_train = new_labels

new_features = list()
new_labels = list()
for features, label in zip(x_test_norm, y_test) : 
    if (label == 3) :
        if np.random.random() > 0.5 : 
            new_features.append(features)
            new_labels.append(label)
    elif (label == 2 or label == 4) :
        if np.random.random() < 0.75 : 
            new_features.append(features)
            new_labels.append(label)
    else : 
        new_features.append(features)
        new_labels.append(label)
        
x_test_norm = new_features
y_test = new_labels

In [49]:
x_train_norm = np.vstack(x_train_norm)
x_test_norm = np.vstack(x_test_norm)
y_train = np.vstack(y_train)
y_train = y_train.ravel()
y_test = np.vstack(y_test)
y_test = y_test.ravel()
print(x_train_norm.shape)
print(x_test_norm.shape)
print(y_train.shape)
print(y_test.shape)

(557, 693)
(188, 693)
(557,)
(188,)


In [66]:
# class_wt = {0 : 2, 1 : 2, 2 : 2, 3 : 1, 4 : 2}
svm_lin = sklearn.ensemble.RandomForestClassifier(n_estimators = 100)
svm_lin.fit(x_train_norm, y_train)
y_pred = svm_lin.predict(x_test_norm)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
y_pred = svm_lin.predict(x_train_norm)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[ 6  2  6  5  4]
 [ 3 12  5  6 11]
 [ 7  2 23 10  6]
 [ 6  6 10 20  7]
 [ 1  3  4 11 12]]
              precision    recall  f1-score   support

         0.0       0.26      0.26      0.26        23
         1.0       0.48      0.32      0.39        37
         2.0       0.48      0.48      0.48        48
         3.0       0.38      0.41      0.40        49
         4.0       0.30      0.39      0.34        31

    accuracy                           0.39       188
   macro avg       0.38      0.37      0.37       188
weighted avg       0.40      0.39      0.39       188

[[106   0   0   0   0]
 [  0  81   0   0   1]
 [  1   0 125   0   0]
 [  0   0   1 131   0]
 [  0   1   0   0 110]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       106
         1.0       0.99      0.99      0.99        82
         2.0       0.99      0.99      0.99       126
         3.0       1.00      0.99      1.00       132
         4.0       0.99      0.99

### Balancing classes using **Oversampling** before training
#### Increasing examples in classes 0, 1, 2 and 4 before training

In [105]:
# Getting the features in NumPy arrays
x_features = np.asarray(extracted_features)
print(x_features.shape)
x_features_relevant = np.asarray(features_filtered)
print(x_features_relevant.shape)

(1068, 2382)
(1068, 693)


In [106]:
# Make train-test split
x_train, x_test, y_train, y_test = train_test_split(x_features, y_features)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(801, 2382)
(267, 2382)
(801,)
(267,)


In [107]:
new_features = list()
new_labels = list()
for features, label in zip(x_train, y_train) : 
    # Roughly doubling the number of examples by repeating them
    if (label == 2 or label == 4) :
        new_features.append(features)
        new_labels.append(label)
        new_features.append(features)
        new_labels.append(label)
    # Tripling the number of examples since they are too less even after doubling for classes 0 and 1
    elif (label == 0 or label == 1) : 
        new_features.append(features)
        new_labels.append(label)
        new_features.append(features)
        new_labels.append(label)
        new_features.append(features)
        new_labels.append(label)
    else : 
        new_features.append(features)
        new_labels.append(label)
        
x_ = np.vstack(new_features)
y_ = np.vstack(new_labels)
y_ = y_.ravel()
print(x_.shape)
print(y_.shape)

(1460, 2382)
(1460,)


In [108]:
from sklearn.utils import shuffle
x_, y_ = shuffle(x_, y_)
x_train_norm = normalize(x_)
x_test_norm = normalize(x_test)
y_train = y_

In [110]:
class_wt = {0 : 0.1, 1 : 0.1, 2 : 1, 3 : 1, 4 : 1}
svm_lin = sklearn.ensemble.RandomForestClassifier(n_estimators = 1000)
svm_lin.fit(x_train_norm, y_train)
y_pred = svm_lin.predict(x_test_norm)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
y_pred = svm_lin.predict(x_train_norm)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[ 0  1 10 20  5]
 [ 0  3  2 21  8]
 [ 0  0 18 39  7]
 [ 0  1  9 63 18]
 [ 0  1  4 21 16]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        36
         1.0       0.50      0.09      0.15        34
         2.0       0.42      0.28      0.34        64
         3.0       0.38      0.69      0.49        91
         4.0       0.30      0.38      0.33        42

    accuracy                           0.37       267
   macro avg       0.32      0.29      0.26       267
weighted avg       0.34      0.37      0.32       267



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[[279   0   0   0   0]
 [  0 255   0   0   0]
 [  2   0 294   0   2]
 [  0   2   1 315   2]
 [  0   2   2   0 304]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       279
         1.0       0.98      1.00      0.99       255
         2.0       0.99      0.99      0.99       298
         3.0       1.00      0.98      0.99       320
         4.0       0.99      0.99      0.99       308

    accuracy                           0.99      1460
   macro avg       0.99      0.99      0.99      1460
weighted avg       0.99      0.99      0.99      1460

