In [1]:
%matplotlib inline
import pandas as pd
import datetime
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tsfresh import extract_features
from tsfresh import extract_relevant_features
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import normalize

### Feature Extraction

In [2]:
data = pd.read_csv('../data/data_only_tsfresh_compatible.csv', names = ['x_acc', 'y_acc', 'z_acc', 'id'])
labels = pd.read_csv('../data/labels_only.csv', names = ['Blocking', 'Dodging', 'Inactive', 'Moving', 'Sprinting'])

In [3]:
extracted_features = extract_features(data, column_id = "id", column_sort = None, column_kind = None, column_value = None)

Feature Extraction: 100%|██████████| 10/10 [07:26<00:00, 38.74s/it]


In [4]:
print(extracted_features.shape)

(1068, 2382)


In [5]:
label_arr = labels.values
label_arr = np.argmax(label_arr, axis = 1)

We need to convert these labels also such that there is one label per example (right now there are 150 per example due to the way data was annotated). Also we need to convert the `extracted_features` dataframe into a NumPy array. This is required before attempting any sort of classification, of course.

#### Reducing the labels to the correct format

In [6]:
y_features = np.zeros(extracted_features.shape[0])
for i in range(len(label_arr)) : 
    if i % 150 == 0 : 
        y_features[i // 150] = label_arr[i]
        
# Also converting into Pandas Series for use in extracting relevant features using tsfresh
y = pd.Series(y_features, dtype = int)

In [None]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(extracted_features)
features_filtered = select_features(extracted_features, y)

In [8]:
print(features_filtered.shape)

(1068, 693)


In [9]:
x_features = np.asarray(extracted_features)
print(x_features.shape)
x_features_relevant = np.asarray(features_filtered)
print(x_features_relevant.shape)

(1068, 2382)
(1068, 693)


### Separating the examples into training, validation and testing sets

In [46]:
x_train, x_test, y_train, y_test = train_test_split(x_features_relevant, y_features)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(801, 693)
(267, 693)
(801,)
(267,)


#### Normalizing the features

In [47]:
x_train_norm = normalize(x_train)
x_test_norm = normalize(x_test)

In [28]:
class_wt = {0 : 2, 1 : 2, 2 : 2, 3 : 1, 4 : 1}
svm_lin = RandomForestClassifier(n_estimators = 100, class_weight = class_wt)
svm_lin.fit(x_train_norm, y_train)
y_pred = svm_lin.predict(x_test_norm)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
y_pred = svm_lin.predict(x_train_norm)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[ 0  0  6 32  4]
 [ 0  1  3 25  9]
 [ 1  0 16 22  1]
 [ 0  0 12 77  6]
 [ 0  1  1 40 10]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        42
         1.0       0.50      0.03      0.05        38
         2.0       0.42      0.40      0.41        40
         3.0       0.39      0.81      0.53        95
         4.0       0.33      0.19      0.24        52

    accuracy                           0.39       267
   macro avg       0.33      0.29      0.25       267
weighted avg       0.34      0.39      0.30       267

[[ 87   0   0   0   0]
 [  0  80   0   0   1]
 [  0   0 169   2   2]
 [  0   2   1 312   1]
 [  0   0   1   1 142]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        87
         1.0       0.98      0.99      0.98        81
         2.0       0.99      0.98      0.98       173
         3.0       0.99      0.99      0.99       316
         4.0       0.97      0.99

There is some improvement in accuracy using both all features and only relevant features. But there is still some bias due to the unbalanced classes. So, need to re-try this using the balanced training set. 

#### Reducing some examples from class 3 (since it has most examples)

In [48]:
new_features = list()
new_labels = list()
for features, label in zip(x_train_norm, y_train) : 
    if (label == 3) :
        if np.random.random() > 0.55 : 
            new_features.append(features)
            new_labels.append(label)
    elif (label == 2 or label == 4) :
        if np.random.random() < 0.75 : 
            new_features.append(features)
            new_labels.append(label)
    else : 
        new_features.append(features)
        new_labels.append(label)

x_train_norm = new_features
y_train = new_labels

new_features = list()
new_labels = list()
for features, label in zip(x_test_norm, y_test) : 
    if (label == 3) :
        if np.random.random() > 0.5 : 
            new_features.append(features)
            new_labels.append(label)
    elif (label == 2 or label == 4) :
        if np.random.random() < 0.75 : 
            new_features.append(features)
            new_labels.append(label)
    else : 
        new_features.append(features)
        new_labels.append(label)
        
x_test_norm = new_features
y_test = new_labels

In [49]:
x_train_norm = np.vstack(x_train_norm)
x_test_norm = np.vstack(x_test_norm)
y_train = np.vstack(y_train)
y_train = y_train.ravel()
y_test = np.vstack(y_test)
y_test = y_test.ravel()
print(x_train_norm.shape)
print(x_test_norm.shape)
print(y_train.shape)
print(y_test.shape)

(557, 693)
(188, 693)
(557,)
(188,)


In [52]:
# class_wt = {0 : 2, 1 : 2, 2 : 2, 3 : 1, 4 : 2}
svm_lin = RandomForestClassifier(n_estimators = 1000, class_weight = None)
svm_lin.fit(x_train_norm, y_train)
y_pred = svm_lin.predict(x_test_norm)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
y_pred = svm_lin.predict(x_train_norm)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[ 5  3  5  7  3]
 [ 0 13  5  9 10]
 [ 2  3 25 11  7]
 [ 4  4 13 20  8]
 [ 0  3  3  9 16]]
              precision    recall  f1-score   support

         0.0       0.45      0.22      0.29        23
         1.0       0.50      0.35      0.41        37
         2.0       0.49      0.52      0.51        48
         3.0       0.36      0.41      0.38        49
         4.0       0.36      0.52      0.43        31

    accuracy                           0.42       188
   macro avg       0.43      0.40      0.40       188
weighted avg       0.43      0.42      0.42       188

[[105   0   1   0   0]
 [  0  81   0   0   1]
 [  0   0 126   0   0]
 [  0   0   1 131   0]
 [  0   1   0   0 110]]
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00       106
         1.0       0.99      0.99      0.99        82
         2.0       0.98      1.00      0.99       126
         3.0       1.00      0.99      1.00       132
         4.0       0.99      0.99