#### This notebook contains code for testing `tsfresh` feature extraction and SMOTE on running standard deviation of raw data.

In [57]:
%matplotlib inline
import pandas as pd
import datetime
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import numpy as np
from tsfresh import extract_features
from tsfresh import extract_relevant_features
import sklearn
import sklearn.naive_bayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import normalize
from imblearn.over_sampling import SMOTE
from collections import Counter
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

In [58]:
data = pd.read_csv('../data/data_only_tsfresh_compatible.csv', names = ['x_acc', 'y_acc', 'z_acc', 'id'])
labels = pd.read_csv('../data/labels_only.csv', names = ['Blocking', 'Dodging', 'Inactive', 'Moving', 'Sprinting'])

In [59]:
x_acc = np.asarray(data['x_acc'])
y_acc = np.asarray(data['y_acc'])
z_acc = np.asarray(data['z_acc'])

def running_std_dev(x, window_size = 50) : 
    num_examples = len(x) // 150
    out = np.zeros((len(x) - (num_examples * window_size)))
    for i in range(num_examples) :
        for j in range(150 - window_size) : 
            out[(i * (150 - window_size)) + j] = np.std(x[(i * 150) + j : (i * 150) + j + window_size])
            
    return out

outp = running_std_dev(x_acc, window_size = 20)
print(outp.shape)

(138840,)


In [60]:
window_size = 20
x_acc_std = running_std_dev(x_acc, window_size = window_size)
y_acc_std = running_std_dev(y_acc, window_size = window_size)
z_acc_std = running_std_dev(z_acc, window_size = window_size)

In [61]:
# Generating `id` column needed in dataframe for tsfresh
idx = list()
k = 0
for i in range(len(data) // 150) : 
    for j in range(150 - window_size) : 
        idx.append(k)
    k = k + 1
    
print(len(idx))

138840


In [62]:
# Creating dataframe of standard deviation data for later feature extraction using tsfresh
# and also saving to memory for quick future use
data_std = pd.DataFrame()
data_std['id'] = idx
data_std['x_acc'] = x_acc_std
data_std['y_acc'] = y_acc_std
data_std['z_acc'] = z_acc_std
print(data_std.head())
data_std.to_csv('../data/std_dev_data_only.csv', header = None, index = None)

   id     x_acc     y_acc     z_acc
0   0  0.005440  0.004272  0.009425
1   0  0.005813  0.002630  0.011778
2   0  0.005805  0.002815  0.011892
3   0  0.005278  0.003170  0.011793
4   0  0.004613  0.003832  0.012200


In [63]:
# Loading standard deviation data from memory
data_std = pd.read_csv('../data/std_dev_data_only.csv', names = ['id', 'x_acc', 'y_acc', 'z_acc'])
print(data_std.tail())

          id     x_acc     y_acc     z_acc
138835  1067  0.107691  0.087052  0.049347
138836  1067  0.109661  0.100508  0.052019
138837  1067  0.107994  0.130055  0.063816
138838  1067  0.107784  0.160016  0.082226
138839  1067  0.108898  0.162559  0.082415


In [64]:
# Converting labels into correct format (since size of each example has decreased after finding running standard deviation)
label_arr = labels.values
label_arr = np.argmax(label_arr, axis = 1)

y_features = np.zeros(len(data_std) // (150 - window_size))
for i in range(len(label_arr)) : 
    if i % 150 == 0 : 
        y_features[i // 150] = label_arr[i]
        
# Also converting into Pandas Series for use in extracting relevant features using tsfresh
y = pd.Series(y_features, dtype = int)

In [65]:
# Now using tsfresh to extract features
extracted_features = extract_features(data_std, column_id = "id", column_sort = None, column_kind = None, column_value = None)
print(extracted_features.shape)

Feature Extraction: 100%|██████████| 10/10 [05:55<00:00, 32.40s/it]


(1068, 2382)


In [None]:
extracted_features = impute(extracted_features)
features_filtered = select_features(extracted_features, y)
print(features_filtered.shape)

In [67]:
print(features_filtered.shape)

(1068, 1048)


In [68]:
# Convert feature dataframes into numpy arrays (required for training in sklearn)
x_features = np.asarray(extracted_features)
print(x_features.shape)
x_features_relevant = np.asarray(features_filtered)
print(x_features_relevant.shape)
# Gives the number of examples per label
print(Counter(y_features))

(1068, 2382)
(1068, 1048)
Counter({3.0: 411, 2.0: 213, 4.0: 196, 0.0: 129, 1.0: 119})


### Shuffle and split into train/test datasets and normalize the features

In [73]:
x_train, x_test, y_f, y_test = train_test_split(x_features_relevant, y_features)
print(x_train.shape)
print(x_test.shape)
print(y_f.shape)
print(y_test.shape)
x_f = normalize(x_train)
x_test_norm = normalize(x_test)
print(Counter(y_f))

(801, 1048)
(267, 1048)
(801,)
(267,)
Counter({3.0: 313, 2.0: 157, 4.0: 144, 0.0: 95, 1.0: 92})


In [74]:
sm = SMOTE(random_state = 33)
x_train_norm, y_train = sm.fit_resample(x_f, y_f)
print(Counter(y_train))
print(y_train.dtype)
y_train = y_train.astype(int)
print(y_train.dtype)
y_test = y_test.astype(int)

Counter({3.0: 313, 2.0: 313, 4.0: 313, 0.0: 313, 1.0: 313})
float64
int64


In [76]:
svm_lin = RandomForestClassifier(n_estimators = 300)
svm_lin.fit(x_train_norm, y_train)
y_pred = svm_lin.predict(x_test_norm)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
y_pred = svm_lin.predict(x_train_norm)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[ 5  1  8 13  7]
 [ 0  3  4  9 11]
 [ 4  1 28 15  8]
 [11 11 20 36 20]
 [ 4  5  5 10 28]]
              precision    recall  f1-score   support

           0       0.21      0.15      0.17        34
           1       0.14      0.11      0.12        27
           2       0.43      0.50      0.46        56
           3       0.43      0.37      0.40        98
           4       0.38      0.54      0.44        52

    accuracy                           0.37       267
   macro avg       0.32      0.33      0.32       267
weighted avg       0.36      0.37      0.36       267

[[312   0   1   0   0]
 [  0 313   0   0   0]
 [  0   0 309   2   2]
 [  0   1   0 312   0]
 [  0   0   1   1 311]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       313
           1       1.00      1.00      1.00       313
           2       0.99      0.99      0.99       313
           3       0.99      1.00      0.99       313
           4       0.99      0.99