# Actigraphy data classification

### Authors:
* Alexsandro Santos da Rosa Júnior
* Giovanni Benedetti da Rosa
* Paulo Roberto de Moura Júnior

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.exceptions import ConvergenceWarning

import warnings
# Silence every UserWarning and every sklearn ConvergenceWarning for the rest of the notebook.
# NOTE(review): the UserWarning filter is broad and can also hide unrelated library
# warnings — confirm this is intended.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

## Data

In [None]:
# Read the two input tables from CSV files (paths are relative).
# presumably X holds the extracted features and y the class labels — verify against the files
features_csv, labels_csv = 'extracted_features.csv', 'timeseries_classification.csv'
X = pd.read_csv(features_csv)
y = pd.read_csv(labels_csv)

In [None]:
# NOTE: this cell mutates X and y in place, so run it once per fresh load.

# Move the first column of X out of the table and use its values as the row index.
first_column = X.columns[0]
X.index = X.pop(first_column).values

# Discard y's 'index' column, then move 'id' out of the table and use it as the row index.
y.drop(columns='index', inplace=True)
y.index = y.pop('id').values

### Removing variables with high correlation

In [None]:
# Working copy of the table; columns are dropped from this copy, not from X.
_X = X.copy()
# Absolute value of the pairwise correlations between the columns of X.
corr = X.corr().abs()

In [None]:
# Mask that is True strictly above the diagonal, so each column pair is looked at once
above_diagonal = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(above_diagonal)

# Flag every column whose absolute correlation with an earlier column exceeds 0.80
exceeds_threshold = (upper > 0.80).any(axis=0)
to_drop = [name for name, flagged in exceeds_threshold.items() if flagged]

# Remove the flagged columns from the working copy
_X.drop(columns=to_drop, inplace=True)

In [None]:
# Display the names of the columns that were dropped from _X
to_drop

['var',
 '0.75',
 'var_day_1',
 'var_day_3',
 'var_day_5',
 'var_day_7',
 'hjorth_complexity',
 'permutation_entropy_n_3',
 'permutation_entropy_n_4',
 'above_0.8*max',
 'above_0.9*max',
 'above_0.7*mean',
 'above_0.8*mean',
 'above_0.9*mean',
 'above_mean',
 'abs_energy',
 'cid_ce',
 'fft_skew',
 'fft_kurtosis',
 'skewness',
 'intradaily_variability']

## Models

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Value passed as `cv` to the cross_val_score calls in the model cells below
cv=10

### Quadratic Discriminant Analysis (QDA)

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Standardise the features, then fit QDA; accuracy is estimated with cross-validation on _X.
qda_clf = QuadraticDiscriminantAnalysis()
pipe_qda_clf = make_pipeline(StandardScaler(), qda_clf)
qda_score = cross_val_score(
    estimator=pipe_qda_clf,
    X=_X,
    y=y.values.ravel(),
    scoring='accuracy',
    cv=cv,
)
qda_mean, qda_std = qda_score.mean(), qda_score.std()
print(f'QDA mean score: {qda_mean:.2f} \nStandard deviation: {qda_std:.2f}')

QDA mean score: 0.41 
Standard deviation: 0.03


### Random Forest

#### Model performance before hyperparameter tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: only the seed is set, every other hyperparameter is left at its default.
rf_clf = RandomForestClassifier(random_state=42)
pipe_rf_clf = make_pipeline(StandardScaler(), rf_clf)
rf_score = cross_val_score(
    estimator=pipe_rf_clf,
    X=_X,
    y=y.values.ravel(),
    scoring='accuracy',
    cv=cv,
)
rf_mean, rf_std = rf_score.mean(), rf_score.std()
print(f'Random Forest mean score: {rf_mean:.2f} \nStandard deviation: {rf_std:.2f}')

Random Forest mean score: 0.40 
Standard deviation: 0.09


#### Hyperparameter tuning

In [None]:
# Search space for the random forest; the "rf__" prefix addresses the pipeline step named "rf".
rf_param_grid = dict(
    rf__n_estimators=[25, 50, 100, 150],
    rf__max_features=['sqrt', 'log2', None],
    rf__max_depth=[3, 6, 9],
    rf__max_leaf_nodes=[3, 6, 9],
)

In [None]:
# Grid-search the random forest hyperparameters listed in `rf_param_grid`.
# The fold count comes from the shared `cv` setting instead of a second hard-coded 10,
# so tuning and the cross_val_score evaluations cannot drift apart.
rf_pipe = Pipeline(
    steps=[("scaler", StandardScaler()), ("rf", RandomForestClassifier(random_state=42))],
    verbose=2,
)

rf_search = GridSearchCV(rf_pipe, rf_param_grid, scoring='accuracy', cv=cv, n_jobs=-1)
rf_search.fit(_X, y.values.ravel())
print(f"Best parameter (CV score={rf_search.best_score_:0.3f}):")
print(rf_search.best_params_)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.2s
Best parameter (CV score=0.428):
{'rf__max_depth': 3, 'rf__max_features': 'sqrt', 'rf__max_leaf_nodes': 3, 'rf__n_estimators': 100}


#### Model performance after hyperparameter tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters hard-coded from the grid search output printed above.
tuned_rf_params = dict(max_depth=3, max_features='sqrt', max_leaf_nodes=3, n_estimators=100)

rf_clf = RandomForestClassifier(random_state=42, **tuned_rf_params)
pipe_rf_clf = make_pipeline(StandardScaler(), rf_clf)
rf_score = cross_val_score(
    estimator=pipe_rf_clf,
    X=_X,
    y=y.values.ravel(),
    scoring='accuracy',
    cv=cv,
)
rf_mean, rf_std = rf_score.mean(), rf_score.std()
print(f'Random Forest mean score: {rf_mean:.2f} \nStandard deviation: {rf_std:.2f}')

Random Forest mean score: 0.43 
Standard deviation: 0.09


### Multi-layer Perceptron classifier

#### Hyperparameter tuning

In [None]:
# Search space for the MLP; the "nn__" prefix addresses the pipeline step named "nn".
# A single hidden layer is tried at every width from 1 to 21 units.
hidden_layer_options = [(units,) for units in range(1, 22)]

param_grid = [
    dict(
        nn__activation=['identity', 'logistic', 'tanh', 'relu'],
        nn__solver=['lbfgs', 'sgd', 'adam'],
        nn__hidden_layer_sizes=hidden_layer_options,
    )
]

In [None]:
from sklearn.neural_network import MLPClassifier

# Grid-search the MLP hyperparameters listed in `param_grid`.
# The fold count comes from the shared `cv` setting instead of a second hard-coded 10,
# so tuning and the cross_val_score evaluations cannot drift apart.
pipe = Pipeline(
    steps=[("scaler", StandardScaler()), ("nn", MLPClassifier(random_state=42, max_iter=300))],
    verbose=2,
)

search = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=cv, n_jobs=-1)
search.fit(_X, y.values.ravel())
print(f"Best parameter (CV score={search.best_score_:0.3f}):")
print(search.best_params_)

[Pipeline] ............ (step 1 of 2) Processing scaler, total=   0.0s
[Pipeline] ................ (step 2 of 2) Processing nn, total=   0.1s
Best parameter (CV score=0.441):
{'nn__activation': 'relu', 'nn__hidden_layer_sizes': (2,), 'nn__solver': 'sgd'}


#### Model performance

In [None]:
# Evaluate the configuration the grid search selected:
# {'nn__activation': 'relu', 'nn__hidden_layer_sizes': (2,), 'nn__solver': 'sgd'}.
# This cell previously used hidden_layer_sizes=(6,) and the full table X, which matched
# neither the search result nor the data the search was run on (_X).
# Re-run the cell to refresh the printed score.
mlp_clf = MLPClassifier(
    random_state=42,
    max_iter=300,
    activation='relu',
    hidden_layer_sizes=(2,),
    solver='sgd',
)
pipe_nn_clf = make_pipeline(StandardScaler(), mlp_clf)
# Score on _X, the same reduced feature table the hyperparameters were tuned on.
nn_score = cross_val_score(pipe_nn_clf, _X, y.values.ravel(), scoring='accuracy', cv=cv)
print(f'MLP Classifier mean score: {nn_score.mean():.2f} \nStandard deviation: {nn_score.std():.2f}')

MLP Classifier mean score: 0.51 
Standard deviation: 0.16


## Other attempts

### Support Vector Classification (SVC)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with only the seed set; features are standardised first.
svc_clf = SVC(random_state=42)
pipe_svc_clf = make_pipeline(StandardScaler(), svc_clf)
svc_score = cross_val_score(
    estimator=pipe_svc_clf,
    X=_X,
    y=y.values.ravel(),
    scoring='accuracy',
    cv=cv,
)
svc_mean, svc_std = svc_score.mean(), svc_score.std()
print(f'Suport Vector Classifier mean score: {svc_mean:.2f} \nStandard deviation: {svc_std:.2f}')

Suport Vector Classifier mean score: 0.36 
Standard deviation: 0.09


### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on standardised features, scored by cross-validated accuracy on _X.
nb_clf = GaussianNB()
pipe_nb_clf = make_pipeline(StandardScaler(), nb_clf)
nb_score = cross_val_score(
    estimator=pipe_nb_clf,
    X=_X,
    y=y.values.ravel(),
    scoring='accuracy',
    cv=cv,
)
nb_mean, nb_std = nb_score.mean(), nb_score.std()
print(f'Naive Bayes mean score: {nb_mean:.2f} \nStandard deviation: {nb_std:.2f}')

Naive Bayes mean score: 0.38 
Standard deviation: 0.10


### Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble over the five classifiers built in the earlier cells.
estimators = [('nb', nb_clf), ('rf', rf_clf), ('svc', svc_clf), ('mlp', mlp_clf), ('qda', qda_clf)]
eclf = VotingClassifier(estimators=estimators, voting='hard')
pipe_eclf = make_pipeline(StandardScaler(), eclf)
# Score on _X (highly correlated columns removed), the same table used by the QDA,
# random forest, SVC and naive Bayes cells. This cell previously scored on the full
# table X, so its result was not comparable with theirs.
eclf_score = cross_val_score(pipe_eclf, _X, y.values.ravel(), scoring='accuracy', cv=cv)
print(f'Voting Classifier mean score: {eclf_score.mean():.2f} \nStandard deviation: {eclf_score.std():.2f}')

Voting Classifier mean score: 0.37 
Standard deviation: 0.15
