# Classify [LIBRAS](http://archive.ics.uci.edu/ml/datasets/Libras+Movement) hand movements from video data mapped to 90 features.

## Methods implemented:
### Dimensionality Reduction:
* PCA

### Machine Learning from scikit-learn:
* LogisticRegression
* MLPClassifier
* DecisionTreeClassifier
* GaussianProcessClassifier
* KNeighborsClassifier

### Model Optimization:
* GridSearchCV - found in helper.py file


## Import and explore data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from IPython.display import display

# Single seed reused wherever a random_state is accepted in this notebook.
seed = 0

# Column labels supplied to read_csv: 'Input 1' .. 'Input 90', then 'class'.
names = list(map('Input {}'.format, range(1, 91))) + ['class']

data = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/libras/movement_libras.data',
    names=names
)
data.head()

Unnamed: 0,Input 1,Input 2,Input 3,Input 4,Input 5,Input 6,Input 7,Input 8,Input 9,Input 10,...,Input 82,Input 83,Input 84,Input 85,Input 86,Input 87,Input 88,Input 89,Input 90,class
0,0.79691,0.38194,0.79691,0.37731,0.79884,0.37731,0.79497,0.37731,0.77563,0.35417,...,0.51389,0.39845,0.42593,0.47389,0.36111,0.55899,0.3125,0.6383,0.29398,1
1,0.67892,0.27315,0.68085,0.27315,0.68085,0.27315,0.68085,0.27315,0.67892,0.26852,...,0.57407,0.17795,0.63657,0.17215,0.67361,0.17021,0.69213,0.17215,0.69213,1
2,0.72147,0.23611,0.7234,0.23611,0.7234,0.23611,0.7234,0.23611,0.7234,0.23611,...,0.30556,0.59768,0.25926,0.67118,0.25231,0.73501,0.2662,0.78143,0.27778,1
3,0.5648,0.32407,0.56286,0.32407,0.56093,0.32407,0.55899,0.32407,0.55899,0.32407,...,0.49074,0.26306,0.42361,0.33269,0.34722,0.41006,0.28009,0.4913,0.24306,1
4,0.67118,0.38426,0.67118,0.38657,0.67311,0.38657,0.67311,0.38426,0.67311,0.37963,...,0.76389,0.44101,0.6412,0.45068,0.54167,0.47776,0.44213,0.53191,0.34259,1


In [2]:
# Target labels and the feature matrix (every column except 'class').
y = data['class']
X = data.drop('class', axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# Summary statistics for every column, including the class label.
display(data.describe())

Unnamed: 0,Input 1,Input 2,Input 3,Input 4,Input 5,Input 6,Input 7,Input 8,Input 9,Input 10,...,Input 82,Input 83,Input 84,Input 85,Input 86,Input 87,Input 88,Input 89,Input 90,class
count,360.0,360.0,360.0,360.0,360.0,360.0,360.0,360.0,360.0,360.0,...,360.0,360.0,360.0,360.0,360.0,360.0,360.0,360.0,360.0,360.0
mean,0.566613,0.555967,0.566377,0.555478,0.56564,0.554552,0.564867,0.552855,0.563094,0.548579,...,0.474814,0.487229,0.469181,0.488287,0.464905,0.490259,0.46376,0.492376,0.463767,8.0
std,0.198916,0.187485,0.198306,0.18732,0.197863,0.187066,0.195968,0.186777,0.192667,0.185901,...,0.172685,0.20988,0.174674,0.213416,0.178629,0.218753,0.184384,0.225507,0.190831,4.326507
min,0.090909,0.14815,0.085106,0.14815,0.085106,0.14815,0.088975,0.14583,0.092843,0.14583,...,0.076389,0.059961,0.050926,0.029014,0.030093,0.011605,0.009259,0.005803,0.006944,1.0
25%,0.444875,0.40741,0.440043,0.40972,0.438587,0.40741,0.43714,0.40972,0.435688,0.40972,...,0.35417,0.32737,0.346642,0.329785,0.34259,0.31963,0.332753,0.31335,0.328122,4.0
50%,0.585105,0.58102,0.585105,0.582175,0.582205,0.582175,0.579305,0.564815,0.572535,0.556715,...,0.48611,0.48162,0.4838,0.474855,0.4838,0.47969,0.48264,0.48743,0.48843,8.0
75%,0.720023,0.72454,0.721952,0.720488,0.720505,0.72222,0.720023,0.71817,0.714215,0.71991,...,0.5978,0.654737,0.59491,0.656675,0.59259,0.66731,0.6088,0.671662,0.61574,12.0
max,0.9323,0.88657,0.93037,0.88889,0.9323,0.88426,0.9323,0.88194,0.9323,0.875,...,0.83102,0.97099,0.81944,0.97872,0.82407,0.98839,0.87269,1.0,0.88426,15.0


The feature values shown above all lie within the range [0, 1], so the data appear to have already been scaled; no additional scaling step is applied.

## Reduce dimensionality using PCA
The performance of the various algorithms using the first 4, 6, and 10 principal components will be assessed.

In [3]:
# Fit each PCA on the training split only, so the held-out rows do not
# influence the learned components. fit() returns the estimator itself,
# which allows construction and fitting in a single expression.
pca_4 = PCA(n_components=4).fit(X_train)
pca_6 = PCA(n_components=6).fit(X_train)
pca_10 = PCA(n_components=10).fit(X_train)

# Project both splits onto the components learned from the training data.
X_train_4 = pca_4.transform(X_train)
X_test_4 = pca_4.transform(X_test)
X_train_6 = pca_6.transform(X_train)
X_test_6 = pca_6.transform(X_test)
X_train_10 = pca_10.transform(X_train)
X_test_10 = pca_10.transform(X_test)

# print(...) with a single argument produces the same output on Python 2
# and Python 3; the bare print statement is a SyntaxError on Python 3.
for n_dims, fitted_pca in ((4, pca_4), (6, pca_6), (10, pca_10)):
    print('Variance explained by {} dimensions: {:.4f}'.format(
        n_dims, sum(fitted_pca.explained_variance_ratio_)))

Variance explained by 4 dimensions: 0.7706
Variance explained by 6 dimensions: 0.8816
Variance explained by 10 dimensions: 0.9737


## Train and test models

In [4]:
import helper

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier


# One baseline classifier per algorithm. Only random_state is set (where it is
# passed); every other hyperparameter is left at the library default.
clf_A = LogisticRegression(random_state=seed)
clf_B = MLPClassifier(random_state=seed)
clf_C = DecisionTreeClassifier(random_state=seed)
clf_D = GaussianProcessClassifier(random_state=seed)
clf_E = KNeighborsClassifier()

# To experiment with a single model, shorten this list, e.g. models = [clf_B].
models = [clf_A, clf_B, clf_C, clf_D, clf_E]

# The same estimator objects are passed to helper.train_predict once per
# PCA dimensionality.
# NOTE(review): con_mat=False presumably suppresses a confusion matrix --
# confirm against helper.py.
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('RESULTS FOR 4 DIMENSIONS\n')
helper.train_predict(X_train_4, X_test_4, y_train, y_test, models, con_mat=False)
print('\nRESULTS FOR 6 DIMENSIONS\n')
helper.train_predict(X_train_6, X_test_6, y_train, y_test, models, con_mat=False)
print('\nRESULTS FOR 10 DIMENSIONS\n')
helper.train_predict(X_train_10, X_test_10, y_train, y_test, models, con_mat=False)

RESULTS FOR 4 DIMENSIONS

Results for LogisticRegression:
Accuracy = 0.2917




Results for MLPClassifier:
Accuracy = 0.4167
Results for DecisionTreeClassifier:
Accuracy = 0.5278
Results for GaussianProcessClassifier:
Accuracy = 0.3889
Results for KNeighborsClassifier:
Accuracy = 0.5417

RESULTS FOR 6 DIMENSIONS

Results for LogisticRegression:
Accuracy = 0.4583
Results for MLPClassifier:
Accuracy = 0.6944
Results for DecisionTreeClassifier:
Accuracy = 0.6667
Results for GaussianProcessClassifier:
Accuracy = 0.5833
Results for KNeighborsClassifier:
Accuracy = 0.6111

RESULTS FOR 10 DIMENSIONS

Results for LogisticRegression:
Accuracy = 0.6250
Results for MLPClassifier:
Accuracy = 0.8194
Results for DecisionTreeClassifier:
Accuracy = 0.7361
Results for GaussianProcessClassifier:
Accuracy = 0.7778
Results for KNeighborsClassifier:
Accuracy = 0.7639


## Optimize models using first 6 and 10 principal components
Performance with the first 4 principal components is extremely poor, so that configuration will not be considered further for now. Using so few components may be revisited in the future.

In [5]:
# Hyperparameter grids, one per classifier, in the same order as `models`.
# A: LogisticRegression -- C stepped by 0.1 starting at 1 (stop value 5.1),
#    with and without balanced class weights.
params_A = [{'C': np.arange(1, 5.1, 0.1), 'class_weight': ['balanced', None]}]
# B: MLPClassifier -- every activation/solver pair, alpha stepped by 5e-5
#    starting at 1e-4 (stop value 1.05e-3).
params_B = [{'activation': ['identity', 'logistic', 'tanh', 'relu'],
             'solver': ['lbfgs', 'sgd', 'adam'],
             'alpha': np.arange(1e-4, 1.05e-3, 5e-5)}]
# C: DecisionTreeClassifier -- split criterion, depth 1..30, min split 2..5.
params_C = [{'criterion': ['gini', 'entropy'],
             'max_depth': range(1, 31),
             'min_samples_split': range(2, 6),
             'class_weight': ['balanced', None]}]
# D: GaussianProcessClassifier -- 0..5 optimizer restarts.
params_D = [{'n_restarts_optimizer': range(6)}]
# E: KNeighborsClassifier -- 1..10 neighbours, Minkowski power p in 1..5.
params_E = [{'n_neighbors': range(1, 11), 'p': range(1, 6)}]

# To tune a single model, shorten both lists together,
# e.g. models = [clf_B] and params = [params_B].
models = [clf_A, clf_B, clf_C, clf_D, clf_E]
params = [params_A, params_B, params_C, params_D, params_E]

# NOTE(review): helper.optimize_models is assumed to pair models[i] with
# params[i] and run the grid search -- confirm against helper.py.
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('RESULTS FOR 6 DIMENSIONS\n')
helper.optimize_models(X_train_6, X_test_6, y_train, y_test, models, params, seed=seed, con_mat=False)
print('\nRESULTS FOR 10 DIMENSIONS\n')
helper.optimize_models(X_train_10, X_test_10, y_train, y_test, models, params, seed=seed, con_mat=False)

RESULTS FOR 6 DIMENSIONS

Results for LogisticRegression:
Accuracy = 0.5000
{'C': 3.1000000000000019, 'class_weight': 'balanced'}
Results for MLPClassifier:
Accuracy = 0.8194
{'alpha': 0.00035000000000000005, 'activation': 'tanh', 'solver': 'lbfgs'}
Results for DecisionTreeClassifier:
Accuracy = 0.6944
{'min_samples_split': 3, 'criterion': 'gini', 'max_depth': 12, 'class_weight': 'balanced'}
Results for GaussianProcessClassifier:
Accuracy = 0.5833
{'n_restarts_optimizer': 0}
Results for KNeighborsClassifier:
Accuracy = 0.7917
{'n_neighbors': 1, 'p': 4}

RESULTS FOR 10 DIMENSIONS

Results for LogisticRegression:
Accuracy = 0.6528
{'C': 4.8000000000000034, 'class_weight': 'balanced'}
Results for MLPClassifier:
Accuracy = 0.8333
{'alpha': 0.00015000000000000001, 'activation': 'logistic', 'solver': 'lbfgs'}
Results for DecisionTreeClassifier:
Accuracy = 0.7222
{'min_samples_split': 4, 'criterion': 'entropy', 'max_depth': 9, 'class_weight': 'balanced'}
Results for GaussianProcessClassifier: