In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline
import pickle

#utils
from sklearn.model_selection import train_test_split
#models
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import joblib


#tuning and preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

#metrics
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

In [2]:
def prep_for_model(df):
    df = df.dropna()
    df = df.drop('screen_width', axis=1)
    df = df.drop('screen_height', axis=1)
    df = df.drop('id', axis=1)
    return df

def split_data(df,test_size=0.33):
    df = prep_for_model(df)
    X = df.iloc[:,0:64]
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size,random_state=42)
    return X_train, X_test, y_train, y_test

def train_test(df):
    X_train, X_test, y_train, y_test = split_data(df,test_size = 0.33)
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train, y_train)
    y_pred =  neigh.predict(X_test)

    print("precision: ",metrics.precision_score(y_test, y_pred,average = None))
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("recall:",metrics.recall_score(y_test, y_pred,average = None))


#X_train, X_test, y_train, y_test = split_data(df)


In [8]:
df_khaled_relabeled = pd.read_csv('C:/Users/KIMO/Desktop/left_hand_khaled_relabeled.csv',low_memory = False)
df_mahmoud = pd.read_csv('C:/Users/KIMO/Desktop/left_hand_zoir_relabeled.csv',low_memory = False)

df_both_relabeled = df_khaled_relabeled.append(df_mahmoud)
train_test(df_both_relabeled)

precision:  [0.84022111 0.59839605 0.76975287]
Accuracy: 0.7688518717535375
recall: [0.86833515 0.41810345 0.85551257]


In [10]:
def split_data_multi(df,finger,test_size=0.33):
    df = df.dropna()
    cols = [col for col in df.columns if finger in col]
    df = df[cols]
    
    X = df.iloc[:,0:12]
    y = df[finger + "_label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size,random_state=42)
    return X_train, X_test, y_train, y_test 
    return df

x_train_index,x_test_index,y_train_index,y_test_index = split_data_multi(df_both_relabeled,"index")
x_train_middle,x_test_middle,y_train_middle,y_test_middle = split_data_multi(df_both_relabeled,"middle")
x_train_ring,x_test_ring,y_train_ring,y_test_ring = split_data_multi(df_both_relabeled,"ring")
x_train_pinky,x_test_pinky,y_train_pinky,y_test_pinky = split_data_multi(df_both_relabeled,"pinky")

In [11]:
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestClassifier()
n_estimators = [5,20,50,100] # number of trees in the random forest
max_features = ['auto', 'sqrt'] # number of features in consideration at every split
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
min_samples_split = [2, 6, 10] # minimum sample number to split a node
min_samples_leaf = [1, 3, 4] # minimum sample number that can be stored in a leaf node
bootstrap = [True, False] # method used to sample data points

random_grid = {'n_estimators': n_estimators,

'max_features': max_features,

'max_depth': max_depth,

'min_samples_split': min_samples_split,

'min_samples_leaf': min_samples_leaf,

'bootstrap': bootstrap}


rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1)

rf_random.fit(x_train_index,y_train_index)


Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      120],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 3, 4],
                                        'min_samples_split': [2, 6, 10],
                                        'n_estimators': [5, 20, 50, 100]},
                   random_state=35, verbose=2)

In [16]:
#max_depth=[2, 8, 16,32,64,128, 256,512,1024]
#n_estimators = [64, 128, 256,512,1024]
max_depth=[32]
n_estimators = [256,512,1024,2048]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)
#best 32 and 1024 or 512 maybe 256
# Build the grid search
dfrst = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
grid = GridSearchCV(estimator=dfrst, param_grid=param_grid, cv = 5)
grid_results = grid.fit(x_train_index,y_train_index)

# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.cv_results_['mean_test_score'], grid_results.best_params_))
results_df = pd.DataFrame(grid_results.cv_results_)
results_df

Best: [0.9806352  0.98094398 0.98120865 0.9814292 ], using {'max_depth': 32, 'n_estimators': 2048}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,10.147315,0.26709,0.373943,0.025798,32,256,"{'max_depth': 32, 'n_estimators': 256}",0.984341,0.975959,0.981032,0.979047,0.982797,0.980635,0.002931,4
1,19.297345,0.164075,0.828071,0.04851,32,512,"{'max_depth': 32, 'n_estimators': 512}",0.983679,0.976401,0.982356,0.979488,0.982797,0.980944,0.002671,3
2,38.286,0.282129,1.634193,0.199172,32,1024,"{'max_depth': 32, 'n_estimators': 1024}",0.98412,0.976621,0.981694,0.979709,0.983899,0.981209,0.002802,2
3,77.589515,0.966686,3.521441,0.39415,32,2048,"{'max_depth': 32, 'n_estimators': 2048}",0.984561,0.976621,0.982576,0.979709,0.983679,0.981429,0.002907,1


In [17]:
# defining parameter range
param_grid = {'C':[0.0001,0.01,0.1,10,100,1000] ,
              'gamma': [ 0.0001,0.01,0.1,10,100,1000],
              'kernel': ['rbf','sigmoid']}
clf = svm.SVC()
grid = GridSearchCV(clf, param_grid, refit = True, verbose = 3)
 
# fitting the model for grid search
grid.fit(x_train_index,y_train_index)

# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END C=0.0001, gamma=0.0001, kernel=rbf;, score=0.506 total time=  22.1s
[CV 2/5] END C=0.0001, gamma=0.0001, kernel=rbf;, score=0.505 total time=  21.9s
[CV 3/5] END C=0.0001, gamma=0.0001, kernel=rbf;, score=0.505 total time=  21.4s
[CV 4/5] END C=0.0001, gamma=0.0001, kernel=rbf;, score=0.505 total time=  21.1s
[CV 5/5] END C=0.0001, gamma=0.0001, kernel=rbf;, score=0.505 total time=  22.7s
[CV 1/5] END C=0.0001, gamma=0.0001, kernel=sigmoid;, score=0.506 total time=  17.1s
[CV 2/5] END C=0.0001, gamma=0.0001, kernel=sigmoid;, score=0.505 total time=  17.4s
[CV 3/5] END C=0.0001, gamma=0.0001, kernel=sigmoid;, score=0.505 total time=  16.9s
[CV 4/5] END C=0.0001, gamma=0.0001, kernel=sigmoid;, score=0.505 total time=  15.5s
[CV 5/5] END C=0.0001, gamma=0.0001, kernel=sigmoid;, score=0.505 total time=  16.0s
[CV 1/5] END ..C=0.0001, gamma=0.01, kernel=rbf;, score=0.506 total time=  21.6s
[CV 2/5] END ..C=0.0001, ga