In [1]:
# Basic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
 
# Sklearn modules & classes
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import metrics
from scipy.stats import norm
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report

In [2]:
kids_array = np.genfromtxt('kids_2022_cleaned.dat', delimiter='\t', dtype=None, encoding=None)
kids_df = pd.DataFrame(kids_array)
kids_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,ids,dl_pp_time,du_pp_time,fl_pp_time,fu_pp_time,dl_move_time,du_move_time,fl_move_time,fu_move_time,dl_num_of_shots,...,fl_wps,fu_wps,tol_preplan_time,tol_movement_time,tol_num_of_moves,tol_excess_moves,tol_accuracy,tol_efficient_accuracy,tol_weighted_performance.score,mathScores
1,GAMAR11006,8.09,7.31,5.4,6.56,37.1,29.81,5.01,55.13,5,...,0,0,4086.17,11004.75,63,16,8,5,12,48
2,GAMAR11002,15.09,4.7,13.59,13.77,16.2,66.93,17.67,19.67,2,...,0,0,4532.67,13706.58,71,24,5,1,5,12
3,GAMAR11015,5.48,2.3,2.8,9.84,14.12,3.53,5.21,39,4,...,2,0,3868,9133.67,55,8,10,8,26,43
4,GAMAR11010,1.53,13.97,1.46,14.66,16.28,129.39,7.23,39.93,3,...,0,4,1745.33,10493.75,62,15,10,5,16,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,MOBAU12014,5.8,18.62,23.93,14.65,18.98,49.08,20.69,31.66,4,...,0,0,3087,11982,59,12,9,6,25,
68,JOTON15007,5.75,19.91,3.9,10.78,23.37,26.34,19.67,83.89,5,...,3,0,2410.08,9837,56,9,10,8,26,
69,GARIC09019,7.82,8.42,2.65,12.33,4.53,42.65,4.28,87.17,2,...,0,0,2686.75,9439.67,64,17,10,7,24,29
70,,5.26,7.73,3.59,6.93,10.86,67,4.98,68.35,3,...,0,0,2122.17,10349.17,66,19,9,7,28,


In [3]:
# isolate vars: dl_acc, fl_acc, du_eff_acc, fu_eff_acc, tol_efficient_accuracy
kids_ea = kids_df[[21, 22, 23, 24, 34]]

# clean up data
kids_ea.rename(columns = {21: "dl_eff_acc", 22: "fl_eff_acc", 23: "du_eff_acc", 24: "fu_eff_acc", 34: "tol_eff_acc"}, inplace = True)
kids_ea = kids_ea.drop([0])

# cast to int
kids_ea["dl_eff_acc"], kids_ea["fl_eff_acc"], kids_ea["du_eff_acc"], kids_ea["fu_eff_acc"], kids_ea["tol_eff_acc"] = kids_ea.dl_eff_acc.astype(int), kids_ea.fl_eff_acc.astype(int), kids_ea.du_eff_acc.astype(int), kids_ea.fu_eff_acc.astype(int), kids_ea.tol_eff_acc.astype(int)

# create fbe_eff_acc var
kids_ea["fbe_eff_acc"] = kids_ea.dl_eff_acc.astype(int) + kids_ea.fl_eff_acc.astype(int) + kids_ea.du_eff_acc.astype(int) + kids_ea.fu_eff_acc.astype(int)

kids_ea

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,dl_eff_acc,fl_eff_acc,du_eff_acc,fu_eff_acc,tol_eff_acc,fbe_eff_acc
1,0,0,0,0,5,0
2,0,0,0,0,1,0
3,1,1,1,0,8,3
4,0,0,0,1,5,1
5,1,0,0,1,6,2
...,...,...,...,...,...,...
67,0,0,0,0,6,0
68,0,1,1,0,8,2
69,0,0,0,0,7,0
70,0,0,0,0,7,0


In [4]:
# X = (fbe_eff_acc)
X = kids_ea[["fbe_eff_acc"]].values.tolist()

# dichotomous classification var: y = (0: low planning, 1: high planning)
y = []
mean = kids_ea["tol_eff_acc"].mean()
target = kids_ea["tol_eff_acc"]
for val in target:
    if (val <= mean):
        y.append(0)
    else:
        y.append(1)

X = np.array(X)
y = np.array(y)

# split into training & testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

y_train = y_train.reshape(49, 1)
y_test = y_test.reshape(22, 1)

In [5]:
# GridSearch cross validation

# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear', 'rbf']}  
   
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, n_jobs=-1) 
   
# fitting the model for grid search
grid.fit(X_train, y_train.ravel())

# print best parameter after tuning 
print(grid.best_params_) 
grid_predictions = grid.predict(X_test) 
   
# print classification report 
print(classification_report(y_test, grid_predictions)) 

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.500 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.500 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.600 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.400 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.556 total time=   0.0s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.500 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.500 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.600 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.600 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.556 total time=   0.0s
[CV 1/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.500 total time=   0.0s
[CV 2/5] END ...C=0.1, gamma=0.1, kernel=linear

In [6]:
# K-folds cross validation
# X_train = X[x_split[0]], X_test = X[x_split[1]], y_train = y[y_split[0]], y_test = y[y_split[1]]
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [7]:
# create k=10 models, train and test on k folds
avg_acc = 0
for x_split, y_split in zip(kfold.split(X), kfold.split(y)):
    # instantiate SVC w/ rbf kernel
    linear_svc = SVC(kernel = 'linear', C=1)

    # fit model on X_train and y_train
    linear_svc.fit(X[x_split[0]], y[y_split[0]])

    # make predictions on X_test
    linear_y_pred = linear_svc.predict(X[x_split[1]])

    # measure performance on y_test and y_pred
    print("Accuracy score %.2f" %metrics.accuracy_score(y[y_split[1]], linear_y_pred))
    avg_acc += metrics.accuracy_score(y[y_split[1]], linear_y_pred)

print("Average accuracy: ", round(avg_acc/10, 2))

Accuracy score 0.88
Accuracy score 0.86
Accuracy score 0.71
Accuracy score 0.86
Accuracy score 0.57
Accuracy score 0.57
Accuracy score 0.71
Accuracy score 0.71
Accuracy score 0.29
Accuracy score 0.43
Average accuracy:  0.66


In [9]:
# create k=10 models, train and test on k folds
avg_acc = 0
for x_split, y_split in zip(kfold.split(X), kfold.split(y)):
    # instantiate SVC w/ rbf kernel
    rbf_svc = SVC(kernel = 'rbf', C=0.1, gamma=1)

    # fit model on X_train and y_train
    rbf_svc.fit(X[x_split[0]], y[y_split[0]])

    # make predictions on X_test
    rbf_y_pred = rbf_svc.predict(X[x_split[1]])

    # measure performance on y_test and y_pred
    print("Accuracy score %.2f" %metrics.accuracy_score(y[y_split[1]], rbf_y_pred))
    avg_acc += metrics.accuracy_score(y[y_split[1]], rbf_y_pred)

print("Average accuracy: ", round(avg_acc/10, 2))

Accuracy score 0.62
Accuracy score 0.43
Accuracy score 0.43
Accuracy score 0.86
Accuracy score 0.57
Accuracy score 0.57
Accuracy score 0.71
Accuracy score 0.29
Accuracy score 0.29
Accuracy score 0.43
Average accuracy:  0.52


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b22c459f-a4e2-4762-abe1-419ec96c9360' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>