In [123]:
import numpy as np
import pandas as pd
import sklearn as sk
import convertDictionaries as cd

#import classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupShuffleSplit
from sklearn.svm import SVC

#import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, recall_score, precision_score

In [124]:
data = pd.read_csv("./finalfinalDataset.csv")

In [125]:
y = pd.read_csv('./output_tidy.csv').drop('Unnamed: 0', axis=1)

In [126]:
data.columns

Index(['Unnamed: 0', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       'wnioskowania', 'wolicjonalny', 'wolitywny', 'zdarzeniowy', 'o1+',
       'o1-', 'o1o', 'o2+', 'o2-', 'o2o'],
      dtype='object', length=325)

## Create test and train indices

In [127]:
train_inds, test_inds = next(GroupShuffleSplit(test_size=.2, n_splits=2, random_state=44).split(data, groups=data['verb']))

In [128]:
data = data.drop(["index", 'Unnamed: 0', 'verb'], axis=1)

In [129]:
y_onehot = data.iloc[:, -6:]

In [130]:
a1 = y_onehot.iloc[:,:3].values.argmax(1).astype(str)
a2 = y_onehot.iloc[:,3:].values.argmax(1).astype(str)

In [131]:
labels = np.core.defchararray.add(a1, a2)

In [132]:
X = data.iloc[:, :-6]

In [133]:
X_train = X.iloc[train_inds] 
X_test = X.iloc[test_inds] 
y_train = y_onehot.iloc[train_inds]  
y_test = y_onehot.iloc[test_inds] 
l_train = labels[train_inds]  
l_test = labels[test_inds] 

## Create estimators

In [139]:
estimators = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(), 
    MLPClassifier(),
    
    #te rzucaja bledem
    AdaBoostClassifier(),
    GaussianProcessClassifier(),
    GradientBoostingClassifier(),
    SVC(kernel="linear"),
    SVC(kernel="rbf")
]

## Checking estimators' performance

In [140]:
np.unique(l_train)

array(['00', '01', '02', '10', '11', '12', '20', '22'], dtype='<U42')

In [141]:
np.unique(l_test)

array(['00', '02', '12', '20', '22'], dtype='<U42')

In [148]:
def check_estimator_2(estimator):
    
    print('\n\n', str(estimator.__class__.__name__))
    
    # print(cross_val_score(estimator, X, y_onehot, cv=5))
    
    model = estimator.fit(X_train, l_train)
    model.score(X_test, l_test)
    
    prediction = model.predict(X_test)
    
    classNames = data.columns[-6:].to_numpy()
    precision = precision_score(l_test,prediction, average='weighted')
    recall = recall_score(l_test,prediction, average='weighted')
    f1 = f1_score(l_test,prediction, average='weighted')
    
    print("class names: {0}".format(classNames))
    print("precision: {0}".format(precision))
    print("recall: {0}".format(recall))
    print("F1: {0}".format(f1))
    

In [149]:
for estimator in estimators:
    check_estimator_2(estimator)



 KNeighborsClassifier
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.7480135873578496
recall: 0.7540983606557377
F1: 0.7468573186369559


 DecisionTreeClassifier
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.5127741111347669
recall: 0.48360655737704916
F1: 0.4953706776398752


 RandomForestClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.6631856872635561
recall: 0.7213114754098361
F1: 0.6688405147959032


 MLPClassifier


  _warn_prf(average, modifier, msg_start, len(result))


class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.8264993674123126
recall: 0.860655737704918
F1: 0.8394059412802202


 AdaBoostClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.4227746917761821
recall: 0.6475409836065574
F1: 0.5115573770491804


 GaussianProcessClassifier


  _warn_prf(average, modifier, msg_start, len(result))


class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.7663630843958713
recall: 0.7950819672131147
F1: 0.7801537709689279


 GradientBoostingClassifier
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.7786085920241832
recall: 0.7950819672131147
F1: 0.7674007868041934


 SVC
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.7909198369987868
recall: 0.819672131147541
F1: 0.8011506508507708


 SVC
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.8125753117556396
recall: 0.8442622950819673
F1: 0.8222116401183236


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [150]:
def check_estimator(estimator):
    
    print()
    print()
    print(str(estimator.__class__.__name__))
    
    # print(cross_val_score(estimator, X, y_onehot, cv=5))
    
    model = estimator.fit(X_train, y_train)
    model.score(X_test, y_test)
    
    prediction = model.predict(X_test)
    
    classNames = data.columns[-6:].to_numpy()
    precision = precision_score(y_test.to_numpy(),prediction, average="samples")
    recall = recall_score(y_test.to_numpy(),prediction, average="samples")
    f1 = f1_score(y_test.to_numpy(),prediction, average="samples")
    
    print("class names: {0}".format(classNames))
    print("precision: {0}".format(precision))
    print("recall: {0}".format(recall))
    print("F1: {0}".format(f1))
    

In [151]:
for estimator in estimators:
    check_estimator(estimator)



KNeighborsClassifier
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.8237704918032787
recall: 0.8237704918032787
F1: 0.8237704918032787


DecisionTreeClassifier
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.6065573770491803
recall: 0.6065573770491803
F1: 0.6065573770491803


RandomForestClassifier
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.8483606557377049
recall: 0.819672131147541
F1: 0.8292349726775957


MLPClassifier
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.8647540983606558
recall: 0.8442622950819673
F1: 0.8510928961748634


AdaBoostClassifier




ValueError: bad input shape (449, 6)

## Tuning parameters

In [13]:
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

param_grid_knn = [
    {"n_neighbors": [1, 2, 3, 4, 5, 10]}
]

param_grid_dec_tree = [
    {"criterion": ["gini", "entropy"], "splitter": ["best", "random"], "min_samples_split": [2, 4, 6], "max_depth": [None, 10, 20]}
]

scorers_array = {
    'precision_score': make_scorer(precision_score, average="samples"),
    'recall_score': make_scorer(recall_score, average="samples"),
}

search = GridSearchCV(KNeighborsClassifier(), param_grid_knn, scoring = scorers_array, return_train_score=True, n_jobs=-1, refit='precision_score')
search.fit(X_train, y_train)
print(search.best_params_)

search = GridSearchCV(DecisionTreeClassifier(), param_grid_dec_tree, scoring = scorers_array, return_train_score=True, n_jobs=-1, refit='precision_score')
search.fit(X_train, y_train)
print(search.best_params_)

{'n_neighbors': 1}
{'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 6, 'splitter': 'random'}
