In [8]:
import numpy as np
import pandas as pd
import sklearn as sk

from sklearn.model_selection import GridSearchCV

#import classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

#import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score

# Load data

In [18]:
output_columns = ["o1+", "o1-", "o1o", "o2+", "o2-", "o2o"]
invalid_columns = ["index", "verb"]
final_dataset = pd.read_csv("finalfinalDataset.csv")

In [23]:
input_data = final_dataset[final_dataset.columns[~final_dataset.columns.isin(output_columns + invalid_columns)]]

In [24]:
input_data.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,mówienia,nie_wiedzowy,odkrycia,pamięciowy,percepcyjny,przyczynowy,wnioskowania,wolicjonalny,wolitywny,zdarzeniowy
0,0,0.34724,-0.33117,-0.1759,-0.065857,0.18662,-0.036958,-0.19162,0.062237,-0.12319,...,1,0,0,0,0,0,1,0,0,0
1,1,0.34724,-0.33117,-0.1759,-0.065857,0.18662,-0.036958,-0.19162,0.062237,-0.12319,...,1,0,0,0,0,0,1,0,0,0
2,2,0.26168,-0.71447,-0.28251,-0.35431,0.022618,-0.096973,-0.62558,0.26868,-0.29906,...,1,0,0,0,0,0,1,0,0,0
3,3,0.26168,-0.71447,-0.28251,-0.35431,0.022618,-0.096973,-0.62558,0.26868,-0.29906,...,1,0,0,0,0,0,1,0,0,0
4,4,0.214458,0.148894,-0.367293,-1.20748,0.44667,0.53854,-0.320619,0.12364,0.203019,...,0,0,0,0,0,0,0,1,0,0


In [26]:
output_data = final_dataset[output_columns]

In [27]:
output_data.head()

Unnamed: 0,o1+,o1-,o1o,o2+,o2-,o2o
0,1,0,0,0,0,1
1,1,0,0,0,0,1
2,0,0,1,0,0,1
3,0,0,1,0,0,1
4,0,0,1,0,0,1


# Initialize search parameters

In [28]:
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
]

# Initialize classifiers

In [29]:
estimator_array = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    GradientBoostingClassifier()
]

In [30]:
def checkEstimator(estimator, inData, outData, cvVal = 5):
    
    estimator.fit(inData, outData)
    predictedData = estimator.predict(inData)
    
    AS = accuracy_score(outData, predictedData)
    CV = cross_val_score(estimator, inData, outData, cv = cvVal) 
    BAS = balanced_accuracy_score(outData, predictedData)
    F1 = f1_score(outData, predictedData, average='macro')
       
    
    print(str(estimator.__class__.__name__))
    print('accuracy score - ' + str(AS))
    print('cross validation score - ' + str(CV))
    print('balanced accuracy score - ' + str(BAS))
    print('F1 score - ' + str(F1))
    print('')

In [31]:
checkEstimator(KNeighborsClassifier(3), input_data, output_data)

ValueError: multilabel-indicator is not supported

# Perform search

In [35]:
scoring_array = {
    make_scorer(accuracy_score)
}

In [36]:
X = input_data
y = output_data

for estimator in estimator_array:
    
    search = GridSearchCV(estimator, param_grid, scoring = scoring_array, n_jobs = -1)
    search.fit(X, y)

ValueError: The list/tuple elements must be unique strings of predefined scorers. One or more of the elements were callables. Use a dict of score name mapped to the scorer callable. Got '{make_scorer(accuracy_score)}'