In [3]:
import numpy as np
import pandas as pd
import sklearn as sk

#import classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupShuffleSplit
from sklearn.svm import SVC

#import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, recall_score, precision_score

# Wczytanie danych

In [4]:
data = pd.read_csv("./finalfinalDataset.csv")

In [5]:
y = pd.read_csv('./output_tidy.csv').drop('Unnamed: 0', axis=1)

In [6]:
data.columns

Index(['Unnamed: 0', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       'wnioskowania', 'wolicjonalny', 'wolitywny', 'zdarzeniowy', 'o1+',
       'o1-', 'o1o', 'o2+', 'o2-', 'o2o'],
      dtype='object', length=325)

## Podział zbioru na dane testowe i treningowe

In [7]:
train_inds, test_inds = next(GroupShuffleSplit(test_size=.2, n_splits=2, random_state=44).split(data, groups=data['verb']))

In [8]:
data = data.drop(["index", 'Unnamed: 0', 'verb'], axis=1)

In [9]:
y_onehot = data.iloc[:, -6:]

In [10]:
a1 = y_onehot.iloc[:,:3].values.argmax(1).astype(str)
a2 = y_onehot.iloc[:,3:].values.argmax(1).astype(str)

In [11]:
labels = np.core.defchararray.add(a1, a2)

In [12]:
X = data.iloc[:, :-6]

In [13]:
X_train = X.iloc[train_inds] 
X_test = X.iloc[test_inds] 
y_train = y_onehot.iloc[train_inds]  
y_test = y_onehot.iloc[test_inds] 
l_train = labels[train_inds]  
l_test = labels[test_inds] 

## Definicja zakresu poszukiwań parametrów klasyfikatorów

In [14]:
estimators_parameters = {
    
    KNeighborsClassifier() : [{
        "n_neighbors": np.arange(2, 10)
    }],
    
    RandomForestClassifier() : [{
        "n_estimators" : range(100, 1000, 10),
        "criterion" : ["gini", "entropy"],
        "max_features" : ["auto", "sqrt", "log2"]
    }],
   
    MLPClassifier() : [{
        "activation" : ["identity", "logistic", "tanh", "relu"],
        "solver" : ["lbfgs", "sgd", "adam"],
        "alpha" : [0.0001, 0.0005, 0.00001, 0.001],
        "learning_rate" : ["constant", "invscaling", "adaptive"]
    }],
    
    AdaBoostClassifier() : [{
        "n_estimators": np.arange(100,1000,10)
    }],
    
    GaussianProcessClassifier() : [{
        
    }],
    
     GradientBoostingClassifier() : [{        
        "n_estimators": np.arange(100,1000,10),
        "max_depth": np.arange(3,10)
    }],
    
     SVC() : [{
        "C": np.linspace(0.01,10,10),
        "gamma": np.linspace(1/2/316, 2/316, 10)
    }]
 }

estimators = estimators_parameters.keys()

## Test wydajności klasyfikatorów

In [15]:
np.unique(l_train)

array(['00', '01', '02', '10', '11', '12', '20', '22'], dtype='<U42')

In [16]:
np.unique(l_test)

array(['00', '02', '12', '20', '22'], dtype='<U42')

In [54]:
scorers_array = {
    'f1_score': make_scorer(f1_score, average="weighted")
}

for estimator in estimators:
    print(str(estimator.__class__.__name__))    
    
    search = GridSearchCV(estimator, estimators_parameters[estimator], scoring = scorers_array, return_train_score=True, n_jobs=-1, refit='f1_score')
    search.fit(X_train, l_train)
    
    print("Best search parameters")
    print(search.best_params_)
    print(search.best_score_)
    
    prediction = search.predict(X_test)
    
    classNames = data.columns[-6:].to_numpy()
    precision = precision_score(l_test, prediction, average="weighted")
    recall = recall_score(l_test, prediction, average="weighted")
    f1 = f1_score(l_test, prediction, average="weighted")
    
    print()
    
    print("Class names: {0}".format(classNames))
    print("Precision: {0}".format(precision))
    print("Recall: {0}".format(recall))
    print("F1: {0}".format(f1))
    
    print()
    print()
    print("-----------------------------------------------")
    print()

KNeighborsClassifier




Best search parameters
{'n_neighbors': 3}
0.7201670409913806

Class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
Precision: 0.7755544840887174
Recall: 0.7950819672131147
F1: 0.7817564948712489


-----------------------------------------------

RandomForestClassifier


  _warn_prf(average, modifier, msg_start, len(result))


Best search parameters
{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 110}
0.7013757733282977

Class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
Precision: 0.5990349586661062
Recall: 0.6967213114754098
F1: 0.6319223269882839


-----------------------------------------------

MLPClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best search parameters
{'activation': 'relu', 'alpha': 0.001, 'learning_rate': 'invscaling', 'solver': 'adam'}
0.8227762078745607

Class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
Precision: 0.8264993674123126
Recall: 0.860655737704918
F1: 0.8394059412802202


-----------------------------------------------

AdaBoostClassifier




Best search parameters
{'n_estimators': 100}
0.5034538937052059

Class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
Precision: 0.4227746917761821
Recall: 0.6475409836065574
F1: 0.5115573770491804


-----------------------------------------------

GaussianProcessClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best search parameters
{}
0.7457048570551408


  _warn_prf(average, modifier, msg_start, len(result))



Class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
Precision: 0.7663630843958713
Recall: 0.7950819672131147
F1: 0.7801537709689279


-----------------------------------------------

GradientBoostingClassifier




Best search parameters
{'max_depth': 8, 'n_estimators': 830}
0.7632115419930482

Class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
Precision: 0.7471489665003563
Recall: 0.7950819672131147
F1: 0.7568306010928961


-----------------------------------------------

SVC


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best search parameters
{'C': 7.78, 'gamma': 0.006329113924050633}
0.8044078804158625

Class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
Precision: 0.810566906242429
Recall: 0.8442622950819673
F1: 0.8232637192428038


-----------------------------------------------



  _warn_prf(average, modifier, msg_start, len(result))
