In [42]:
import numpy as np
import pandas as pd
import sklearn as sk
import convertDictionaries as cd

#import classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupShuffleSplit
from sklearn.svm import SVC

#import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, recall_score, precision_score

In [43]:
# Load the full table; later cells split it into feature columns and the
# trailing six label columns.
data = pd.read_csv(filepath_or_buffer="./finalfinalDataset.csv")

In [44]:
# Read the tidy output table and discard its 'Unnamed: 0' column
# (presumably a saved row index — confirm).
# NOTE(review): `y` is not referenced by any later cell visible here.
y = pd.read_csv(filepath_or_buffer='./output_tidy.csv').drop(columns='Unnamed: 0')

In [45]:
# Display the column index to check which columns were loaded.
data.columns

Index(['Unnamed: 0', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       'wnioskowania', 'wolicjonalny', 'wolitywny', 'zdarzeniowy', 'o1+',
       'o1-', 'o1o', 'o2+', 'o2-', 'o2o'],
      dtype='object', length=325)

## Create test and train indices

In [46]:
# Group-aware hold-out: rows sharing a 'verb' value always land on the same
# side of the split, and test_size=.2 is the share of groups held out.
# Two splits are configured, but only the first one is consumed by next().
group_splitter = GroupShuffleSplit(n_splits=2, test_size=.2, random_state=44)
train_inds, test_inds = next(group_splitter.split(data, groups=data['verb']))

In [47]:
# Drop the three non-feature columns; the grouping column 'verb' is no longer
# available after this cell.
# NOTE: `data` is rebound here, so re-running this cell without reloading
# raises KeyError because the columns are already gone.
data = data.drop(columns=["index", 'Unnamed: 0', 'verb'])

In [48]:
# The six trailing columns are the targets; select them by name.
label_columns = data.columns[-6:]
y_onehot = data.loc[:, label_columns]

In [49]:
# For each row, take the position of the largest entry within the first three
# label columns (a1) and within the last three (a2), rendered as strings so the
# two can be concatenated into one combined label.
onehot_values = y_onehot.values
a1 = np.argmax(onehot_values[:, :3], axis=1).astype(str)
a2 = np.argmax(onehot_values[:, 3:], axis=1).astype(str)

In [50]:
# Element-wise string concatenation of the two per-row indices, giving one
# two-character class label per row (e.g. '02').
# np.char is the public alias of the defchararray module; going through
# np.core is deprecated since NumPy 2.0.
labels = np.char.add(a1, a2)

In [51]:
# Everything except the six trailing label columns is used as model input.
feature_columns = data.columns[:-6]
X = data.loc[:, feature_columns]

In [52]:
# Apply the positional train/test indices to the features, the one-hot
# targets and the combined string labels alike, so all three stay aligned.
X_train, X_test = X.iloc[train_inds], X.iloc[test_inds]
y_train, y_test = y_onehot.iloc[train_inds], y_onehot.iloc[test_inds]
l_train, l_test = labels[train_inds], labels[test_inds]

## Create estimators

In [53]:
# Maps each candidate estimator instance to the list of parameter grids that
# GridSearchCV explores for it. The instances themselves are the dict keys, so
# iterating over the dict yields the estimators to evaluate.
estimators_parameters = {
    # Decision tree search is currently disabled.
#     DecisionTreeClassifier() : [{
#         "criterion": ["gini", "entropy"], 
#         "splitter": ["best", "random"],
#         "min_samples_split": [2, 4, 6],
#         "max_depth": [None, 10, 20]
#     }],

    KNeighborsClassifier() : [{
        "n_neighbors": np.arange(2, 10)
    }],

    RandomForestClassifier() : [{
        "n_estimators" : range(100, 1000, 10),
        "criterion" : ["gini", "entropy"],
        # "auto" is intentionally not listed: for classifiers it was an alias
        # of "sqrt" (so it only duplicated work) and it has been removed from
        # recent scikit-learn releases, where it makes the search fail.
        "max_features" : ["sqrt", "log2"]
    }],

    MLPClassifier() : [{
        "activation" : ["identity", "logistic", "tanh", "relu"],
        "solver" : ["lbfgs", "sgd", "adam"],
        "alpha" : [0.0001, 0.0005, 0.00001, 0.001],
        "learning_rate" : ["constant", "invscaling", "adaptive"]
    }],

    AdaBoostClassifier() : [{
        "n_estimators": np.arange(100,1000,10)
    }],

    # Empty grid: the estimator is evaluated once with its default settings.
    GaussianProcessClassifier() : [{

    }],

    GradientBoostingClassifier() : [{
        "n_estimators": np.arange(100,1000,10),
        "max_depth": np.arange(3,10)
    }],

    SVC() : [{
        "C": np.linspace(0.01,10,10),
        # gamma ranges from 0.5/316 to 2/316; 316 presumably equals the number
        # of feature columns (325 loaded - 3 dropped - 6 labels) — confirm.
        "gamma": np.linspace(1/2/316, 2/316, 10)
    }]
}

estimators = estimators_parameters.keys()

## Checking estimators' performance with parameter tuning

In [54]:
# List the distinct combined labels that occur in the training split.
np.unique(l_train)

array(['00', '01', '02', '10', '11', '12', '20', '22'], dtype='<U42')

In [55]:
# Same check for the held-out split; comparing it with the training labels
# shows which classes have no test examples at all.
np.unique(l_test)

array(['00', '02', '12', '20', '22'], dtype='<U42')

In [56]:
# Model selection uses weighted F1 only. zero_division=0 keeps the score that
# the default would produce for classes that are never predicted, but without
# emitting an UndefinedMetricWarning for every fold.
scorers_array = {
    'f1_score': make_scorer(f1_score, average="weighted", zero_division=0)
}

# Names of the six label columns; identical for every estimator, so computed
# once instead of on every loop iteration.
classNames = data.columns[-6:].to_numpy()

# For each candidate: tune on the training split with cross-validated grid
# search, then report weighted precision/recall/F1 on the held-out split.
# NOTE(review): the train/test split was grouped by 'verb', but the default
# cross-validation used inside GridSearchCV is not group-aware, so rows of one
# verb can appear in both the fitting and validation folds. The 'verb' column
# has already been dropped at this point; consider keeping it and passing
# groups together with a group-aware splitter.
for estimator in estimators:
    print(type(estimator).__name__)

    search = GridSearchCV(
        estimator,
        estimators_parameters[estimator],
        scoring=scorers_array,
        return_train_score=True,
        n_jobs=-1,
        # With a dict of scorers, refit names the one that selects best_params_.
        refit='f1_score',
    )
    search.fit(X_train, l_train)

    print("Best search parameters")
    print(search.best_params_)
    print(search.best_score_)

    prediction = search.predict(X_test)

    # Some training classes are absent from the test labels and some classes
    # may never be predicted; zero_division=0 scores those as 0 explicitly.
    precision = precision_score(l_test, prediction, average="weighted", zero_division=0)
    recall = recall_score(l_test, prediction, average="weighted", zero_division=0)
    f1 = f1_score(l_test, prediction, average="weighted", zero_division=0)

    print()

    print("Class names: {0}".format(classNames))
    print("Precision: {0}".format(precision))
    print("Recall: {0}".format(recall))
    print("F1: {0}".format(f1))

    print()
    print()
    print("-----------------------------------------------")
    print()

KNeighborsClassifier




Best search parameters
{'n_neighbors': 3}
0.7201670409913806

Class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
Precision: 0.7755544840887174
Recall: 0.7950819672131147
F1: 0.7817564948712489


RandomForestClassifier


  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 