In [2]:
import numpy as np
import pandas as pd
import sklearn as sk
import convertDictionaries as cd

#import classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupShuffleSplit
from sklearn.svm import SVC

#import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, recall_score, precision_score

In [3]:
data = pd.read_csv("./finalfinalDataset.csv")

In [4]:
y = pd.read_csv('./output_tidy.csv').drop('Unnamed: 0', axis=1)

In [5]:
data.columns

Index(['Unnamed: 0', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       'wnioskowania', 'wolicjonalny', 'wolitywny', 'zdarzeniowy', 'o1+',
       'o1-', 'o1o', 'o2+', 'o2-', 'o2o'],
      dtype='object', length=325)

## Create test and train indices

In [6]:
train_inds, test_inds = next(GroupShuffleSplit(test_size=.20, n_splits=2, random_state=42).split(data, groups=data['verb']))

In [7]:
data = data.drop(["index", 'Unnamed: 0', 'verb'], axis=1)

In [8]:
y_onehot = data.iloc[:, -6:]

In [9]:
X = data.iloc[:, :-6]

In [10]:
X_train = X.iloc[train_inds] 
X_test = X.iloc[test_inds] 
y_train = y_onehot.iloc[train_inds]  
y_test = y_onehot.iloc[test_inds] 

## Create estimators

In [11]:
estimators = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(), 
    MLPClassifier(),
    
    #te rzucaja bledem
    #AdaBoostClassifier(),
    #GaussianProcessClassifier(),
    #GradientBoostingClassifier(),
    #SVC(kernel="linear"),
    #SVC(kernel="rbf")
]

## Checking estimators' performance

In [12]:
def check_estimator(estimator):
    
    print()
    print()
    print(str(estimator.__class__.__name__))
    
    #cross_val_score(estimator, X, y_onehot, cv=5)
    
    model = estimator.fit(X_train, y_train)
    model.score(X_test, y_test)
    
    prediction = model.predict(X_test)
    
    classNames = data.columns[-6:].to_numpy()
    precision = precision_score(y_test.to_numpy(),prediction, average="samples")
    recall = recall_score(y_test.to_numpy(),prediction, average="samples")
    f1 = f1_score(y_test.to_numpy(),prediction, average="samples")
    
    print("class names: {0}".format(classNames))
    print("precision: {0}".format(precision))
    print("recall: {0}".format(recall))
    print("F1: {0}".format(f1))
    

In [13]:
for estimator in estimators:
    check_estimator(estimator)
    
    
    



KNeighborsClassifier
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.7435897435897436
recall: 0.7393162393162394
F1: 0.7407407407407407


DecisionTreeClassifier
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.7222222222222222
recall: 0.7222222222222222
F1: 0.7222222222222222


RandomForestClassifier
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.7735042735042735
recall: 0.7606837606837606
F1: 0.7649572649572649


MLPClassifier
class names: ['o1+' 'o1-' 'o1o' 'o2+' 'o2-' 'o2o']
precision: 0.8205128205128205
recall: 0.8034188034188035
F1: 0.809116809116809




## Tuning parameters

In [33]:
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

param_grid_knn = [
    {"n_neighbors": [1, 2, 3, 4, 5, 10]}
]

param_grid_dec_tree = [
    {"criterion": ["gini", "entropy"], "splitter": ["best", "random"], "min_samples_split": [2, 4, 6], "max_depth": [None, 10, 20]}
]

scorers_array = {
    'precision_score': make_scorer(precision_score, average="samples"),
    'recall_score': make_scorer(recall_score, average="samples"),
}

search = GridSearchCV(KNeighborsClassifier(), param_grid_knn, scoring = scorers_array, return_train_score=True, n_jobs=-1, refit='precision_score')
search.fit(X_train, y_train)
print(search.best_params_)

search = GridSearchCV(DecisionTreeClassifier(), param_grid_dec_tree, scoring = scorers_array, return_train_score=True, n_jobs=-1, refit='precision_score')
search.fit(X_train, y_train)
print(search.best_params_)

{'n_neighbors': 1}
{'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 2, 'splitter': 'random'}


GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid=...n_jobs=-1,
             param_grid=[{'criterion': ['gini', 'entropy'],
                          'm