In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import convertDictionaries as cd

#import classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

#import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score

# Check different classificators

Load data

In [2]:
data = pd.read_excel("LingFeatured NLI_PL_20.03.2020.xlsx")
data.head()

Unnamed: 0,id - pair,Deepl,czy była korekta translatora,T ENG,T PL,"GOLD <T,H>",T - type of sentence,text - number,verb - main semantic class,verb - second semantic class,...,presupposition,ewaluacja wynikania ES,ewaluacja wynikania ŁW,"ES <T1, H>","ŁW <T1, H>",anotator_ES,annotator_ŁW,annotator_JB,annotator_SF,Unnamed: 32
0,1203,"I said, ""I have a test, I'm gonna fail biology...",1,To which I said that I was to write a class te...,"Ja na to, że mam klasówkę, że zawalę biologię ...",N,eliptyczne,2531,mówienia,,...,nie dotyczy,,,,,N,E,?,N,
1,1671,Gliński did not have to turn his head to guess...,1,Gliński did not have to turn his head to guess...,Ani że na progu stanął właśnie Romanyczko.,?,eliptyczne,2978,epistemiczny,,...,no,E,E,E,E,,,,,
2,2441,Then he'll bring the Chinese back.,1,"Earlier Ukrainians were said to come and work,...","Potem, że się przywiezie Chińczyków.",N,eliptyczne,3489,mówienia,,...,nie dotyczy,,,,,,,,,
3,945,He got weaker and weaker and felt that it was ...,0,He got weaker and weaker and felt that it was ...,"Słabł coraz bardziej, czuł, że to kwestia zale...",N,1,2119,epistemiczny,percepcyjny,...,nie dotyczy,,,,,,,,,
4,947,"However, people who are very seriously ill, wh...",0,"However, people who are very seriously ill, wh...","Jednak ludzie bardzo ciężko chorzy, gdy już cz...",N,1,2123,epistemiczny,percepcyjny,...,nie dotyczy,,,,,N,N,E,E,


In [3]:
data.columns

Index(['id - pair', 'Deepl', 'czy była korekta translatora', 'T ENG', 'T PL',
       'GOLD <T,H>', 'T - type of sentence', 'text - number',
       'verb - main semantic class', 'verb - second semantic class',
       'verb - third semantic class', 'verb id', 'GOLD <T1,H>', 'WN', 'verb',
       'verb - veridical (positive enviroment)',
       'verb - veridical (negative enviroment)', 'T1', 'H', 'verb - tense',
       'realizacja predykatów', 'T - negation', 'complement - tense',
       'presupposition', 'ewaluacja wynikania ES', 'ewaluacja wynikania ŁW',
       'ES <T1, H>', 'ŁW <T1, H>', 'anotator_ES', 'annotator_ŁW',
       'annotator_JB', 'annotator_SF', 'Unnamed: 32'],
      dtype='object')

In [4]:
dataUnique = data[['verb - main semantic class', 'verb - second semantic class',
       'verb - third semantic class', 'verb - tense','verb','verb - veridical (positive enviroment)',
                             'verb - veridical (negative enviroment)']].drop_duplicates()

verbAtt = dataUnique[['verb - main semantic class', 'verb - second semantic class',
       'verb - third semantic class', 'verb - tense']]

verbSig = dataUnique[['verb - veridical (positive enviroment)', 'verb - veridical (negative enviroment)']]

Define estimators

In [5]:
estimators = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), 
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    GradientBoostingClassifier()
]

In [6]:
def checkEstimator(estimator, inData, outData, cvVal = 5):
    
    estimator.fit(inData, outData)
    predictedData = estimator.predict(inData)
    
    AS = accuracy_score(outData, predictedData)
    CV = cross_val_score(estimator, inData, outData, cv = cvVal) 
    BAS = balanced_accuracy_score(outData, predictedData)
    F1 = f1_score(outData, predictedData, average='macro')
       
    
    print(str(estimator.__class__.__name__))
    print('accuracy score - ' + str(AS))
    print('cross validation score - ' + str(CV))
    print('balanced accuracy score - ' + str(BAS))
    print('F1 score - ' + str(F1))
    print('')

Represent as simple enums

In [7]:
verbAttEnum = verbAtt.copy()

verbAttEnum['verb - main semantic class'] = verbAttEnum['verb - main semantic class'].apply(cd.convertData, args=[cd.semanticClass])
verbAttEnum['verb - second semantic class'] = verbAttEnum['verb - second semantic class'].apply(cd.convertData, args=[cd.semanticClass])
verbAttEnum['verb - third semantic class'] = verbAttEnum['verb - third semantic class'].apply(cd.convertData, args=[cd.semanticClass])
verbAttEnum['verb - tense'] = verbAttEnum['verb - tense'].apply(cd.convertData, args=[cd.verbTense])

verbSigEnum = verbSig.copy()

verbSigEnum['verb - veridical (positive enviroment)'] = verbSigEnum['verb - veridical (positive enviroment)'].apply(cd.convertData,args=[cd.verbVeridical])
verbSigEnum['verb - veridical (negative enviroment)'] = verbSigEnum['verb - veridical (negative enviroment)'].apply(cd.convertData,args=[cd.verbVeridical])

In [8]:
for est in estimators:
    checkEstimator(est, verbAttEnum, verbSigEnum['verb - veridical (positive enviroment)'])



KNeighborsClassifier
accuracy score - 0.7727272727272727
cross validation score - [0.72580645 0.71544715 0.68292683 0.73170732 0.69105691]
balanced accuracy score - 0.4932748953644476
F1 score - 0.48975286132487844

DecisionTreeClassifier
accuracy score - 0.7613636363636364
cross validation score - [0.71774194 0.77235772 0.72357724 0.76422764 0.67479675]
balanced accuracy score - 0.34868988391376454
F1 score - 0.3750329392802274

RandomForestClassifier
accuracy score - 0.7775974025974026
cross validation score - [0.70967742 0.76422764 0.65853659 0.76422764 0.6097561 ]
balanced accuracy score - 0.3368309247413725
F1 score - 0.3558836405924001





MLPClassifier
accuracy score - 0.6655844155844156
cross validation score - [0.69354839 0.63414634 0.66666667 0.7398374  0.62601626]
balanced accuracy score - 0.24535260206902
F1 score - 0.23761145231078418





AdaBoostClassifier
accuracy score - 0.6737012987012987
cross validation score - [0.66935484 0.60162602 0.66666667 0.64227642 0.57723577]
balanced accuracy score - 0.2893042722893469
F1 score - 0.2948771008745596





GaussianProcessClassifier
accuracy score - 0.7987012987012987
cross validation score - [0.72580645 0.7804878  0.70731707 0.77235772 0.70731707]
balanced accuracy score - 0.3180920792861091
F1 score - 0.3180460956312349





GradientBoostingClassifier
accuracy score - 0.8003246753246753
cross validation score - [0.70967742 0.78861789 0.71544715 0.78861789 0.69918699]
balanced accuracy score - 0.3939856274184632
F1 score - 0.43962035274005923



In [9]:
for est in estimators:
    checkEstimator(est, verbAttEnum, verbSigEnum['verb - veridical (negative enviroment)'])

KNeighborsClassifier
accuracy score - 0.7207792207792207
cross validation score - [0.49193548 0.67479675 0.68292683 0.69918699 0.69918699]
balanced accuracy score - 0.5588086280991242
F1 score - 0.471490952731559

DecisionTreeClassifier
accuracy score - 0.7386363636363636
cross validation score - [0.60483871 0.73170732 0.7398374  0.69918699 0.65853659]
balanced accuracy score - 0.4384024291154608
F1 score - 0.40271133288086636

RandomForestClassifier
accuracy score - 0.7435064935064936
cross validation score - [0.66935484 0.72357724 0.66666667 0.77235772 0.63414634]
balanced accuracy score - 0.30662986227440403
F1 score - 0.3214440260900608

MLPClassifier
accuracy score - 0.6558441558441559
cross validation score - [0.62096774 0.65853659 0.67479675 0.72357724 0.62601626]
balanced accuracy score - 0.21344260289890524
F1 score - 0.18409638554216867

AdaBoostClassifier
accuracy score - 0.6396103896103896
cross validation score - [0.63709677 0.63414634 0.64227642 0.64227642 0.64227642]
bal