In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import convertDictionaries as cd
from sklearn.model_selection import cross_val_score

#import classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Check representation of data

Load data

In [2]:
# Excel workbook holding the dataset; the relative path is resolved against
# the notebook's working directory.
dataPath = "LingFeatured NLI_PL_20.03.2020.xlsx"
data = pd.read_excel(io=dataPath)

# Preview the first five rows as a quick sanity check of the load.
data.head(5)

Unnamed: 0,id - pair,Deepl,czy była korekta translatora,T ENG,T PL,"GOLD <T,H>",T - type of sentence,text - number,verb - main semantic class,verb - second semantic class,...,presupposition,ewaluacja wynikania ES,ewaluacja wynikania ŁW,"ES <T1, H>","ŁW <T1, H>",anotator_ES,annotator_ŁW,annotator_JB,annotator_SF,Unnamed: 32
0,1203,"I said, ""I have a test, I'm gonna fail biology...",1,To which I said that I was to write a class te...,"Ja na to, że mam klasówkę, że zawalę biologię ...",N,eliptyczne,2531,mówienia,,...,nie dotyczy,,,,,N,E,?,N,
1,1671,Gliński did not have to turn his head to guess...,1,Gliński did not have to turn his head to guess...,Ani że na progu stanął właśnie Romanyczko.,?,eliptyczne,2978,epistemiczny,,...,no,E,E,E,E,,,,,
2,2441,Then he'll bring the Chinese back.,1,"Earlier Ukrainians were said to come and work,...","Potem, że się przywiezie Chińczyków.",N,eliptyczne,3489,mówienia,,...,nie dotyczy,,,,,,,,,
3,945,He got weaker and weaker and felt that it was ...,0,He got weaker and weaker and felt that it was ...,"Słabł coraz bardziej, czuł, że to kwestia zale...",N,1,2119,epistemiczny,percepcyjny,...,nie dotyczy,,,,,,,,,
4,947,"However, people who are very seriously ill, wh...",0,"However, people who are very seriously ill, wh...","Jednak ludzie bardzo ciężko chorzy, gdy już cz...",N,1,2123,epistemiczny,percepcyjny,...,nie dotyczy,,,,,N,N,E,E,


In [3]:
# Show every column label so feature and target columns can be selected by name.
data.columns

Index(['id - pair', 'Deepl', 'czy była korekta translatora', 'T ENG', 'T PL',
       'GOLD <T,H>', 'T - type of sentence', 'text - number',
       'verb - main semantic class', 'verb - second semantic class',
       'verb - third semantic class', 'verb id', 'GOLD <T1,H>', 'WN', 'verb',
       'verb - veridical (positive enviroment)',
       'verb - veridical (negative enviroment)', 'T1', 'H', 'verb - tense',
       'realizacja predykatów', 'T - negation', 'complement - tense',
       'presupposition', 'ewaluacja wynikania ES', 'ewaluacja wynikania ŁW',
       'ES <T1, H>', 'ŁW <T1, H>', 'anotator_ES', 'annotator_ŁW',
       'annotator_JB', 'annotator_SF', 'Unnamed: 32'],
      dtype='object')

In [4]:
# Feature columns: the verb's semantic classes and its tense.
verbAttColumns = [
    'verb - main semantic class',
    'verb - second semantic class',
    'verb - third semantic class',
    'verb - tense',
]

# Target columns: the verb's veridicality in positive and negative environments.
# The spelling "enviroment" matches the column headers of the loaded sheet.
verbSigColumns = [
    'verb - veridical (positive enviroment)',
    'verb - veridical (negative enviroment)',
]

verbAtt = data[verbAttColumns]
verbSig = data[verbSigColumns]

Define estimators

In [5]:
# Seed shared by every stochastic classifier below so that the reported scores
# are reproducible after "Restart Kernel -> Run All".
RANDOM_STATE = 42

# Classifiers compared in every experiment of this notebook.
# KNeighborsClassifier is deterministic and therefore takes no seed.
estimators = [
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE),
]

In [20]:
def checkEstimator(estimator, inData, outData, cvVal = 5):
    """Fit ``estimator`` and print how well it predicts ``outData`` from ``inData``.

    Two figures are printed after the estimator's class name:

    * "simple score" - the estimator's ``score`` on the same data it was just
      fitted on, i.e. a training-set score;
    * "cross validation score" - the value returned by ``cross_val_score``.

    Parameters
    ----------
    estimator : object exposing ``fit`` and ``score``; it is fitted in place.
    inData : features passed to ``fit``, ``score`` and ``cross_val_score``.
    outData : targets passed alongside ``inData``.
    cvVal : forwarded to ``cross_val_score`` as ``cv`` (default 5).

    Returns
    -------
    None - the results are only printed.
    """
    # Fit and evaluate on the same rows to obtain the training-set score.
    estimator.fit(inData, outData)
    trainingScore = estimator.score(inData, outData)

    foldScores = cross_val_score(estimator, inData, outData, cv=cvVal)

    # The trailing empty entry produces the blank line separating reports.
    reportLines = [
        estimator.__class__.__name__,
        f'simple score - {trainingScore!s}',
        f'cross validation score - {foldScores!s}',
        '',
    ]
    print('\n'.join(reportLines))

Represent as one-hot vectors

In [7]:
# One-hot encode the feature table ...
verbAttHotOnes = pd.get_dummies(data=verbAtt)

# ... and the target table.
verbSigHotOnes = pd.get_dummies(data=verbSig)

In [21]:
# Score every classifier on the one-hot features against both one-hot targets at once.
for est in estimators:
    checkEstimator(estimator=est, inData=verbAttHotOnes, outData=verbSigHotOnes)

KNeighborsClassifier
simple score - 0.762326656394453
cross validation score - [0.71730769 0.69749518 0.79576108 0.52986513 0.59344894]

DecisionTreeClassifier
simple score - 0.7272727272727273
cross validation score - [0.71153846 0.64354528 0.78420039 0.53371869 0.63391137]

RandomForestClassifier
simple score - 0.7014637904468413
cross validation score - [0.74230769 0.67052023 0.80346821 0.5433526  0.56840077]

MLPClassifier
simple score - 0.7661787365177196
cross validation score - [0.74615385 0.70327553 0.7938343  0.53564547 0.5973025 ]



Represent as simple enums

In [25]:
# Lookup object from convertDictionaries used for each feature column.
# NOTE(review): cd.convertData presumably maps a raw label to its code in the
# given lookup - confirm against convertDictionaries.
attCodebooks = {
    'verb - main semantic class': cd.semanticClass,
    'verb - second semantic class': cd.semanticClass,
    'verb - third semantic class': cd.semanticClass,
    'verb - tense': cd.verbTense,
}

# Work on a copy so the original selection stays untouched.
verbAttEnum = verbAtt.copy()
for columnName, codebook in attCodebooks.items():
    verbAttEnum[columnName] = verbAttEnum[columnName].apply(cd.convertData, args=[codebook])

# Both target columns share the same lookup.
sigColumns = [
    'verb - veridical (positive enviroment)',
    'verb - veridical (negative enviroment)',
]

verbSigEnum = verbSig.copy()
for columnName in sigColumns:
    verbSigEnum[columnName] = verbSigEnum[columnName].apply(cd.convertData, args=[cd.verbVeridical])

In [26]:
# Score every classifier on the enum-coded features, predicting veridicality
# in the positive environment only.
for est in estimators:
    checkEstimator(
        estimator=est,
        inData=verbAttEnum,
        outData=verbSigEnum['verb - veridical (positive enviroment)'],
    )



KNeighborsClassifier
simple score - 0.8151001540832049
cross validation score - [0.71153846 0.86319846 0.77649326 0.72254335 0.73025048]

DecisionTreeClassifier
simple score - 0.7966101694915254
cross validation score - [0.75769231 0.90751445 0.80346821 0.68208092 0.73410405]

RandomForestClassifier
simple score - 0.7969953775038521
cross validation score - [0.74423077 0.91522158 0.7283237  0.67052023 0.63776493]





MLPClassifier
simple score - 0.7796610169491526
cross validation score - [0.79038462 0.6628131  0.72639692 0.69364162 0.63198459]



In [27]:
# Score every classifier on the enum-coded features, predicting veridicality
# in the negative environment only.
for est in estimators:
    checkEstimator(
        estimator=est,
        inData=verbAttEnum,
        outData=verbSigEnum['verb - veridical (negative enviroment)'],
    )

KNeighborsClassifier
simple score - 0.7927580893682589
cross validation score - [0.73461538 0.73795761 0.7283237  0.72254335 0.71868979]

DecisionTreeClassifier
simple score - 0.7854391371340523
cross validation score - [0.73653846 0.74181118 0.76685934 0.73603083 0.73025048]

RandomForestClassifier
simple score - 0.7469183359013868
cross validation score - [0.76346154 0.75722543 0.73795761 0.75144509 0.61849711]

MLPClassifier
simple score - 0.7588597842835131
cross validation score - [0.73076923 0.72639692 0.70327553 0.761079   0.61464355]

