In [1]:
embeddings_type = 'elmo' #w2v or elmo

## Loading embeddings

In [2]:
import pickle

if embeddings_type == 'w2v':
    folder = 'embeddings/W2V/'
    with open(folder + "trainw2v.pickle", "rb") as pickle_in:
        train_data = pickle.load(pickle_in)
    with open(folder + "devw2v.pickle", "rb") as pickle_in:
        dev_data = pickle.load(pickle_in)
    with open(folder + "testw2v.pickle", "rb") as pickle_in:
        test_data = pickle.load(pickle_in)
    with open(folder + "test_manual_binw2v.pickle", "rb") as pickle_in:
        test_manual_bin_data = pickle.load(pickle_in)
    with open(folder + "test_manual_multiw2v.pickle", "rb") as pickle_in:
        test_manual_multi_data = pickle.load(pickle_in)
elif embeddings_type == 'elmo':
    folder = 'embeddings/ELMo/'
    with open(folder + "trainelmo.pickle", "rb") as pickle_in:
        train_data = pickle.load(pickle_in)
    with open(folder + "develmo.pickle", "rb") as pickle_in:
        dev_data = pickle.load(pickle_in)
    with open(folder + "testelmo.pickle", "rb") as pickle_in:
        test_data = pickle.load(pickle_in)
    with open(folder + "test_manual_binelmo.pickle", "rb") as pickle_in:
        test_manual_bin_data = pickle.load(pickle_in)
    with open(folder + "test_manual_multielmo.pickle", "rb") as pickle_in:
        test_manual_multi_data = pickle.load(pickle_in)

## Loading dataframes

In [3]:
import pandas as pd
import numpy as np

names = ["OBJECT A", "OBJECT B", "ASPECT", "MOST FREQUENT RATING", "SENTENCE"]
df_train = pd.read_csv("classification_fine_grained/train_clf_fine_grained.csv", header=None, names=names)
df_test = pd.read_csv("classification_fine_grained/test_clf_fine_grained.csv", header=None, names=names)
df_dev = pd.read_csv("classification_fine_grained/dev_clf_fine_grained.csv", header=None, names=names)
df_test_manual_bin = pd.read_csv("classification_binary/test_manual_clf_binary.csv", header=None, names=names)
df_test_manual_multi = pd.read_csv("classification_fine_grained/test_manual_clf_fine_grained.csv", header=None, names=names)

df_train.shape

(3871, 5)

### Dropping duplicate object-aspect combinations  
ONLY RUN IF YOU ARE NOT GOING TO USE SENTENCES

In [4]:
# def drop_dups(data, df):
#     df.drop('SENTENCE', axis=1, inplace=True)
#     df.drop_duplicates(inplace=True)
#     new_data = []
#     for i in range(len(df.index)):
#         new_data.append(data[df.index[i]])
#     return new_data

# train_data = drop_dups(train_data, df_train)
# dev_data = drop_dups(dev_data, df_dev)
# test_data = drop_dups(test_data, df_test)
# len(train_data)

In [5]:
print("len train " + str(len(train_data)))
print("len dev " + str(len(dev_data)))
print("len test " + str(len(test_data)))
print("len test manual bin " + str(len(test_manual_bin_data)))
print("len test manual multi " + str(len(test_manual_multi_data)))

len train 3871
len dev 461
len test 608
len test manual bin 736
len test manual multi 757


In [6]:
token_limit = 25

under_limit = 0
for i in range(len(train_data)):
    if len(train_data[i][8]) <= token_limit:
        under_limit += 1
under_limit /= len(train_data)
print('sentences under token limit(' + str(token_limit) + '): ' + str(under_limit))

sentences under token limit(25): 0.0


### Filling empty embeddings with zeros (in cases w2v does not contain the words mentioned)

In [7]:
def fill_empty(data):
    embedding_dim = 300
    new_data = []
    for i in range(len(data)):
        for j in range(5, 9):
            if len(data[i][j]) == 0:
                data[i][j].append(np.zeros(embedding_dim))
        new_data.append(data[i])
    return new_data

train_data = fill_empty(train_data)
dev_data = fill_empty(dev_data)
test_data = fill_empty(test_data)
test_manual_bin_data = fill_empty(test_manual_bin_data)
test_manual_multi_data = fill_empty(test_manual_multi_data)
print("len train " + str(len(train_data)))
print("len dev " + str(len(dev_data)))
print("len test " + str(len(test_data)))
print("len test manual bin " + str(len(test_manual_bin_data)))
print("len test manual multi " + str(len(test_manual_multi_data)))

len train 3871
len dev 461
len test 608
len test manual bin 736
len test manual multi 757


### Deleting samples with no embeddings (another way to treat empty embeddings)

In [8]:
# def delete_empty(data):
#     new_data = []
#     for i in range(len(data)):
#         ok = True
#         for j in range(5, 9):
#             if len(data[i][j]) == 0:
#                 ok = False
#                 break
#         if ok:
#             new_data.append(data[i])
#     return new_data

# train_data = delete_empty(train_data)
# dev_data = delete_empty(dev_data)
# test_data = delete_empty(test_data)
# test_manual_bin_data = delete_empty(test_manual_bin_data)
# test_manual_multi_data = delete_empty(test_manual_multi_data)
# print("len train " + str(len(train_data)))
# print("len dev " + str(len(dev_data)))
# print("len test " + str(len(test_data)))
# print("len test manual bin " + str(len(test_manual_bin_data)))
# print("len test manual multi " + str(len(test_manual_multi_data)))

### Shuffling the train data

In [9]:
import random

random.shuffle(train_data)

In [10]:
def get_output_for_binary(data):
    out = []
#     new_data = []
    for i in range(len(data)):
#         new_data.append(data[i])
#         if random.choice([True, False]):
#             new_data[i][5] = data[i][6]
#             new_data[i][6] = data[i][5]
#             out.append(1)
#         else:
#             out.append(0)
        if data[i][3] == 'BAD':
            out.append(0)
        else:
            out.append(1)
    return out

y_bin_train = get_output_for_binary(train_data)
y_bin_dev = get_output_for_binary(dev_data)
y_bin_test = get_output_for_binary(test_data)
y_bin_test_manual = get_output_for_binary(test_manual_bin_data)

In [11]:
print("Samples number:")
print("train:\t" + str(len(y_bin_train)))
print("dev:\t" + str(len(y_bin_dev)))
print("test:\t" + str(len(y_bin_test)))
print("test manual:\t" + str(len(y_bin_test_manual)))

Samples number:
train:	3871
dev:	461
test:	608
test manual:	736


In [12]:
def get_output_for_multi(data):
    out = []
#     new_data = []
    for i in range(len(data)):
#         new_data.append(data[i])
#         if random.choice([True, False]):
#             new_data[i][5] = data[i][6]
#             new_data[i][6] = data[i][5]
#             out.append(1)
#         else:
#             out.append(0)
        if data[i][3] == 'BAD':
            out.append(0)
        elif data[i][3] == 'ASPECT':
            out.append(1)
        elif data[i][3] == 'PREDICATE-FULL':
            out.append(2)
        elif data[i][3] == 'PREDICATE-DEPENDENT':
            out.append(3)
    return out

y_multi_train = get_output_for_multi(train_data)
y_multi_dev = get_output_for_multi(dev_data)
y_multi_test = get_output_for_multi(test_data)
y_multi_test_manual = get_output_for_multi(test_manual_multi_data)

In [13]:
print("Samples number:")
print("train:\t" + str(len(y_multi_train)))
print("dev:\t" + str(len(y_multi_dev)))
print("test:\t" + str(len(y_multi_test)))
print("test manual:\t" + str(len(y_multi_test_manual)))

Samples number:
train:	3871
dev:	461
test:	608
test manual:	757


In [14]:
def get_avg_embedding(embeddings):
    avg = embeddings[0]
    for i in range(1, len(embeddings)):
        avg = [avg[j] + embeddings[i][j] for j in range(len(avg))]
    for i in range(len(avg)):
        avg[i] /= len(embeddings)
    return avg

def max_pooling(embeddings):
    max = embeddings[0]
    for i in range(1, len(embeddings)):
        for j in range(len(avg)):
            if max[j] < embeddings[i][j]:
                max[j] = embeddings[i][j]
    return max

def get_input_vectors(data):
    vectors = []
    max_tokens = 26
    for i in range(len(data)):
        if embeddings_type == 'w2v':
            object_a = get_avg_embedding(data[i][5])
            object_b = get_avg_embedding(data[i][6])
            aspect = get_avg_embedding(data[i][7])
            sentence = get_avg_embedding(data[i][8])
        elif embeddings_type == 'elmo':
            object_a = data[i][5]
            object_b = data[i][6]
            aspect = data[i][7]
            sentence = data[i][8]
#         if data[i][4].find(data[i][0]) > data[i][4].find(data[i][1]):
#             first_object = np.array([1], dtype='float32')
#         else:
#             first_object = np.array([0], dtype='float32')
#         if len(data[i][8]) < max_tokens:
#             sentence = np.concatenate(data[i][8])
#             to_add = max_tokens - len(data[i][8])
#             for j in range(to_add):
#                 sentence = np.concatenate((sentence, np.zeros(300)))
#         else:
#             sentence = np.concatenate(data[i][8][:max_tokens])
#         vectors.append(np.concatenate((object_a, object_b, aspect)))
        vectors.append(np.concatenate((object_a, object_b, aspect, sentence)))
#         vectors.append(np.array([object_a, object_b, aspect, sentence]).mean(axis=0))
#         vectors.append(np.concatenate((first_object, sentence)))
    return vectors

X_train = get_input_vectors(train_data)
X_dev = get_input_vectors(dev_data)
X_test = get_input_vectors(test_data)
X_test_manual_bin = get_input_vectors(test_manual_bin_data)
X_test_manual_multi = get_input_vectors(test_manual_multi_data)

In [15]:
X_train[0].shape

(4096,)

### Check classes balance

In [16]:
pd.Series(y_bin_train).value_counts(normalize=True)

0    0.532937
1    0.467063
dtype: float64

In [17]:
pd.Series(y_multi_train).value_counts(normalize=True)

0    0.532937
2    0.253940
3    0.126066
1    0.087058
dtype: float64

In [18]:
pd.Series(y_bin_test_manual).value_counts(normalize=True)

1    1.0
dtype: float64

In [19]:
pd.Series(y_multi_test_manual).value_counts(normalize=True)

2    0.587847
1    0.233818
3    0.178336
dtype: float64

## Models

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

def report_scores(model, X, y):
    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    pr = precision_score(y, y_pred, average='weighted')
    re = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    print("Accuracy: {:.2f}".format(acc * 100))
    print("Precision: {:.2f}".format(pr * 100))
    print("Recall: {:.2f}".format(re * 100))
    print("F1: {:.2f}".format(f1 * 100))
    
    
def report_scores_multi(model, X, y):
    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    pr = precision_score(y, y_pred, average='weighted')
    re = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    print("Accuracy: {:.2f}".format(acc * 100))
    print("Precision: {:.2f}".format(pr * 100))
    print("Recall: {:.2f}".format(re * 100))
    print("F1: {:.2f}".format(f1 * 100))
    target_names = ['BAD', 'ASPECT', 'PREDICATE-FULL', 'PREDICATE-DEPENDENT']
    print(classification_report(y, y_pred, target_names=target_names))

### Random forest

In [39]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=200, max_depth=9, min_samples_leaf=1)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_bin_train)

print("Dev")
report_scores(model, X_dev, y_bin_dev)

start of fit
Train
Accuracy: 98.14
Precision: 98.18
Recall: 98.14
F1: 98.14
Dev
Accuracy: 85.25
Precision: 86.19
Recall: 85.25
F1: 85.25


In [40]:
# Evaluation
print("Test")
report_scores(model, X_test, y_bin_test)

Test
Accuracy: 82.73
Precision: 83.54
Recall: 82.73
F1: 82.63


In [41]:
# Evaluation
print("Test manual")
report_scores(model, X_test_manual_bin, y_bin_test_manual)

Test manual
Accuracy: 45.92
Precision: 100.00
Recall: 45.92
F1: 62.94


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [16]:
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

model = RandomForestClassifier()
# print(model.get_params().keys())
parameters_grid = {
    'n_estimators': [10, 20, 30, 50, 80, 100, 200],
    # 'min_samples_split': range(11, 17),
    'min_samples_leaf': range(1, 10, 2),
    'max_depth': range(3, 10)
    }

cv = StratifiedShuffleSplit(test_size=0.2, random_state=0)
grid_cv = GridSearchCV(model, parameters_grid, scoring='accuracy', cv=cv, n_jobs=4)

grid_cv.fit(X_train, y_bin_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2,
            train_size=None),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              rand

In [17]:
grid_cv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=9, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
grid_cv.best_score_

0.9583225806451613

In [20]:
grid_cv.best_params_

{'max_depth': 9, 'min_samples_leaf': 1, 'n_estimators': 200}

In [24]:
model = RandomForestClassifier(n_estimators=100, max_depth=9, min_samples_leaf=1)

print("start of fit")
model.fit(X_train, y_multi_train)

# Evaluation
print("Train")
report_scores_multi(model, X_train, y_multi_train)

print("Dev")
report_scores_multi(model, X_dev, y_multi_dev)

start of fit
Train
Accuracy: 98.81
Precision: 98.83
Recall: 98.81
F1: 98.80
                     precision    recall  f1-score   support

                BAD       0.99      1.00      0.99      2063
             ASPECT       1.00      0.92      0.96       337
     PREDICATE-FULL       0.98      1.00      0.99       983
PREDICATE-DEPENDENT       1.00      0.98      0.99       488

           accuracy                           0.99      3871
          macro avg       0.99      0.97      0.98      3871
       weighted avg       0.99      0.99      0.99      3871

Dev
Accuracy: 75.05
Precision: 74.08
Recall: 75.05
F1: 70.94
                     precision    recall  f1-score   support

                BAD       0.75      0.97      0.85       213
             ASPECT       1.00      0.06      0.12        32
     PREDICATE-FULL       0.84      0.80      0.82       167
PREDICATE-DEPENDENT       0.19      0.10      0.13        49

           accuracy                           0.75       461
    

In [25]:
# Evaluation
print("Test")
report_scores_multi(model, X_test, y_multi_test)

Test
Accuracy: 71.38
Precision: 71.63
Recall: 71.38
F1: 67.18
                     precision    recall  f1-score   support

                BAD       0.73      0.91      0.81       303
             ASPECT       1.00      0.15      0.27        65
     PREDICATE-FULL       0.74      0.83      0.78       162
PREDICATE-DEPENDENT       0.39      0.19      0.26        78

           accuracy                           0.71       608
          macro avg       0.71      0.52      0.53       608
       weighted avg       0.72      0.71      0.67       608



In [26]:
# Evaluation
print("Test manual")
report_scores_multi(model, X_test_manual_multi, y_multi_test_manual)

Test manual
Accuracy: 28.93
Precision: 81.19
Recall: 28.93
F1: 38.87
                     precision    recall  f1-score   support

                BAD       0.00      0.00      0.00         0
             ASPECT       1.00      0.04      0.08       177
     PREDICATE-FULL       0.82      0.44      0.58       445
PREDICATE-DEPENDENT       0.54      0.11      0.18       135

           accuracy                           0.29       757
          macro avg       0.59      0.15      0.21       757
       weighted avg       0.81      0.29      0.39       757



  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


### K neighbors

In [27]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=10)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_bin_train)

print("Dev")
report_scores(model, X_dev, y_bin_dev)

start of fit
Train
Accuracy: 89.36
Precision: 89.55
Recall: 89.36
F1: 89.31
Dev
Accuracy: 80.48
Precision: 80.51
Recall: 80.48
F1: 80.49


In [28]:
# Evaluation
print("Test")
report_scores(model, X_test, y_bin_test)

Test
Accuracy: 80.76
Precision: 80.86
Recall: 80.76
F1: 80.74


In [29]:
# Evaluation
print("Test manual")
report_scores(model, X_test_manual_bin, y_bin_test_manual)

Test manual
Accuracy: 57.88
Precision: 100.00
Recall: 57.88
F1: 73.32


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [39]:
model.get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [40]:
parameters_grid = {
    'n_neighbors': range(1, 11),
    }

cv = StratifiedShuffleSplit(test_size=0.2, random_state=0)
grid_cv = GridSearchCV(model, parameters_grid, scoring='accuracy', cv=cv)

grid_cv.fit(X_train, y_bin_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2,
            train_size=None),
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None, param_grid={'n_neighbors': range(1, 11)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [41]:
grid_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [42]:
grid_cv.best_score_

0.9660645161290322

In [43]:
grid_cv.best_params_

{'n_neighbors': 1}

In [30]:
model = KNeighborsClassifier(n_neighbors=10)

print("start of fit")
model.fit(X_train, y_multi_train)

# Evaluation
print("Train")
report_scores_multi(model, X_train, y_multi_train)

print("Dev")
report_scores_multi(model, X_dev, y_multi_dev)

start of fit
Train
Accuracy: 86.46
Precision: 86.60
Recall: 86.46
F1: 86.09
                     precision    recall  f1-score   support

                BAD       0.87      0.95      0.91      2063
             ASPECT       0.91      0.64      0.75       337
     PREDICATE-FULL       0.84      0.85      0.85       983
PREDICATE-DEPENDENT       0.88      0.69      0.77       488

           accuracy                           0.86      3871
          macro avg       0.87      0.78      0.82      3871
       weighted avg       0.87      0.86      0.86      3871

Dev
Accuracy: 72.89
Precision: 72.13
Recall: 72.89
F1: 72.43
                     precision    recall  f1-score   support

                BAD       0.77      0.81      0.79       213
             ASPECT       0.19      0.19      0.19        32
     PREDICATE-FULL       0.83      0.83      0.83       167
PREDICATE-DEPENDENT       0.49      0.39      0.43        49

           accuracy                           0.73       461
    

In [31]:
# Evaluation
print("Test")
report_scores_multi(model, X_test, y_multi_test)

Test
Accuracy: 68.91
Precision: 68.12
Recall: 68.91
F1: 68.28
                     precision    recall  f1-score   support

                BAD       0.78      0.84      0.81       303
             ASPECT       0.72      0.52      0.61        65
     PREDICATE-FULL       0.67      0.69      0.68       162
PREDICATE-DEPENDENT       0.27      0.24      0.26        78

           accuracy                           0.69       608
          macro avg       0.61      0.57      0.59       608
       weighted avg       0.68      0.69      0.68       608



In [32]:
# Evaluation
print("Test manual")
report_scores_multi(model, X_test_manual_multi, y_multi_test_manual)

Test manual
Accuracy: 42.67
Precision: 73.46
Recall: 42.67
F1: 52.02
                     precision    recall  f1-score   support

                BAD       0.00      0.00      0.00         0
             ASPECT       0.87      0.22      0.35       177
     PREDICATE-FULL       0.83      0.60      0.70       445
PREDICATE-DEPENDENT       0.26      0.11      0.16       135

           accuracy                           0.43       757
          macro avg       0.49      0.23      0.30       757
       weighted avg       0.73      0.43      0.52       757



  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [42]:
parameters_grid = {
    'n_neighbors': range(1, 20),
    }

cv = StratifiedShuffleSplit(test_size=0.2, random_state=0)
grid_cv = GridSearchCV(model, parameters_grid, scoring='f1_weighted', cv=cv, n_jobs=4)

grid_cv.fit(X_train, y_multi_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2,
            train_size=None),
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=10, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=4, param_grid={'n_neighbors': range(1, 20)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_weighted', verbose=0)

In [43]:
grid_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [44]:
grid_cv.best_score_

0.9571938251800783

In [45]:
grid_cv.best_params_

{'n_neighbors': 1}

### Support Vector Classifier

In [33]:
from sklearn.svm import SVC

model = SVC(kernel='rbf', gamma='auto')

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_bin_train)

print("Dev")
report_scores(model, X_dev, y_bin_dev)

start of fit
Train
Accuracy: 86.64
Precision: 86.68
Recall: 86.64
F1: 86.65
Dev
Accuracy: 88.29
Precision: 88.53
Recall: 88.29
F1: 88.30


In [34]:
# Evaluation
print("Test")
report_scores(model, X_test, y_bin_test)

Test
Accuracy: 83.22
Precision: 83.25
Recall: 83.22
F1: 83.22


In [35]:
# Evaluation
print("Test manual")
report_scores(model, X_test_manual_bin, y_bin_test_manual)

Test manual
Accuracy: 61.82
Precision: 100.00
Recall: 61.82
F1: 76.41


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [36]:
model = SVC(kernel='rbf', gamma='auto')

print("start of fit")
model.fit(X_train, y_multi_train)

# Evaluation
print("Train")
report_scores_multi(model, X_train, y_multi_train)

print("Dev")
report_scores_multi(model, X_dev, y_multi_dev)

start of fit
Train
Accuracy: 82.64
Precision: 83.54
Recall: 82.64
F1: 81.30
                     precision    recall  f1-score   support

                BAD       0.86      0.92      0.89      2063
             ASPECT       1.00      0.36      0.53       337
     PREDICATE-FULL       0.77      0.93      0.84       983
PREDICATE-DEPENDENT       0.76      0.55      0.64       488

           accuracy                           0.83      3871
          macro avg       0.85      0.69      0.72      3871
       weighted avg       0.84      0.83      0.81      3871

Dev
Accuracy: 77.01
Precision: 70.74
Recall: 77.01
F1: 72.61
                     precision    recall  f1-score   support

                BAD       0.75      0.95      0.83       213
             ASPECT       0.00      0.00      0.00        32
     PREDICATE-FULL       0.83      0.84      0.84       167
PREDICATE-DEPENDENT       0.57      0.24      0.34        49

           accuracy                           0.77       461
    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [37]:
# Evaluation
print("Test")
report_scores_multi(model, X_test, y_multi_test)

Test
Accuracy: 73.03
Precision: 73.28
Recall: 73.03
F1: 70.12
                     precision    recall  f1-score   support

                BAD       0.75      0.91      0.82       303
             ASPECT       1.00      0.28      0.43        65
     PREDICATE-FULL       0.74      0.81      0.77       162
PREDICATE-DEPENDENT       0.45      0.24      0.32        78

           accuracy                           0.73       608
          macro avg       0.73      0.56      0.59       608
       weighted avg       0.73      0.73      0.70       608



In [38]:
# Evaluation
print("Test manual")
report_scores_multi(model, X_test_manual_multi, y_multi_test_manual)

Test manual
Accuracy: 36.86
Precision: 78.75
Recall: 36.86
F1: 44.64
                     precision    recall  f1-score   support

                BAD       0.00      0.00      0.00         0
             ASPECT       1.00      0.07      0.14       177
     PREDICATE-FULL       0.79      0.58      0.67       445
PREDICATE-DEPENDENT       0.50      0.07      0.13       135

           accuracy                           0.37       757
          macro avg       0.57      0.18      0.23       757
       weighted avg       0.79      0.37      0.45       757



  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [57]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(solver='lbfgs', alpha=1e-3, hidden_layer_sizes=(10, 5, 3), random_state=1)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_bin_train)

print("Dev")
report_scores(model, X_dev, y_bin_dev)

start of fit
Train
Accuracy: 99.77
F1: 99.77
Dev
Accuracy: 76.14
F1: 76.14


In [58]:
# Evaluation
print("Test")
report_scores(model, X_test, y_bin_test)

Test
Accuracy: 82.73
F1: 82.57


In [59]:
model = MLPClassifier(solver='lbfgs', alpha=1e-3, hidden_layer_sizes=(10, 5, 3), random_state=1)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_multi_train)

print("Dev")
report_scores(model, X_dev, y_multi_dev)

start of fit
Train
Accuracy: 61.87
F1: 32.80
Dev
Accuracy: 40.56
F1: 21.62


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [60]:
# Evaluation
print("Test")
report_scores(model, X_test, y_multi_test)

Test
Accuracy: 50.82
F1: 25.60


  'precision', 'predicted', average, warn_for)
