## Loading embeddings

In [1]:
import pickle

folder = 'embeddings/ELMo/'

with open(folder + "trainelmo.pickle", "rb") as pickle_in:
    train_data = pickle.load(pickle_in)
with open(folder + "develmo.pickle", "rb") as pickle_in:
    dev_data = pickle.load(pickle_in)
with open(folder + "testelmo.pickle", "rb") as pickle_in:
    test_data = pickle.load(pickle_in)

## Loading dataframes

In [2]:
import pandas as pd
import numpy as np

names = ["OBJECT A", "OBJECT B", "ASPECT", "MOST FREQUENT RATING", "SENTENCE"]
df_train = pd.read_csv("classification_fine_grained/train_clf_fine_grained.csv", header=None, names=names)
df_test = pd.read_csv("classification_fine_grained/test_clf_fine_grained.csv", header=None, names=names)
df_dev = pd.read_csv("classification_fine_grained/dev_clf_fine_grained.csv", header=None, names=names)

df_train.shape

(3871, 5)

### Dropping duplicate object-aspect combinations  
ONLY RUN IF YOU ARE NOT GOING TO USE SENTENCES

In [4]:
def drop_dups(data, df):
    df.drop('SENTENCE', axis=1, inplace=True)
    df.drop_duplicates(inplace=True)
    new_data = []
    for i in range(len(df.index)):
        new_data.append(data[df.index[i]])
    return new_data

train_data = drop_dups(train_data, df_train)
dev_data = drop_dups(dev_data, df_dev)
test_data = drop_dups(test_data, df_test)
len(train_data)

1206

In [3]:
print("len train " + str(len(train_data)))
print("len dev " + str(len(dev_data)))
print("len test " + str(len(test_data)))

len train 3871
len dev 461
len test 608


In [5]:
token_limit = 25

under_limit = 0
for i in range(len(train_data)):
    if len(train_data[i][8]) <= token_limit:
        under_limit += 1
under_limit /= len(train_data)
print('sentences under token limit(' + str(token_limit) + '): ' + str(under_limit))

sentences under token limit(25): 0.9609919917334022


### Filling empty embeddings with zeros (in cases w2v does not contain the words mentioned)

In [4]:
def fill_empty(data):
    embedding_dim = 300
    new_data = []
    for i in range(len(data)):
        for j in range(5, 9):
            if len(data[i][j]) == 0:
                data[i][j].append(np.zeros(embedding_dim))
        new_data.append(data[i])
    return new_data

train_data = fill_empty(train_data)
dev_data = fill_empty(dev_data)
test_data = fill_empty(test_data)
print("len train " + str(len(train_data)))
print("len dev " + str(len(dev_data)))
print("len test " + str(len(test_data)))

len train 3871
len dev 461
len test 608


### Deleting samples with no embeddings (another way to treat empty embeddings)

In [7]:
def delete_empty(data):
    new_data = []
    for i in range(len(data)):
        ok = True
        for j in range(5, 9):
            if len(data[i][j]) == 0:
                ok = False
                break
        if ok:
            new_data.append(data[i])
    return new_data

train_data = delete_empty(train_data)
dev_data = delete_empty(dev_data)
test_data = delete_empty(test_data)
print("len train " + str(len(train_data)))
print("len dev " + str(len(dev_data)))
print("len test " + str(len(test_data)))

len train 3743
len dev 451
len test 559


### Shuffling the train data

In [4]:
import random

random.shuffle(train_data)

In [5]:
def get_output_for_binary(data):
    out = []
#     new_data = []
    for i in range(len(data)):
#         new_data.append(data[i])
#         if random.choice([True, False]):
#             new_data[i][5] = data[i][6]
#             new_data[i][6] = data[i][5]
#             out.append(1)
#         else:
#             out.append(0)
        if data[i][3] == 'BAD':
            out.append(0)
        else:
            out.append(1)
    return out

y_bin_train = get_output_for_binary(train_data)
y_bin_dev = get_output_for_binary(dev_data)
y_bin_test = get_output_for_binary(test_data)

In [6]:
print("Samples number:")
print("train:\t" + str(len(y_bin_train)))
print("dev:\t" + str(len(y_bin_dev)))
print("test:\t" + str(len(y_bin_test)))

Samples number:
train:	3871
dev:	461
test:	608


In [7]:
def get_output_for_multi(data):
    out = []
#     new_data = []
    for i in range(len(data)):
#         new_data.append(data[i])
#         if random.choice([True, False]):
#             new_data[i][5] = data[i][6]
#             new_data[i][6] = data[i][5]
#             out.append(1)
#         else:
#             out.append(0)
        if data[i][3] == 'BAD':
            out.append(0)
        elif data[i][3] == 'ASPECT':
            out.append(1)
        elif data[i][3] == 'PREDICATE-FULL':
            out.append(2)
        elif data[i][3] == 'PREDICATE-DEPENDENT':
            out.append(3)
    return out

y_multi_train = get_output_for_multi(train_data)
y_multi_dev = get_output_for_multi(dev_data)
y_multi_test = get_output_for_multi(test_data)

In [8]:
print("Samples number:")
print("train:\t" + str(len(y_multi_train)))
print("dev:\t" + str(len(y_multi_dev)))
print("test:\t" + str(len(y_multi_test)))

Samples number:
train:	3871
dev:	461
test:	608


In [9]:
def get_avg_embedding(embeddings):
    avg = embeddings[0]
    for i in range(1, len(embeddings)):
        avg = [avg[j] + embeddings[i][j] for j in range(len(avg))]
    for i in range(len(avg)):
        avg[i] /= len(embeddings)
    return avg

def max_pooling(embeddings):
    max = embeddings[0]
    for i in range(1, len(embeddings)):
        for j in range(len(avg)):
            if max[j] < embeddings[i][j]:
                max[j] = embeddings[i][j]
    return max

def get_input_vectors(data):
    vectors = []
    max_tokens = 26
    for i in range(len(data)):
#         if data[i][4].find(data[i][0]) > data[i][4].find(data[i][1]):
#             first_object = np.array([1], dtype='float32')
#         else:
#             first_object = np.array([0], dtype='float32')
#         object_a = get_avg_embedding(data[i][5])
#         object_b = get_avg_embedding(data[i][6])
#         aspect = get_avg_embedding(data[i][7])
#         sentence = get_avg_embedding(data[i][8])
#         if len(data[i][8]) < max_tokens:
#             sentence = np.concatenate(data[i][8])
#             to_add = max_tokens - len(data[i][8])
#             for j in range(to_add):
#                 sentence = np.concatenate((sentence, np.zeros(300)))
#         else:
#             sentence = np.concatenate(data[i][8][:max_tokens])
#         vectors.append(np.concatenate((object_a, object_b, aspect)))
        object_a = data[i][5]
        object_b = data[i][6]
        aspect = data[i][7]
        sentence = data[i][8]
        vectors.append(np.concatenate((object_a, object_b, aspect, sentence)))
#         vectors.append(np.array([object_a, object_b, aspect, sentence]).mean(axis=0))
#         vectors.append(np.concatenate((first_object, sentence)))
    return vectors

X_train = get_input_vectors(train_data)
X_dev = get_input_vectors(dev_data)
X_test = get_input_vectors(test_data)

In [10]:
X_train[0].shape

(4096,)

### Check classes balance

In [11]:
pd.Series(y_bin_train).value_counts(normalize=True)

0    0.532937
1    0.467063
dtype: float64

In [12]:
pd.Series(y_multi_train).value_counts(normalize=True)

0    0.532937
2    0.253940
3    0.126066
1    0.087058
dtype: float64

## Models

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

def report_scores(model, X, y):
    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    pr = precision_score(y, y_pred, average='weighted')
    re = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    print("Accuracy: {:.2f}".format(acc * 100))
    print("Precision: {:.2f}".format(pr * 100))
    print("Recall: {:.2f}".format(re * 100))
    print("F1: {:.2f}".format(f1 * 100))
    
    
def report_scores_multi(model, X, y):
    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    pr = precision_score(y, y_pred, average='weighted')
    re = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    print("Accuracy: {:.2f}".format(acc * 100))
    print("Precision: {:.2f}".format(pr * 100))
    print("Recall: {:.2f}".format(re * 100))
    print("F1: {:.2f}".format(f1 * 100))
    target_names = ['BAD', 'ASPECT', 'PREDICATE-FULL', 'PREDICATE-DEPENDENT']
    print(classification_report(y, y_pred, target_names=target_names))

### Random forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=200, max_depth=9, min_samples_leaf=1)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_bin_train)

print("Dev")
report_scores(model, X_dev, y_bin_dev)

start of fit
Train
Accuracy: 98.22
Precision: 98.25
Recall: 98.22
F1: 98.22
Dev
Accuracy: 84.38
Precision: 85.63
Recall: 84.38
F1: 84.37


In [22]:
# Evaluation
print("Test")
report_scores(model, X_test, y_bin_test)

Test
Accuracy: 81.74
Precision: 82.67
Recall: 81.74
F1: 81.62


In [16]:
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

model = RandomForestClassifier()
# print(model.get_params().keys())
parameters_grid = {
    'n_estimators': [10, 20, 30, 50, 80, 100, 200],
    # 'min_samples_split': range(11, 17),
    'min_samples_leaf': range(1, 10, 2),
    'max_depth': range(3, 10)
    }

cv = StratifiedShuffleSplit(test_size=0.2, random_state=0)
grid_cv = GridSearchCV(model, parameters_grid, scoring='accuracy', cv=cv, n_jobs=4)

grid_cv.fit(X_train, y_bin_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2,
            train_size=None),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              rand

In [17]:
grid_cv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=9, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
grid_cv.best_score_

0.9583225806451613

In [20]:
grid_cv.best_params_

{'max_depth': 9, 'min_samples_leaf': 1, 'n_estimators': 200}

In [37]:
model = RandomForestClassifier(n_estimators=100, max_depth=9, min_samples_leaf=1)

print("start of fit")
model.fit(X_train, y_multi_train)

# Evaluation
print("Train")
report_scores_multi(model, X_train, y_multi_train)

print("Dev")
report_scores_multi(model, X_dev, y_multi_dev)

start of fit
Train
Accuracy: 97.83
Precision: 97.84
Recall: 97.83
F1: 97.83
                     precision    recall  f1-score   support

                BAD       0.98      0.99      0.98      2063
             ASPECT       1.00      0.95      0.98       337
     PREDICATE-FULL       0.97      0.97      0.97       983
PREDICATE-DEPENDENT       0.99      0.97      0.98       488

           accuracy                           0.98      3871
          macro avg       0.98      0.97      0.98      3871
       weighted avg       0.98      0.98      0.98      3871

Dev
Accuracy: 75.27
Precision: 75.01
Recall: 75.27
F1: 73.23
                     precision    recall  f1-score   support

                BAD       0.73      0.94      0.82       213
             ASPECT       0.62      0.16      0.25        32
     PREDICATE-FULL       0.85      0.72      0.78       167
PREDICATE-DEPENDENT       0.58      0.43      0.49        49

           accuracy                           0.75       461
    

In [38]:
# Evaluation
print("Test")
report_scores_multi(model, X_test, y_multi_test)

Test
Accuracy: 75.82
Precision: 77.27
Recall: 75.82
F1: 73.76
                     precision    recall  f1-score   support

                BAD       0.74      0.93      0.82       303
             ASPECT       1.00      0.31      0.47        65
     PREDICATE-FULL       0.85      0.79      0.82       162
PREDICATE-DEPENDENT       0.57      0.38      0.46        78

           accuracy                           0.76       608
          macro avg       0.79      0.60      0.64       608
       weighted avg       0.77      0.76      0.74       608



### K neighbors

In [23]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=10)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_bin_train)

print("Dev")
report_scores(model, X_dev, y_bin_dev)

start of fit
Train
Accuracy: 86.90
Precision: 87.49
Recall: 86.90
F1: 86.78
Dev
Accuracy: 65.73
Precision: 65.62
Recall: 65.73
F1: 65.57


In [24]:
# Evaluation
print("Test")
report_scores(model, X_test, y_bin_test)

Test
Accuracy: 76.81
Precision: 77.37
Recall: 76.81
F1: 76.70


In [39]:
model.get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [40]:
parameters_grid = {
    'n_neighbors': range(1, 11),
    }

cv = StratifiedShuffleSplit(test_size=0.2, random_state=0)
grid_cv = GridSearchCV(model, parameters_grid, scoring='accuracy', cv=cv)

grid_cv.fit(X_train, y_bin_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2,
            train_size=None),
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None, param_grid={'n_neighbors': range(1, 11)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [41]:
grid_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [42]:
grid_cv.best_score_

0.9660645161290322

In [43]:
grid_cv.best_params_

{'n_neighbors': 1}

In [54]:
model = KNeighborsClassifier(n_neighbors=10)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores_multi(model, X_train, y_multi_train)

print("Dev")
report_scores_multi(model, X_dev, y_multi_dev)

start of fit
Train


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy: 55.90
Precision: 45.73
Recall: 55.90
F1: 49.20
                     precision    recall  f1-score   support

                BAD       0.83      0.94      0.88      2063
             ASPECT       0.14      0.66      0.24       337
     PREDICATE-FULL       0.00      0.00      0.00       983
PREDICATE-DEPENDENT       0.00      0.00      0.00       488

           accuracy                           0.56      3871
          macro avg       0.24      0.40      0.28      3871
       weighted avg       0.46      0.56      0.49      3871

Dev
Accuracy: 28.20
Precision: 29.75
Recall: 28.20
F1: 28.54
                     precision    recall  f1-score   support

                BAD       0.64      0.59      0.61       213
             ASPECT       0.02      0.16      0.03        32
     PREDICATE-FULL       0.00      0.00      0.00       167
PREDICATE-DEPENDENT       0.00      0.00      0.00        49

           accuracy                           0.28       461
          macro avg    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [55]:
# Evaluation
print("Test")
report_scores_multi(model, X_test, y_multi_test)

Test
Accuracy: 46.71
Precision: 37.81
Recall: 46.71
F1: 40.97
                     precision    recall  f1-score   support

                BAD       0.73      0.84      0.78       303
             ASPECT       0.11      0.46      0.18        65
     PREDICATE-FULL       0.00      0.00      0.00       162
PREDICATE-DEPENDENT       0.00      0.00      0.00        78

           accuracy                           0.47       608
          macro avg       0.21      0.32      0.24       608
       weighted avg       0.38      0.47      0.41       608



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [42]:
parameters_grid = {
    'n_neighbors': range(1, 20),
    }

cv = StratifiedShuffleSplit(test_size=0.2, random_state=0)
grid_cv = GridSearchCV(model, parameters_grid, scoring='f1_weighted', cv=cv, n_jobs=4)

grid_cv.fit(X_train, y_multi_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2,
            train_size=None),
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=10, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=4, param_grid={'n_neighbors': range(1, 20)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_weighted', verbose=0)

In [43]:
grid_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [44]:
grid_cv.best_score_

0.9571938251800783

In [45]:
grid_cv.best_params_

{'n_neighbors': 1}

### Support Vector Classifier

In [26]:
from sklearn.svm import SVC

model = SVC(kernel='rbf', gamma='auto')

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_bin_train)

print("Dev")
report_scores(model, X_dev, y_bin_dev)

start of fit
Train
Accuracy: 81.35
Precision: 82.45
Recall: 81.35
F1: 81.03
Dev
Accuracy: 79.61
Precision: 84.51
Recall: 79.61
F1: 79.21


In [27]:
# Evaluation
print("Test")
report_scores(model, X_test, y_bin_test)

Test
Accuracy: 77.80
Precision: 80.42
Recall: 77.80
F1: 77.32


In [58]:
model = SVC(kernel='rbf', gamma='auto')

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores_multi(model, X_train, y_multi_train)

print("Dev")
report_scores_multi(model, X_dev, y_multi_dev)

start of fit
Train


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy: 49.78
Precision: 41.40
Recall: 49.78
F1: 45.07
                     precision    recall  f1-score   support

                BAD       0.77      0.92      0.84      2063
             ASPECT       0.02      0.09      0.03       337
     PREDICATE-FULL       0.00      0.00      0.00       983
PREDICATE-DEPENDENT       0.00      0.00      0.00       488

           accuracy                           0.50      3871
          macro avg       0.20      0.25      0.22      3871
       weighted avg       0.41      0.50      0.45      3871

Dev
Accuracy: 45.12
Precision: 32.36
Recall: 45.12
F1: 37.69
                     precision    recall  f1-score   support

                BAD       0.70      0.98      0.82       213
             ASPECT       0.00      0.00      0.00        32
     PREDICATE-FULL       0.00      0.00      0.00       167
PREDICATE-DEPENDENT       0.00      0.00      0.00        49

           accuracy                           0.45       461
          macro avg    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [59]:
# Evaluation
print("Test")
report_scores_multi(model, X_test, y_multi_test)

Test
Accuracy: 46.88
Precision: 35.84
Recall: 46.88
F1: 40.54
                     precision    recall  f1-score   support

                BAD       0.71      0.92      0.81       303
             ASPECT       0.02      0.08      0.04        65
     PREDICATE-FULL       0.00      0.00      0.00       162
PREDICATE-DEPENDENT       0.00      0.00      0.00        78

           accuracy                           0.47       608
          macro avg       0.18      0.25      0.21       608
       weighted avg       0.36      0.47      0.41       608



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [57]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(solver='lbfgs', alpha=1e-3, hidden_layer_sizes=(10, 5, 3), random_state=1)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_bin_train)

print("Dev")
report_scores(model, X_dev, y_bin_dev)

start of fit
Train
Accuracy: 99.77
F1: 99.77
Dev
Accuracy: 76.14
F1: 76.14


In [58]:
# Evaluation
print("Test")
report_scores(model, X_test, y_bin_test)

Test
Accuracy: 82.73
F1: 82.57


In [59]:
model = MLPClassifier(solver='lbfgs', alpha=1e-3, hidden_layer_sizes=(10, 5, 3), random_state=1)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_multi_train)

print("Dev")
report_scores(model, X_dev, y_multi_dev)

start of fit
Train
Accuracy: 61.87
F1: 32.80
Dev
Accuracy: 40.56
F1: 21.62


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [60]:
# Evaluation
print("Test")
report_scores(model, X_test, y_multi_test)

Test
Accuracy: 50.82
F1: 25.60


  'precision', 'predicted', average, warn_for)
