In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import pickle
import random

## Loading embeddings

In [2]:
folder = 'embeddings/W2V/'

with open(folder + "trainw2v.pickle", "rb") as pickle_in:
    train_data = pickle.load(pickle_in)
with open(folder + "devw2v.pickle", "rb") as pickle_in:
    dev_data = pickle.load(pickle_in)
with open(folder + "testw2v.pickle", "rb") as pickle_in:
    test_data = pickle.load(pickle_in)

## Loading dataframes

In [3]:
names = ["OBJECT A", "OBJECT B", "ASPECT", "MOST FREQUENT RATING", "SENTENCE"]
df_train = pd.read_csv("classification_fine_grained/train_clf_fine_grained.csv", header=None, names=names)
df_test = pd.read_csv("classification_fine_grained/test_clf_fine_grained.csv", header=None, names=names)
df_dev = pd.read_csv("classification_fine_grained/dev_clf_fine_grained.csv", header=None, names=names)

df_train.shape

(3871, 5)

### Dropping duplicate object-aspect combinations  
ONLY RUN IF YOU ARE NOT GOING TO USE SENTENCES

In [4]:
def drop_dups(data, df):
    df.drop('SENTENCE', axis=1, inplace=True)
    df.drop_duplicates(inplace=True)
    new_data = []
    for i in range(len(df.index)):
        new_data.append(data[df.index[i]])
    return new_data

train_data = drop_dups(train_data, df_train)
dev_data = drop_dups(dev_data, df_dev)
test_data = drop_dups(test_data, df_test)
len(train_data)

1206

In [4]:
print("len train " + str(len(train_data)))
print("len dev " + str(len(dev_data)))
print("len test " + str(len(test_data)))

len train 3871
len dev 461
len test 608


In [5]:
token_limit = 25

under_limit = 0
for i in range(len(train_data)):
    if len(train_data[i][8]) <= token_limit:
        under_limit += 1
under_limit /= len(train_data)
print('sentences under token limit(' + str(token_limit) + '): ' + str(under_limit))

sentences under token limit(25): 0.9609919917334022


### Filling empty embeddings with zeros (in cases w2v does not contain the words mentioned)

In [6]:
def fill_empty(data):
    embedding_dim = 300
    new_data = []
    for i in range(len(data)):
        for j in range(5, 9):
            if len(data[i][j]) == 0:
                data[i][j].append(np.zeros(embedding_dim))
        new_data.append(data[i])
    return new_data

train_data = fill_empty(train_data)
dev_data = fill_empty(dev_data)
test_data = fill_empty(test_data)
print("len train " + str(len(train_data)))
print("len dev " + str(len(dev_data)))
print("len test " + str(len(test_data)))

len train 3871
len dev 461
len test 608


### Deleting samples with no embeddings (another way to treat empty embeddings)

In [7]:
def delete_empty(data):
    new_data = []
    for i in range(len(data)):
        ok = True
        for j in range(5, 9):
            if len(data[i][j]) == 0:
                ok = False
                break
        if ok:
            new_data.append(data[i])
    return new_data

train_data = delete_empty(train_data)
dev_data = delete_empty(dev_data)
test_data = delete_empty(test_data)
print("len train " + str(len(train_data)))
print("len dev " + str(len(dev_data)))
print("len test " + str(len(test_data)))

len train 3743
len dev 451
len test 559


### Shuffling the train data

In [7]:
random.shuffle(train_data)

In [9]:
def get_output_for_binary(data):
    out = []
#     new_data = []
    for i in range(len(data)):
#         new_data.append(data[i])
#         if random.choice([True, False]):
#             new_data[i][5] = data[i][6]
#             new_data[i][6] = data[i][5]
#             out.append(1)
#         else:
#             out.append(0)
        if data[i][3] == 'BAD':
            out.append(0)
        else:
            out.append(1)
    return out

y_bin_train = get_output_for_binary(train_data)
y_bin_dev = get_output_for_binary(dev_data)
y_bin_test = get_output_for_binary(test_data)

In [16]:
print("Samples number:")
print("train:\t" + str(len(y_bin_train)))
print("dev:\t" + str(len(y_bin_dev)))
print("test:\t" + str(len(y_bin_test)))

Samples number:
train:	3871
dev:	461
test:	608


In [13]:
def get_output_for_multi(data):
    out = []
#     new_data = []
    for i in range(len(data)):
#         new_data.append(data[i])
#         if random.choice([True, False]):
#             new_data[i][5] = data[i][6]
#             new_data[i][6] = data[i][5]
#             out.append(1)
#         else:
#             out.append(0)
        if data[i][3] == 'BAD':
            out.append(0)
        elif data[i][3] == 'ASPECT':
            out.append(1)
        elif data[i][3] == 'PREDICATE-FULL':
            out.append(2)
        elif data[i][3] == 'PREDICATE-DEPENDENT':
            out.append(3)
    return out

y_multi_train = get_output_for_multi(train_data)
y_multi_dev = get_output_for_multi(dev_data)
y_multi_test = get_output_for_multi(test_data)

In [15]:
print("Samples number:")
print("train:\t" + str(len(y_multi_train)))
print("dev:\t" + str(len(y_multi_dev)))
print("test:\t" + str(len(y_multi_test)))

Samples number:
train:	3871
dev:	461
test:	608


In [11]:
def get_avg_embedding(embeddings):
    avg = embeddings[0]
    for i in range(1, len(embeddings)):
        avg = [avg[j] + embeddings[i][j] for j in range(len(avg))]
    for i in range(len(avg)):
        avg[i] /= len(embeddings)
    return avg

def max_pooling(embeddings):
    max = embeddings[0]
    for i in range(1, len(embeddings)):
        for j in range(len(avg)):
            if max[j] < embeddings[i][j]:
                max[j] = embeddings[i][j]
    return max

def get_input_vectors(data):
    vectors = []
    max_tokens = 26
    for i in range(len(data)):
#         if data[i][4].find(data[i][0]) > data[i][4].find(data[i][1]):
#             first_object = np.array([1], dtype='float32')
#         else:
#             first_object = np.array([0], dtype='float32')
        object_a = get_avg_embedding(data[i][5])
        object_b = get_avg_embedding(data[i][6])
        aspect = get_avg_embedding(data[i][7])
        sentence = get_avg_embedding(data[i][8])
#         if len(data[i][8]) < max_tokens:
#             sentence = np.concatenate(data[i][8])
#             to_add = max_tokens - len(data[i][8])
#             for j in range(to_add):
#                 sentence = np.concatenate((sentence, np.zeros(300)))
#         else:
#             sentence = np.concatenate(data[i][8][:max_tokens])
#         vectors.append(np.concatenate((object_a, object_b, aspect)))
#         object_a = data[i][5]
#         object_b = data[i][6]
#         aspect = data[i][7]
#         sentence = data[i][8]
        vectors.append(np.concatenate((object_a, object_b, aspect, sentence)))
#         vectors.append(np.array([object_a, object_b, aspect, sentence]).mean(axis=0))
#         vectors.append(np.concatenate((first_object, sentence)))
    return vectors

X_train = get_input_vectors(train_data)
X_dev = get_input_vectors(dev_data)
X_test = get_input_vectors(test_data)

In [12]:
X_train[0].shape

(1200,)

### Check classes balance

In [27]:
pd.Series(y_bin_train).value_counts(normalize=True)

0    0.532937
1    0.467063
dtype: float64

In [28]:
pd.Series(y_multi_train).value_counts(normalize=True)

0    0.532937
2    0.253940
3    0.126066
1    0.087058
dtype: float64

## Models

In [22]:
def report_scores(model, X, y):
    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred, average='macro')
    print("Accuracy: {:.2f}".format(acc * 100))
    print("F1: {:.2f}".format(f1 * 100))

### Random forest

In [34]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=80, max_depth=9)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_bin_train)

print("Dev")
report_scores(model, X_dev, y_bin_dev)

start of fit
Train
Accuracy: 97.75
F1: 97.75
Dev
Accuracy: 88.94
F1: 88.93


In [35]:
# Evaluation
print("Test")
report_scores(model, X_test, y_bin_test)

Test
Accuracy: 81.91
F1: 81.82


In [30]:
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

# print(model.get_params().keys())
parameters_grid = {
    'n_estimators': [10, 20, 30, 50, 80, 100],
    # 'criterion': ['gini', 'entropy'],
    # 'min_samples_split': range(11, 17),
    # 'min_samples_leaf': range(1, 10),
    'max_depth': range(3, 10)
    }

cv = StratifiedShuffleSplit(test_size=0.2, random_state=0)
grid_cv = GridSearchCV(model, parameters_grid, scoring='accuracy', cv=cv)

grid_cv.fit(X_train, y_bin_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2,
            train_size=None),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=4,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=30, n_jobs=None,
                                              oob_score=False,
                                              random_stat

In [31]:
grid_cv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=9, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=80,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [32]:
grid_cv.best_score_

0.9532903225806452

In [33]:
grid_cv.best_params_

{'max_depth': 9, 'n_estimators': 80}

In [51]:
model = RandomForestClassifier(n_estimators=80, max_depth=9)

print("start of fit")
model.fit(X_train, y_multi_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_multi_train)

print("Dev")
report_scores(model, X_dev, y_multi_dev)

start of fit
Train
Accuracy: 97.83
F1: 97.61
Dev
Accuracy: 78.31
F1: 61.01


In [52]:
# Evaluation
print("Test")
report_scores(model, X_test, y_multi_test)

Test
Accuracy: 75.99
F1: 65.41


### K neighbors

In [46]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=10)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_bin_train)

print("Dev")
report_scores(model, X_dev, y_bin_dev)

start of fit
Train
Accuracy: 86.95
F1: 86.72
Dev
Accuracy: 65.73
F1: 65.27


In [47]:
# Evaluation
print("Test")
report_scores(model, X_test, y_bin_test)

Test
Accuracy: 76.81
F1: 76.70


In [39]:
model.get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [40]:
parameters_grid = {
    'n_neighbors': range(1, 11),
    }

cv = StratifiedShuffleSplit(test_size=0.2, random_state=0)
grid_cv = GridSearchCV(model, parameters_grid, scoring='accuracy', cv=cv)

grid_cv.fit(X_train, y_bin_train)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=0.2,
            train_size=None),
             error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None, param_grid={'n_neighbors': range(1, 11)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [41]:
grid_cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [42]:
grid_cv.best_score_

0.9660645161290322

In [43]:
grid_cv.best_params_

{'n_neighbors': 1}

In [53]:
model = KNeighborsClassifier(n_neighbors=10)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_multi_train)

print("Dev")
report_scores(model, X_dev, y_multi_dev)

start of fit


  'precision', 'predicted', average, warn_for)


Train
Accuracy: 55.90
F1: 28.04
Dev
Accuracy: 28.20
F1: 16.16


  'precision', 'predicted', average, warn_for)


In [54]:
# Evaluation
print("Test")
report_scores(model, X_test, y_multi_test)

Test
Accuracy: 46.71
F1: 24.16


  'precision', 'predicted', average, warn_for)


### Support Vector Classifier

In [48]:
from sklearn.svm import SVC

model = SVC(kernel='rbf', gamma='auto')

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_bin_train)

print("Dev")
report_scores(model, X_dev, y_bin_dev)

start of fit
Train
Accuracy: 81.35
F1: 80.82
Dev
Accuracy: 79.61
F1: 79.38


In [49]:
# Evaluation
print("Test")
report_scores(model, X_test, y_bin_test)

Test
Accuracy: 77.80
F1: 77.33


In [55]:
model = SVC(kernel='rbf', gamma='auto')

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_multi_train)

print("Dev")
report_scores(model, X_dev, y_multi_dev)

start of fit
Train


  'precision', 'predicted', average, warn_for)


Accuracy: 49.78
F1: 21.86
Dev
Accuracy: 45.12
F1: 20.39


  'precision', 'predicted', average, warn_for)


In [56]:
# Evaluation
print("Test")
report_scores(model, X_test, y_multi_test)

Test
Accuracy: 46.88
F1: 21.03


  'precision', 'predicted', average, warn_for)


In [57]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(solver='lbfgs', alpha=1e-3, hidden_layer_sizes=(10, 5, 3), random_state=1)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_bin_train)

print("Dev")
report_scores(model, X_dev, y_bin_dev)

start of fit
Train
Accuracy: 99.77
F1: 99.77
Dev
Accuracy: 76.14
F1: 76.14


In [58]:
# Evaluation
print("Test")
report_scores(model, X_test, y_bin_test)

Test
Accuracy: 82.73
F1: 82.57


In [59]:
model = MLPClassifier(solver='lbfgs', alpha=1e-3, hidden_layer_sizes=(10, 5, 3), random_state=1)

print("start of fit")
model.fit(X_train, y_bin_train)

# Evaluation
print("Train")
report_scores(model, X_train, y_multi_train)

print("Dev")
report_scores(model, X_dev, y_multi_dev)

start of fit
Train
Accuracy: 61.87
F1: 32.80
Dev
Accuracy: 40.56
F1: 21.62


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [60]:
# Evaluation
print("Test")
report_scores(model, X_test, y_multi_test)

Test
Accuracy: 50.82
F1: 25.60


  'precision', 'predicted', average, warn_for)
