# Data loading

In [1]:
import pandas as pd

In [2]:
# Column labels applied to the header-less CSV files.
names = ["OBJECT A", "OBJECT B", "ASPECT", "MOST FREQUENT RATING", "SENTENCE"]

# Read the train / test / dev splits of the fine-grained task with identical options.
df_train, df_test, df_dev = (
    pd.read_csv(csv_path, header=None, names=names)
    for csv_path in (
        "classification_fine_grained/train_clf_fine_grained.csv",
        "classification_fine_grained/test_clf_fine_grained.csv",
        "classification_fine_grained/dev_clf_fine_grained.csv",
    )
)

df_train.head(10)

Unnamed: 0,OBJECT A,OBJECT B,ASPECT,MOST FREQUENT RATING,SENTENCE
0,golf,hockey,spot,BAD,color: black tv show: 24 nhl team growing up: ...
1,golf,hockey,spot,BAD,miro's favorite things: color: blue team growi...
2,golf,hockey,greater,BAD,"in fact, athletes in individual sports (i.e., ..."
3,golf,hockey,greater,BAD,"first, athletes in individual sports (i.e., go..."
4,golf,hockey,greater,BAD,"football, with its greater numbers and establi..."
5,golf,hockey,greater,BAD,"whereas sky has not just 4 dedicated channels,..."
6,golf,hockey,prof,BAD,"being in a massive hockey market, i can't thin..."
7,golf,hockey,prof,BAD,prof. mcphee got the idea for the puck-smackin...
8,golf,hockey,mcphee,BAD,prof. mcphee got the idea for the puck-smackin...
9,hockey,golf,faster,PREDICATE-FULL,however hockey is much faster game than golf.


In [2]:
# Column labels applied to the header-less CSV files.
names = ["OBJECT A", "OBJECT B", "ASPECT", "MOST FREQUENT RATING", "SENTENCE"]

# Read the manually labelled test sets (binary first, then fine-grained).
df_test_manual_bin, df_test_manual_multi = (
    pd.read_csv(csv_path, header=None, names=names)
    for csv_path in (
        "classification_binary/test_manual_clf_binary.csv",
        "classification_fine_grained/test_manual_clf_fine_grained.csv",
    )
)

In [19]:
# Report how many rows each fine-grained split contains (one line per split).
print("Smaples number:")
print("train:\t" + str(df_train.shape[0]))
print("dev:\t" + str(df_dev.shape[0]))
print("test:\t" + str(df_test.shape[0]))

Smaples number:
train:	3871
dev:	461
test:	608


In [3]:
# Report the row counts of the two manually labelled test sets.
print("Smaples number:")
print("test manual bin:\t" + str(len(df_test_manual_bin)))
print("test manual multi:\t" + str(len(df_test_manual_multi)))

Smaples number:
test manual bin:	736
test manual multi:	757


# Preprocessing

In [4]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# nltk.download('stopwords')
# nltk.download('wordnet')

def get_list_of_tokens(df_texts):
    """Turn every entry of the "SENTENCE" column into a list of lemmatized tokens.

    For each sentence, every character of ``string.punctuation`` is replaced
    by a space, the text is split on whitespace, words found in NLTK's English
    stop-word list are dropped (the comparison is case-sensitive), and the
    remaining words are passed through the WordNet lemmatizer.

    Args:
        df_texts: Table exposing a "SENTENCE" column of strings
            (a pandas DataFrame in this notebook).

    Returns:
        A list with one list of tokens per sentence, in row order.
    """
    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # One translation table replaces all punctuation in a single pass.
    punctuation_to_space = str.maketrans({mark: " " for mark in string.punctuation})

    token_lists = []
    for sentence in df_texts["SENTENCE"].values:
        # split() without arguments already collapses runs of whitespace.
        words = sentence.translate(punctuation_to_space).split()
        token_lists.append([
            lemmatizer.lemmatize(word)
            for word in words
            if word not in english_stop_words
        ])
    return token_lists

In [3]:
# Tokenize each split with the shared preprocessing function.
tokens_test, tokens_train, tokens_dev = (
    get_list_of_tokens(split_frame)
    for split_frame in (df_test, df_train, df_dev)
)

# Peek at the first two tokenized test sentences.
tokens_test[:2]

[['merely',
  'saying',
  'think',
  'io',
  'worse',
  'multitasking',
  'android',
  'le',
  'interesting',
  'compare',
  'contrast',
  'earlier',
  'useful',
  'solution',
  'io',
  'android'],
 ['plus',
  'android',
  'developing',
  'way',
  'faster',
  'io',
  'chance',
  'become',
  'laptop',
  'replacement',
  'earlier',
  'io']]

In [5]:
# Tokenize both manually labelled test sets with the shared preprocessing function.
tokens_test_manual_bin, tokens_test_manual_multi = (
    get_list_of_tokens(manual_frame)
    for manual_frame in (df_test_manual_bin, df_test_manual_multi)
)

# Loading Word2Vec

In [6]:
import gensim
# Load pretrained word vectors stored in binary word2vec format. `model` is the
# word -> vector lookup that create_sentence_embeddings indexes into.
model = gensim.models.KeyedVectors.load_word2vec_format('model/GoogleNews-vectors-negative300.bin', binary=True)

# Creating sentence embeddings

In [7]:
def create_sentence_embeddings(words_list):
    """Look up the vector of every word in the global word2vec ``model``.

    Words for which the lookup raises ``KeyError`` are reported on stdout and
    left out of the result, so the returned list may be shorter than the input.

    Args:
        words_list: Iterable of word strings.

    Returns:
        A list with the vectors of the words that were found, in input order.
    """
    vectors = []
    for token in words_list:
        try:
            vector = model[token]
        except KeyError:
            print(token + " is not in the vocabulary, skipping...")
        else:
            vectors.append(vector)
    return vectors

def create_embeddings_from_data(df_data, df_tokens):
    """Append word-embedding lists to every row of a dataset.

    Each row of ``df_data`` is converted to a plain list and extended with
    four lists of word vectors, in this order: OBJECT A, OBJECT B, ASPECT
    (each split on whitespace) and the pre-tokenized sentence.

    Rows are matched to ``df_tokens`` by position, and the column values are
    also read by position, so the result does not depend on the labels of the
    DataFrame index (which here comes from the first CSV column).

    Args:
        df_data: DataFrame with string columns "OBJECT A", "OBJECT B" and
            "ASPECT".
        df_tokens: Sequence of token lists, one per row of ``df_data`` and in
            the same order.

    Returns:
        A list of rows; each row is the original column values followed by
        the four embedding lists.

    Raises:
        IndexError: If ``df_tokens`` has fewer entries than ``df_data`` has rows.
    """
    list_data = df_data.values.tolist()
    # Pull the text columns out once as plain lists so access is positional.
    objects_a = df_data["OBJECT A"].tolist()
    objects_b = df_data["OBJECT B"].tolist()
    aspects = df_data["ASPECT"].tolist()
    for position, row in enumerate(list_data):
        object_a_embedding = create_sentence_embeddings(objects_a[position].split())
        object_b_embedding = create_sentence_embeddings(objects_b[position].split())
        aspect_embedding = create_sentence_embeddings(aspects[position].split())
        sentence_embedding = create_sentence_embeddings(df_tokens[position])
        # Append explicitly instead of inserting at indices past the end of the row.
        row.extend([
            object_a_embedding,
            object_b_embedding,
            aspect_embedding,
            sentence_embedding,
        ])
    return list_data
    

## Example: converting a sentence into embeddings

In [8]:
# Embed the tokens of the first test sentence; tokens missing from the
# word2vec vocabulary are skipped by create_sentence_embeddings.
sentence = create_sentence_embeddings(tokens_test[0])

# Compare the token count with the number of vectors actually produced.
print("size of a sentence: " + str(len(tokens_test[0])))
print("size of a sentence of embeddings without rare words: " + str(len(sentence)))

size of a sentence: 16
size of a sentence of embeddings without rare words: 16


In [9]:
# Attach the embedding lists to every row of the train, dev and test splits.
list_train, list_dev, list_test = (
    create_embeddings_from_data(split_frame, split_tokens)
    for split_frame, split_tokens in (
        (df_train, tokens_train),
        (df_dev, tokens_dev),
        (df_test, tokens_test),
    )
)

24 is not in the vocabulary, skipping...
nordiques is not in the vocabulary, skipping...
bourque is not in the vocabulary, skipping...
czechoslovakian is not in the vocabulary, skipping...
lemieux is not in the vocabulary, skipping...
centre is not in the vocabulary, skipping...
lebowski is not in the vocabulary, skipping...
slovakia is not in the vocabulary, skipping...
42 is not in the vocabulary, skipping...
000 is not in the vocabulary, skipping...
17 is not in the vocabulary, skipping...
000 is not in the vocabulary, skipping...
000 is not in the vocabulary, skipping...
programme is not in the vocabulary, skipping...
bastien is not in the vocabulary, skipping...
mcphee is not in the vocabulary, skipping...
mcphee is not in the vocabulary, skipping...
mcphee is not in the vocabulary, skipping...
23 is not in the vocabulary, skipping...
dupuis is not in the vocabulary, skipping...
mickelson is not in the vocabulary, skipping...
caltech is not in the vocabulary, skipping...
jaronczyk

firebirdsql is not in the vocabulary, skipping...
firebirdsql is not in the vocabulary, skipping...
firebirdsql is not in the vocabulary, skipping...
olap is not in the vocabulary, skipping...
innodb is not in the vocabulary, skipping...
innodb is not in the vocabulary, skipping...
to is not in the vocabulary, skipping...
and is not in the vocabulary, skipping...
to is not in the vocabulary, skipping...
psone is not in the vocabulary, skipping...
16 is not in the vocabulary, skipping...
32 is not in the vocabulary, skipping...
r4000 is not in the vocabulary, skipping...
lcs is not in the vocabulary, skipping...
08pm is not in the vocabulary, skipping...
26 is not in the vocabulary, skipping...
2004 is not in the vocabulary, skipping...
11 is not in the vocabulary, skipping...
11pm is not in the vocabulary, skipping...
20 is not in the vocabulary, skipping...
2006 is not in the vocabulary, skipping...
3000 is not in the vocabulary, skipping...
2000 is not in the vocabulary, skipping...


90 is not in the vocabulary, skipping...
1000 is not in the vocabulary, skipping...
realise is not in the vocabulary, skipping...
visualage is not in the vocabulary, skipping...
qbasic is not in the vocabulary, skipping...
cincom is not in the vocabulary, skipping...
javabeans is not in the vocabulary, skipping...
97b is not in the vocabulary, skipping...
to is not in the vocabulary, skipping...
to is not in the vocabulary, skipping...
to is not in the vocabulary, skipping...
to is not in the vocabulary, skipping...
accessors is not in the vocabulary, skipping...
to is not in the vocabulary, skipping...
a is not in the vocabulary, skipping...
360 is not in the vocabulary, skipping...
360 is not in the vocabulary, skipping...
360 is not in the vocabulary, skipping...
57 is not in the vocabulary, skipping...
000 is not in the vocabulary, skipping...
360 is not in the vocabulary, skipping...
360 is not in the vocabulary, skipping...
360 is not in the vocabulary, skipping...
360 is not in 

In [8]:
# Attach the embedding lists to every row of the two manually labelled test sets.
list_test_manual_bin, list_test_manual_multi = (
    create_embeddings_from_data(manual_frame, manual_tokens)
    for manual_frame, manual_tokens in (
        (df_test_manual_bin, tokens_test_manual_bin),
        (df_test_manual_multi, tokens_test_manual_multi),
    )
)

- is not in the vocabulary, skipping...
to is not in the vocabulary, skipping...
and is not in the vocabulary, skipping...
sheryl is not in the vocabulary, skipping...
sandberg is not in the vocabulary, skipping...
sheryl is not in the vocabulary, skipping...
sandberg is not in the vocabulary, skipping...
. is not in the vocabulary, skipping...
. is not in the vocabulary, skipping...
gmp is not in the vocabulary, skipping...
gmp is not in the vocabulary, skipping...
18 is not in the vocabulary, skipping...
10x is not in the vocabulary, skipping...
- is not in the vocabulary, skipping...
of is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
to is not in the vocabulary, skipping...
and is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskel

haskell is not in the vocabulary, skipping...
typeful is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
++, is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
typeful is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
to is not in the vocabulary, skipping...
2007 is not in the vocabulary, skipping...
01 is not in the vocabulary, skipping...
26 is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
quickcheck is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
haskell is not in the vocabulary, skipping...
ocaml i

In [10]:
# Number of fields in the first training row after create_embeddings_from_data
# added the four embedding lists to the original column values.
len(list_train[0])

9

# Save dataset with pickle

In [9]:
import pickle
def save_embeddings_to_pickle(list_data, filename):
    """Serialize ``list_data`` to ``filename`` with pickle.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten. The ``with`` block guarantees the handle is closed even
    when ``pickle.dump`` raises.

    Args:
        list_data: Picklable object to store (the embedded dataset rows in
            this notebook).
        filename: Path of the output file.
    """
    with open(filename, "wb") as pickle_out:
        pickle.dump(list_data, pickle_out)

In [12]:
import pickle

# Path prefix for the pickled word2vec datasets; it is concatenated directly
# with each file name, so it must end with a path separator.
folder = 'embeddings/W2V/'
# Persist the embedded train, test and dev rows as separate pickle files.
save_embeddings_to_pickle(list_train, folder + "trainw2v.pickle")
save_embeddings_to_pickle(list_test, folder + "testw2v.pickle")
save_embeddings_to_pickle(list_dev, folder + "devw2v.pickle")

# pickle_out = open("w2v/" + str(category) + "/" + str(category) + "testw2v.pickle", "wb")
# pickle.dump(list_test, pickle_out)
# pickle_out.close()

In [10]:
# Path prefix for the pickled word2vec datasets; it is concatenated directly
# with each file name, so it must end with a path separator.
folder = 'embeddings/W2V/'
# Persist the embedded rows of the two manually labelled test sets.
save_embeddings_to_pickle(list_test_manual_bin, folder + "test_manual_binw2v.pickle")
save_embeddings_to_pickle(list_test_manual_multi, folder + "test_manual_multiw2v.pickle")

# Open train data

In [13]:
# Read the pickled training rows back from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust
# (here, the file written by save_embeddings_to_pickle).
with open(folder + "trainw2v.pickle", "rb") as pickle_in:
    data = pickle.load(pickle_in)

In [14]:
# Inspect the first row that was loaded from the pickle file.
data[0]

['golf',
 'hockey',
 'spot',
 'BAD',
 'color: black tv show: 24 nhl team growing up: quebec nordiques nhl player growing up: ray bourque food: easy to make steak movie: silence of the lambs car: bmw m5 brand of skates: ccm brand of hockey stick: tps former islander: mike bossy board game: monopoly sport (other than hockey): golf superpower: being unbeatable band: nickelback vacation spot: hawaii comic character: bart simpson non-north american city: stockholm snack food: yogurt stadium food: montreal hot dogs',
 [array([-0.01757812,  0.04907227,  0.08496094,  0.04711914, -0.11279297,
         -0.08398438, -0.06298828, -0.0559082 ,  0.28515625,  0.01312256,
          0.05224609, -0.38476562, -0.19042969,  0.15039062,  0.17773438,
          0.01403809,  0.17675781,  0.328125  ,  0.26953125, -0.08691406,
         -0.19140625,  0.16796875,  0.06835938,  0.16015625,  0.03540039,
          0.07373047, -0.41601562,  0.33984375,  0.09716797,  0.01672363,
          0.00738525,  0.12353516,  0.0