In [1]:
import csv
import numpy as np
import pandas as pd

# Word2vec
from gensim.models import Word2Vec
from gensim.utils import tokenize
from unidecode import unidecode

In [2]:
# no sentence in the dataset has more than 40 words; almost every sentence has less than 20
max_words = 25
n_dim = 80    # embedding dimension
pca_dim = 16

In [3]:
train_data = pd.read_csv("data/train.csv")

In [4]:
sentences = [list(tokenize(s, deacc=True, lower=True)) for s in train_data['text']]
print(sentences[:5])

[['rt', 'refarcir', 'macron', 'ans', 'nom', 'prepare'], ['populaire'], ['faut', 'degager', 'cingle'], ['enseignants', 'mettre', 'prescriptions', 'president', 'republique', 'faut', 'croire', 'president', 'donner', 'periode', 'compte', 'mesures', 'appliquees', 'grever'], ['mafieuse', 'oppressive', 'macron']]


In [5]:
model = Word2Vec(vector_size=n_dim, window=20, min_count=1, workers=7)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=5)

(15074350, 16375635)

In [6]:
from sklearn.decomposition import PCA

In [7]:
# fiting pca
all_embeddings = []
for idx in range(train_data.shape[0]):
    sentence_embeddings = np.zeros((max_words, n_dim))
    for i, s in enumerate(train_data.loc[idx, "text"].split()):
        try:
            all_embeddings.append(model.wv[unidecode(s).lower()])
        except:
            pass

In [8]:
all_embeddings = np.stack(all_embeddings, axis=0)
print(all_embeddings.shape)

(3274894, 80)


In [9]:
my_pca = PCA(n_components = pca_dim)
my_pca.fit(all_embeddings)

In [10]:
embeddings = np.zeros((train_data.shape[0], max_words, pca_dim))

In [11]:
for idx in range(train_data.shape[0]):
    sentence_embeddings = np.zeros((max_words, pca_dim))
    if idx % 10000 == 0:
        print(idx)
    for i, s in enumerate(train_data.loc[idx, "text"].split()):
        try:
            sentence_embeddings[i, :] = my_pca.transform(model.wv[unidecode(s).lower()].reshape(1,-1))
        except:
            print("failed for index", idx, s)
    embeddings[idx] = sentence_embeddings

0
failed for index 873 corinne
failed for index 873 lepage
failed for index 873 agriculture
failed for index 873 isabelle
failed for index 873 saporta
failed for index 936 honnêteté
failed for index 936 reconnaître
failed for index 1189 fœtus
failed for index 1189 fœtus
failed for index 2735 omar
failed for index 2735 bongo
failed for index 2881 élu
failed for index 2881 francais
failed for index 2881 crédules
failed for index 3602 otanienne
failed for index 4997 biscuit
failed for index 4997 petits
failed for index 4997 palmiers
failed for index 5040 guerre
failed for index 6209 æterna
failed for index 6249 oluþôt
failed for index 6805 path
failed for index 6805 dependency
failed for index 6860 louis
failed for index 6860 louis
failed for index 6860 louis
failed for index 6860 louis
failed for index 6860 vote
failed for index 6860 for
failed for index 6860 at
failed for index 6860 the
failed for index 7342 ki
failed for index 7342 salit
failed for index 7724 vote
failed for index 8139

In [12]:
embeddings[0]

array([[ 0.18788284, -0.53500384, -1.7558887 ,  0.60100561, -0.38633257,
         0.23139137, -0.81437612,  0.64297569,  0.07032457,  0.04773977,
        -1.05327392, -0.0424287 ,  0.43430412, -0.98197538, -0.49964917,
         1.69865572],
       [ 0.37392178,  1.64002812, -0.06540881, -0.55984223, -0.75945103,
        -0.1807459 , -0.29696968,  0.35440907, -0.49492684,  0.54753685,
         0.79764587, -0.10759757,  0.36597726, -0.04434799, -0.58821917,
        -0.47940016],
       [-0.75858444, -8.82446861, -0.34861863, -2.29582238, -4.80535126,
        -1.32018757,  0.39596182, -1.58035338,  4.46426964,  2.9423492 ,
        -0.98106194,  1.85818172,  0.94007683, -0.51078355,  0.05281067,
         0.23841706],
       [ 0.39758903, -4.41240692,  4.40601683,  1.20621014, -2.23746324,
         4.38292217, -0.10465592,  1.60354233, -0.36593169, -0.71249837,
         2.4204514 , -1.48627484, -2.10400677,  1.09220123, -2.723629  ,
         1.4166882 ],
       [ 3.80756712,  2.06414962,  0

In [13]:
print(embeddings.shape)

(353969, 25, 16)


In [14]:
np.save("data/train_emb_matrix.npy", embeddings)

In [15]:
eval_data = pd.read_csv("data/evaluation.csv")

In [16]:
embeddings = np.zeros((eval_data.shape[0], max_words, pca_dim))

In [17]:
for idx in range(eval_data.shape[0]):
    sentence_embeddings = np.zeros((max_words, pca_dim))
    if idx % 10000 == 0:
        print(idx)
    for i, s in enumerate(eval_data.loc[idx, "text"].split()):
        try:
            sentence_embeddings[i, :] = my_pca.transform(model.wv[unidecode(s).lower()].reshape(1,-1))
        except:
            print("failed for index", idx)
    embeddings[idx] = sentence_embeddings

0
failed for index 7
failed for index 22
failed for index 38
failed for index 43
failed for index 44
failed for index 47
failed for index 47
failed for index 47
failed for index 47
failed for index 67
failed for index 78
failed for index 92
failed for index 93
failed for index 100
failed for index 112
failed for index 119
failed for index 138
failed for index 138
failed for index 138
failed for index 138
failed for index 176
failed for index 178
failed for index 193
failed for index 193
failed for index 193
failed for index 193
failed for index 195
failed for index 206
failed for index 222
failed for index 224
failed for index 229
failed for index 237
failed for index 253
failed for index 258
failed for index 258
failed for index 258
failed for index 259
failed for index 259
failed for index 280
failed for index 306
failed for index 316
failed for index 317
failed for index 323
failed for index 334
failed for index 337
failed for index 342
failed for index 342
failed for index 345
fail

In [18]:
np.save("data/eval_emb_matrix.npy", embeddings)

In [20]:
embeddings[1]

array([[ 2.25569367,  5.48288345, -2.60713315, -2.66803408, -3.15894294,
        -2.84726071,  0.75954908, -0.14912188, -0.4860276 , -0.64536434,
        -3.39429212, -1.68976998, -1.11796153, -2.40675831, -0.15898365,
         2.0557487 ],
       [ 2.84619069, -0.27029148, -0.25492272, -0.40458643, -0.62610298,
        -0.60682213,  0.03081203, -0.92525983, -1.1015954 , -3.33018708,
        -4.00231981,  0.89752948, -0.71969748,  0.14483207,  1.34603834,
         1.97560024],
       [ 0.56014609,  1.85157883, -0.0111733 , -0.77376455, -0.72682309,
         0.15696414, -0.41426504,  0.27449101, -0.20558165,  0.28410533,
         0.6031878 ,  0.0068297 ,  0.43076789, -0.06681381, -0.44297981,
        -0.49797654],
       [ 0.44780126,  1.54743886,  0.04196108, -0.65444654, -0.74868774,
        -0.21322222, -0.27224371,  0.42142701, -0.53246266,  0.46564788,
         0.86228007, -0.04881259,  0.23629555, -0.11557583, -0.54665369,
        -0.41208011],
       [ 0.        ,  0.        ,  0