In [None]:
import csv
import numpy as np
import pandas as pd

# Word2vec
from gensim.models import Word2Vec
from gensim.utils import tokenize
from unidecode import unidecode

In [None]:
# Sentence-length budget: per the dataset inspection, no sentence exceeds 40
# words and almost all are shorter than 20, so 20 word slots are kept per sentence.
max_words = 20

# Width of the raw word-embedding vectors.
n_dim = 80

# Width of the word vectors after PCA reduction.
pca_dim = 16

In [None]:
# Load the training split from disk into a DataFrame.
train_data = pd.read_csv(filepath_or_buffer="data/train.csv")

In [None]:
# Tokenise every training text with gensim's tokenizer (lower-cased, accents
# removed); each text becomes a list of tokens.
sentences = [
    list(tokenize(raw_text, lower=True, deacc=True))
    for raw_text in train_data['text']
]
# Quick sanity check of the first few tokenised sentences.
print(sentences[:5])

In [None]:
# Word2Vec model producing n_dim-wide vectors; min_count=1 means no token is
# dropped from the vocabulary for being rare.
model = Word2Vec(
    vector_size=n_dim,
    window=20,
    min_count=1,
    workers=7,
)
# First pass builds the vocabulary, then the model is trained for 5 epochs on
# the same tokenised sentences.
model.build_vocab(corpus_iterable=sentences)
model.train(corpus_iterable=sentences, total_examples=model.corpus_count, epochs=5)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Gather the data the PCA is fitted on: the Word2Vec vector of every
# in-vocabulary word occurrence in the training text (all words of every
# sentence, not only the first max_words).
all_embeddings = []
for text in train_data["text"]:
    for word in text.split():
        token = unidecode(word).lower()
        # Skip only genuinely out-of-vocabulary tokens instead of swallowing
        # every possible error with a bare except.
        # NOTE(review): the vocabulary was built with gensim's tokenize(), while
        # this lookup uses whitespace splitting + unidecode, so words with
        # attached punctuation (e.g. "word,") may be skipped here — confirm
        # whether that is intended.
        if token in model.wv:
            all_embeddings.append(model.wv[token])

In [None]:
# Combine the collected per-word vectors into a single 2-D array
# (one row per word occurrence) and report its size.
all_embeddings = np.stack(all_embeddings)
print(all_embeddings.shape)

In [None]:
# Fit a PCA that projects word vectors down to pca_dim components.
my_pca = PCA(n_components=pca_dim)
my_pca.fit(X=all_embeddings)

In [None]:
# Zero-initialised output tensor for the training set:
# one (max_words, pca_dim) matrix per row of train_data.
embeddings = np.zeros(shape=(len(train_data), max_words, pca_dim))

In [None]:
# Fill the training tensor. Row `idx` holds the PCA-reduced Word2Vec vectors of
# the first `max_words` whitespace-separated words of sentence `idx`; slots for
# out-of-vocabulary words and unused trailing slots stay zero.
for idx, text in enumerate(train_data["text"]):
    sentence_embeddings = np.zeros((max_words, pca_dim))
    if idx % 10000 == 0:
        print(idx)  # coarse progress indicator
    slots, vectors = [], []
    # Truncate explicitly: a sentence longer than max_words used to index past
    # the end of the buffer, and the resulting IndexError was swallowed by a
    # bare except that reported every extra word as a failure.
    for i, s in enumerate(text.split()[:max_words]):
        token = unidecode(s).lower()
        if token in model.wv:
            slots.append(i)
            vectors.append(model.wv[token])
        else:
            # Out-of-vocabulary word: its slot keeps the zero vector.
            # NOTE(review): the vocabulary was built with gensim's tokenize(),
            # so words with attached punctuation may land here — confirm
            # whether the lookup should use the same tokenizer.
            print("failed for index", idx, s)
    if vectors:
        # One PCA projection per sentence instead of one call per word.
        sentence_embeddings[slots] = my_pca.transform(np.stack(vectors))
    embeddings[idx] = sentence_embeddings

In [None]:
# Inspect the embedded matrix of the first training sentence.
embeddings[0, :, :]

In [None]:
# Report the dimensions of the training embedding tensor.
print(np.shape(embeddings))

In [None]:
# Persist the training embedding tensor as a .npy file.
np.save(file="data/train_emb_matrix.npy", arr=embeddings)

In [None]:
# Load the evaluation split from disk into a DataFrame.
eval_data = pd.read_csv(filepath_or_buffer="data/evaluation.csv")

In [None]:
# Zero-initialised output tensor for the evaluation set:
# one (max_words, pca_dim) matrix per row of eval_data.
# Note: this rebinds `embeddings`, replacing whatever it held before.
embeddings = np.zeros(shape=(len(eval_data), max_words, pca_dim))

In [None]:
# Fill the evaluation tensor. Row `idx` holds the PCA-reduced Word2Vec vectors
# of the first `max_words` whitespace-separated words of sentence `idx`; slots
# for out-of-vocabulary words and unused trailing slots stay zero.
for idx, text in enumerate(eval_data["text"]):
    sentence_embeddings = np.zeros((max_words, pca_dim))
    if idx % 10000 == 0:
        print(idx)  # coarse progress indicator
    slots, vectors = [], []
    # Truncate explicitly: a sentence longer than max_words used to index past
    # the end of the buffer, and the resulting IndexError was swallowed by a
    # bare except that reported every extra word as a failure.
    for i, s in enumerate(text.split()[:max_words]):
        token = unidecode(s).lower()
        if token in model.wv:
            slots.append(i)
            vectors.append(model.wv[token])
        else:
            # Word unseen by the Word2Vec model: its slot keeps the zero vector.
            print("failed for index", idx)
    if vectors:
        # One PCA projection per sentence instead of one call per word.
        sentence_embeddings[slots] = my_pca.transform(np.stack(vectors))
    embeddings[idx] = sentence_embeddings

In [None]:
# Persist the evaluation embedding tensor as a .npy file.
np.save(file="data/eval_emb_matrix.npy", arr=embeddings)

In [None]:
# Inspect the embedded matrix of the second evaluation sentence.
embeddings[1, :, :]