In [1]:
from operator import add

import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import gensim

# Word2Vec

In this notebook, we will train a word2vec model and save the processed data to avoid having to run this every time.

We will use 300-dimensional vectors, the feature size we established in the ComparingPretrainedModels notebook.

Steps:

1. Model training
2. For each row of the dataset, we will pass every token to the model to obtain its vector
3. Once all the tokens have been converted to vectors, we will average those vectors to obtain a single vector representation of the text

The train and test indices will be saved so that the same split can be reused in the ComparingPretrainedModels notebook. This is explained in more detail in that notebook.

In [2]:
# Read the preprocessed dataset and discard every row that has a missing value.
# Done as a single chained assignment so re-running the cell always starts from the file.
df = pd.read_csv('processed_text_with_all.csv').dropna()

In [3]:
# Hold out 15% of the rows for testing; random_state is fixed so the split is repeatable.
# NOTE(review): no `stratify=` argument is passed, so the class balance of the two
# sets is not enforced by the split itself -- it is only checked in the cells below.
X_train, X_test, y_train, y_test = train_test_split(
    df.Text,
    df.Labels,
    test_size=0.15,
    random_state=99,
)

The class proportions are nearly identical in the train and test sets, as the next two cells show:

In [4]:
# Share of each label in the training set (per-label count divided by the number of rows).
y_train.value_counts().div(len(y_train))

1.0    0.843087
0.0    0.156913
Name: Labels, dtype: float64

In [5]:
# Share of each label in the test set (per-label count divided by the number of rows).
y_test.value_counts().div(len(y_test))

1.0    0.843359
0.0    0.156641
Name: Labels, dtype: float64

In [None]:
# Tokenise every training text; the resulting Series of token lists is the corpus
# handed to Word2Vec (both build_vocab and train iterate over it).
sentences = X_train.apply(word_tokenize)

In [None]:
# Configure the model. vector_size=300 follows the feature count chosen in the
# introduction of this notebook; the remaining values are the training
# hyper-parameters used for this run.
w2v = Word2Vec(
    vector_size=300,
    window=10,
    min_count=1,
    negative=20,
    sample=6e-5,
    alpha=0.03,
    min_alpha=0.0007,
)
# Build the vocabulary from the tokenised training texts, then train on that same corpus.
w2v.build_vocab(sentences, progress_per=100)
# Kept as the last expression so the cell displays the tuple returned by train().
w2v.train(sentences, total_examples=w2v.corpus_count, epochs=30, report_delay=1)

(143477739, 308985210)

In [None]:
# Persist the trained model to disk; the same file name is loaded again further down
# before the texts are converted to vectors.
w2v.save("w2v_30epochs_window10_size300.model")

In [17]:
def word2vec_get_vector(w2v, word):
    """Return the embedding of ``word``, or a zero vector if the model does not know it.

    Parameters
    ----------
    w2v
        Model exposing ``wv.get_vector`` and ``wv.vector_size`` (in this
        notebook, a gensim ``Word2Vec`` instance).
    word : str
        Token to look up.

    Returns
    -------
    The value returned by ``w2v.wv.get_vector(word)`` when the lookup succeeds;
    otherwise a list of ``w2v.wv.vector_size`` zeros.
    """
    try:
        return w2v.wv.get_vector(word)
    except KeyError:
        # Size the fallback from the `w2v` argument itself so the function does not
        # depend on any notebook-level variable being defined (or being the same model).
        return [0] * w2v.wv.vector_size


def ind_sentence_process(sentence, w2v):
    """Average the word vectors of one tokenised sentence into a single vector.

    Parameters
    ----------
    sentence : sequence of str
        Tokens of one text.
    w2v
        Model passed through to ``word2vec_get_vector`` for each token; its
        ``wv.vector_size`` is also used to size the result for empty input.

    Returns
    -------
    list
        Element-wise mean of the token vectors. For an empty ``sentence`` a list
        of ``w2v.wv.vector_size`` zeros is returned, so every row keeps the same
        length when the results are stacked into a DataFrame.
    """
    token_count = len(sentence)
    if token_count == 0:
        # A text can tokenise to nothing (e.g. whitespace only); indexing
        # sentence[0] would raise IndexError, so fall back to the zero vector.
        return [0.0] * w2v.wv.vector_size

    # Accumulate the element-wise sum token by token, starting from the first vector.
    total = word2vec_get_vector(w2v, sentence[0])
    for word in sentence[1:]:
        total = [acc + value for acc, value in zip(total, word2vec_get_vector(w2v, word))]
    return [component / token_count for component in total]

In [18]:
def text_to_vec(X, model):
    """Convert a Series of texts into a DataFrame of averaged word vectors.

    Each element of ``X`` is tokenised with ``word_tokenize`` and reduced to one
    vector by ``ind_sentence_process`` using ``model``.

    Returns a DataFrame with one row per element of ``X`` and one column per
    vector component. The frame is built from a plain list, so it carries a
    fresh default index rather than the index of ``X``.
    """
    tokenised = X.apply(word_tokenize)
    sentence_vectors = tokenised.apply(ind_sentence_process, w2v=model)
    return pd.DataFrame(sentence_vectors.tolist())


def text_to_vec_workflow(data_dict, model):
    """Vectorise every dataset in ``data_dict`` and write each result to CSV.

    Parameters
    ----------
    data_dict : dict
        Maps a dataset name (used in the output file name) to the Series of
        texts to vectorise.
    model
        Model forwarded to ``text_to_vec``.

    Returns
    -------
    dict
        Maps each dataset name to its vector DataFrame, indexed like the
        corresponding input Series.

    Side effect: writes ``w2v_window10_<name>_data.csv`` for every entry.
    """
    vectorised = {}
    for dataset_type in data_dict:
        texts = data_dict[dataset_type]
        vectors = text_to_vec(texts, model)
        # Restore the original row labels so the vectors can be aligned with the source data.
        vectors.index = texts.index
        vectors.to_csv(f'w2v_window10_{dataset_type}_data.csv')
        vectorised[dataset_type] = vectors
    return vectorised

In [19]:
# Load the Word2Vec model from the file written by the save cell above.
model = Word2Vec.load("w2v_30epochs_window10_size300.model")

# Datasets to vectorise, keyed by the name used in the output CSV file names.
# Only the train and test splits are processed; no validation split is included.
data_dict = {
    'train': X_train,
    'test': X_test,
}
data_wv_dict = text_to_vec_workflow(data_dict, model)