In [8]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
from itertools import islice
import numpy as np
import nltk
import pickle
import sys

In [2]:
# Read the preprocessed train and test splits from CSV in one pass.
train_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset_train_pp.csv", "dataset_test_pp.csv")
)

In [3]:
# Row counts of the train and test frames, one per line.
print(len(train_set), len(test_set), sep="\n")

50000
5000


In [4]:
# Split each frame into its input text ("Description") and its label ("Class Index").
train_x, train_y = train_set["Description"], train_set["Class Index"]
test_x, test_y = test_set["Description"], test_set["Class Index"]

In [6]:
%%time
# Load the pretrained word2vec vectors from the binary file; `limit` caps how
# many vectors are read from it.
wv = gensim.models.KeyedVectors.load_word2vec_format(
    fname="GoogleNews-vectors-negative300.bin.gz",
    binary=True,
    limit=1000000,
)

CPU times: user 28.6 s, sys: 1.81 s, total: 30.4 s
Wall time: 34.1 s


In [10]:
# Precompute the L2-normalized word vectors. With replace=True the model
# becomes effectively read-only: you can still call most_similar(),
# similarity(), etc., but you cannot continue training afterwards.
wv.init_sims(replace=True)

In [11]:
# Peek at ten vocabulary entries (positions 10 through 19).
[word for word in islice(wv.vocab, 10, 20)]

['recipes',
 'Gordon_Engelhardt',
 'Alan_Sayre',
 'newly_christened',
 'dunking_sensation',
 'whimsicality',
 'Kortrijk',
 'Environment_Rating_CBBER',
 'Koukalova',
 'Woodburn_Ind.']

In [12]:
def tokenizer(text):
    """Return the word tokens of ``text`` as one flat list.

    The text is first split into sentences with ``nltk.sent_tokenize`` and
    each sentence is then split into words with ``nltk.word_tokenize``
    (both called with ``language='english'``). Tokens are returned in
    reading order.
    """
    return [
        word
        for sentence in nltk.sent_tokenize(text, language='english')
        for word in nltk.word_tokenize(sentence, language='english')
    ]

In [13]:
%%time
# Tokenize every description (test split first, then train) into lists of tokens.
test_x_t, train_x_t = (
    descriptions.apply(tokenizer).tolist()
    for descriptions in (test_x, train_x)
)

CPU times: user 43.7 s, sys: 190 ms, total: 43.9 s
Wall time: 44 s


In [16]:
# wv.vectors_norm (or wv.syn0norm): unit-normalized vectors of the vocabulary words.
# gensim.matutils.unitvec: scales a vector to unit length.
def word_averaging(wv, words_list):
    """Embed each tokenized document as the unit-length mean of its word vectors.

    Parameters
    ----------
    wv : object exposing ``vocab`` (token -> entry with ``.index``) and
        ``vectors_norm`` (2-D array of normalized word vectors).
    words_list : iterable of token lists, one list per document.

    Returns
    -------
    numpy.ndarray of dtype float32 with one row per document and as many
    columns as ``wv.vectors_norm`` has. Documents with no tokens, or with only
    out-of-vocabulary tokens, map to the all-zero row.

    Notes
    -----
    * Out-of-vocabulary tokens contribute a zero vector to the mean (they are
      not skipped) and are counted; the total is printed.
    * NOTE(review): ``abs()`` is applied to the mean before scaling, which
      discards the sign of every component. Presumably this is meant to give
      non-negative features for a downstream model - confirm before changing.
    """
    # Take the dimensionality from the model instead of hard-coding 300, so
    # the zero vectors always match the real word vectors.
    dim = wv.vectors_norm.shape[1]
    zero_vector = np.zeros(dim)

    avg_all = []
    unk = 0
    for words in words_list:
        norm_words = []
        for token in words:
            if token in wv.vocab:
                norm_words.append(wv.vectors_norm[wv.vocab[token].index])
            else:
                norm_words.append(zero_vector)
                unk += 1

        # An empty document has nothing to average: mean() over an empty list
        # would produce a NaN scalar rather than a row, so use the zero vector.
        if norm_words:
            mean_vector = np.array(norm_words).mean(axis=0)
        else:
            mean_vector = zero_vector

        avg_words = gensim.matutils.unitvec(abs(mean_vector)).astype(np.float32)
        avg_all.append(avg_words)
    print("No. of unknown words:", unk)
    # reshape keeps the result 2-D (0, dim) even when words_list is empty.
    return np.array(avg_all, dtype=np.float32).reshape(-1, dim)

In [17]:
%%time
# Averaged word2vec features for the tokenized test descriptions.
w2v_test_x = word_averaging(wv=wv, words_list=test_x_t)

No. of unknown words: 26086
CPU times: user 1.72 s, sys: 7.89 ms, total: 1.73 s
Wall time: 1.73 s


In [18]:
%%time
# Averaged word2vec features for the tokenized training descriptions.
w2v_train_x = word_averaging(wv=wv, words_list=train_x_t)

No. of unknown words: 261172
CPU times: user 15.9 s, sys: 98.9 ms, total: 16 s
Wall time: 16 s


#### Save the w2v matrices to file

In [19]:
# Persist both feature matrices as .npy files.
np.save(file="w2v_train_x.npy", arr=w2v_train_x)
np.save(file="w2v_test_x.npy", arr=w2v_test_x)

In [None]:
# Reload the saved test features under the name the following cells read
# (w2v_test_x), so this cell can stand in for recomputing them.
w2v_test_x = np.load("w2v_test_x.npy")
# Backward-compatible alias: this cell originally bound the array to a `tfidf_` name.
tfidf_test_x = w2v_test_x

In [None]:
# Reload the saved training features under the name they were computed with
# (w2v_train_x), so this cell can stand in for recomputing them.
w2v_train_x = np.load("w2v_train_x.npy")
# Backward-compatible alias: this cell originally bound the array to a `tfidf_` name.
tfidf_train_x = w2v_train_x

In [20]:
# Display the test feature matrix (the repr is abbreviated with "..." for large arrays).
w2v_test_x

array([[0.03507248, 0.02853259, 0.02540481, ..., 0.06819418, 0.00254362,
        0.01496225],
       [0.02973051, 0.04870571, 0.01827599, ..., 0.02830017, 0.00388521,
        0.01688914],
       [0.02775904, 0.01455307, 0.00308727, ..., 0.05806882, 0.05713799,
        0.08039328],
       ...,
       [0.00198855, 0.00832901, 0.00993463, ..., 0.00039381, 0.0263018 ,
        0.03657723],
       [0.00155745, 0.06449332, 0.08638214, ..., 0.01972472, 0.0587306 ,
        0.03280148],
       [0.04483974, 0.06736824, 0.00223428, ..., 0.02030664, 0.06246478,
        0.00327823]], dtype=float32)

In [21]:
# Sanity-check the test feature matrix: container type, row count, row length, shape.
print(
    type(w2v_test_x),
    len(w2v_test_x),
    len(w2v_test_x[0]),
    w2v_test_x.shape,
    sep="\n",
)

<class 'numpy.ndarray'>
5000
300
(5000, 300)
