In [1]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
from itertools import islice
import numpy as np
import nltk
import pickle
import sys

In [2]:
# Load the train and test splits from CSV files in the working directory.
# NOTE(review): the "_pp" suffix presumably means the text was already preprocessed — confirm
# against whatever produced these files.
train_set = pd.read_csv("dataset_train_pp.csv")
test_set = pd.read_csv("dataset_test_pp.csv")

In [3]:
# Sanity check: number of rows in each split.
print(len(train_set))
print(len(test_set))

50000
5000


In [4]:
# Separate the model input from the label.
# Input: the "Description" text column of each split.
train_x=train_set["Description"]
test_x=test_set["Description"]

# Label: the "Class Index" column of each split.
train_y=train_set["Class Index"]
test_y=test_set["Class Index"]

In [5]:
def tokenizer(text):
    """Split ``text`` into a flat list of word tokens.

    The text is first segmented into sentences with ``nltk.sent_tokenize`` and
    every sentence is then split with ``nltk.word_tokenize`` (both called with
    ``language='english'``). Tokens are returned in document order as a single
    list; sentence boundaries are not kept.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
    ]

In [6]:
%%time
# Tokenize every description and convert the result to a plain Python list of
# token lists (one inner list per row). The test split is processed first.
test_x_t = test_x.apply(tokenizer).tolist()
train_x_t = train_x.apply(tokenizer).tolist()

CPU times: user 46 s, sys: 280 ms, total: 46.3 s
Wall time: 46.5 s


In [7]:
# The corpus is the tokenized training split only; the commented-out line below is
# the alternative that would also include the test split.
# corpus = train_x_t + test_x_t
corpus = train_x_t

In [8]:
# Total number of tokens across all documents in the corpus.
corpus_len = sum(map(len, corpus))
corpus_len

3648948

In [15]:
# Dimensionality of the word vectors (passed to Word2Vec as `size`).
vec_size = 300

In [16]:
%%time
# Train word2vec on the tokenized corpus with vec_size-dimensional vectors,
# window=5 and min_count=3.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm the gensim version this notebook is pinned to.
wv = Word2Vec(corpus, size=vec_size, window=5, min_count=3)

CPU times: user 3min 4s, sys: 555 ms, total: 3min 5s
Wall time: 1min 4s


In [17]:
# Rough size of the trained model: pickle it to a bytes object and report that
# object's size as given by sys.getsizeof, divided by 1000.
p = pickle.dumps(wv)
memoryKB = sys.getsizeof(p)/1000
print(memoryKB)

89994.213


In [33]:
# Persist whatever `wv` is currently bound to.
# NOTE(review): this cell's execution count (33) is later than that of the cells
# below which rebind `wv` to `wv.wv` (26) and call init_sims(replace=True) (27),
# so the file written in that session may not match what a top-to-bottom run
# would save — confirm which object is meant to be on disk.
wv.save("w2v_115.model")

In [26]:
# Keep only the model's `.wv` attribute; from this cell on the name `wv` no longer
# refers to the object returned by Word2Vec(...).
wv = wv.wv

In [27]:
# Precompute L2-normalized vectors. With replace=True you cannot continue training afterwards:
# the model becomes effectively read-only — most_similar(), similarity(), etc. still work, but training does not.
wv.init_sims(replace=True)

In [28]:
# Peek at ten vocabulary entries: items 10 through 19 in wv.vocab's iteration order.
list(islice(wv.vocab, 10, 20))

['uvc',
 'maui',
 'domachowska',
 'smb',
 'occupancy',
 'lastditch',
 'collapse',
 'legendary',
 'lastweek',
 'englishlanguage']

In [29]:
# wv.vectors_norm (older gensim: wv.syn0norm): unit-normalized vectors of the vocabulary words,
# one row per word; a word's row is found through wv.vocab[token].index.
# gensim.matutils.unitvec: scales a vector to unit length.
def word_averaging(wv, words_list):
    """Embed each tokenized document as the rescaled mean of its word vectors.

    Parameters
    ----------
    wv : gensim KeyedVectors-like object
        Must expose ``vocab`` (token -> entry with an ``index`` attribute),
        ``vectors_norm`` (rows of normalized word vectors) and ``vector_size``.
        NOTE(review): ``vectors_norm`` is expected to be populated already
        (e.g. by ``wv.init_sims()``) — confirm at the call site.
    words_list : iterable of sequences of str
        One token sequence per document.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(number of documents, wv.vector_size)``.

    Notes
    -----
    * Tokens missing from ``wv.vocab`` contribute a zero vector to the mean and
      are counted; the total is printed once at the end.
    * The element-wise absolute value of the mean is taken before rescaling to
      unit length, so every returned feature is non-negative (sign information
      of the averaged embedding is discarded).
    * The vector width comes from ``wv.vector_size`` rather than a notebook
      global, so the function works with any embedding passed in.
    * A document with no tokens maps to an all-zero row instead of NaN, and an
      empty ``words_list`` yields an array of shape ``(0, wv.vector_size)``.
    """
    dim = wv.vector_size
    oov_vector = np.zeros(dim)  # shared placeholder; np.array() below copies it
    avg_all = []
    unk = 0
    for words in words_list:
        norm_words = []
        for token in words:
            if token in wv.vocab:
                norm_words.append(wv.vectors_norm[wv.vocab[token].index])
            else:
                norm_words.append(oov_vector)
                unk += 1

        if not norm_words:
            # Nothing to average: mean() of an empty array would be NaN.
            avg_all.append(np.zeros(dim, dtype=np.float32))
            continue

        mean_vec = abs(np.array(norm_words).mean(axis=0))
        avg_all.append(gensim.matutils.unitvec(mean_vec).astype(np.float32))
    print("No. of unknown words:", unk)
    # reshape keeps the result 2-D even when there are no documents at all
    return np.array(avg_all, dtype=np.float32).reshape(-1, dim)

In [30]:
%%time
# Build the document-embedding matrix for the tokenized test split.
w2v_test_x = word_averaging(wv, test_x_t)

No. of unknown words: 8451
CPU times: user 1.59 s, sys: 4.06 ms, total: 1.6 s
Wall time: 1.6 s


In [31]:
%%time
# Build the document-embedding matrix for the tokenized training split.
w2v_train_x = word_averaging(wv, train_x_t)

No. of unknown words: 57080
CPU times: user 13.6 s, sys: 15.8 ms, total: 13.7 s
Wall time: 13.7 s


#### save w2v matrix to file

In [32]:
# Write the train and test document-embedding matrices to .npy files in the working directory.
np.save("w2v_train_x_115.npy", w2v_train_x)
np.save("w2v_test_x_115.npy", w2v_test_x)

In [None]:
# Reload the test matrix from the file the np.save cell above writes
# ("w2v_test_x_115.npy"); the un-suffixed name is never written by this notebook.
# NOTE(review): despite the `tfidf_` prefix this holds word2vec document
# embeddings; the variable name is kept so any later cell reading it still works.
tfidf_test_x = np.load("w2v_test_x_115.npy")

In [None]:
# Reload the training matrix from the file the np.save cell above writes
# ("w2v_train_x_115.npy"); the un-suffixed name is never written by this notebook.
# NOTE(review): despite the `tfidf_` prefix this holds word2vec document
# embeddings; the variable name is kept so any later cell reading it still works.
tfidf_train_x = np.load("w2v_train_x_115.npy")

In [30]:
# Display the test embedding matrix (numpy abbreviates large arrays in its repr).
w2v_test_x

array([[0.0583921 , 0.12589933, 0.04027877, ..., 0.08168618, 0.03448169,
        0.08376984],
       [0.05356431, 0.08018451, 0.01231005, ..., 0.10732461, 0.02249216,
        0.05081633],
       [0.1155323 , 0.11814706, 0.06095942, ..., 0.13833633, 0.02144026,
        0.10468481],
       ...,
       [0.02837246, 0.03734729, 0.02315114, ..., 0.06863417, 0.05062385,
        0.09718889],
       [0.01824571, 0.09912425, 0.01130349, ..., 0.06550372, 0.04749638,
        0.08120343],
       [0.01502722, 0.04912486, 0.0316234 , ..., 0.14490038, 0.06974927,
        0.14227726]], dtype=float32)

In [31]:
# Inspect the test matrix: container type, number of rows, length of the first row, and full shape.
print(type(w2v_test_x))
print(len(w2v_test_x))
print(len(w2v_test_x[0]))
print(w2v_test_x.shape)

<class 'numpy.ndarray'>
5000
300
(5000, 300)
