## From D11_full_attempt

In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

import os
import sys
np.set_printoptions(threshold=sys.maxsize)

from time import time

from gensim.models import Word2Vec
from utils import meaning_test
                          

t=time()

# READ IN REVIEWS
print('Loading Dataset...')
E_tokened = pd.read_json('data/Electronics_tokenized_2.json')
print('Dataset Loaded: ', round(time()-t,2),'s')

processed_corpus = E_tokened.tokened

print('Training Word Embeddings...', round(time()-t,2),'s')

# # mc3, sz300, iter10, maxvocab10k
# w2v = Word2Vec(processed_corpus, # input tokenized text
#                  min_count=5,   # Ignore words that appear less than this
#                  size=300,      # Dimensionality of word embeddings
#                  workers=99,     # Number of processors (parallelisation)
#                  window=6,     # Context window for words during training
#                  max_vocab_size= 40*1000, #
#                  compute_loss=True,
#                  iter=10)       

w2v = Word2Vec.load('models/E_w2v_gensim10_2.model')
print('Trained', round(time()-t,2),'s')

print('# reviews: ', w2v.corpus_count)
print('Total words: ', w2v.corpus_total_words)
print('Latest Training Loss: ', w2v.get_latest_training_loss())
print('Vocab size: ', len(w2v.wv.vocab))

# w2v.save('models/E_w2v_gensim10_2.model')
print('Saved', round(time()-t,2),'s')


print('For fun, and for to test the word embeddings....')
print('Words most similar to *windows*.')
print(meaning_test(['windows'], None,  w2v))
print()
print('Words most similar to *Mac*.')
print(meaning_test(['mac'], None,  w2v))
print()
print('Words most similar to *cat*.')
print(meaning_test('cat', None, w2v))
print()
print('Words most similar to *father*.')
print(meaning_test('father', None, w2v))
print()
print('father + mother - boy')
print(meaning_test(['father','mother'],['boy'],w2v))
print()
print("Thanks :D. You're ready to run the main program!")

Loading Dataset...
Dataset Loaded:  76.93 s
Training Word Embeddings... 76.93 s
Trained 77.61 s
# reviews:  2836144
Total words:  81666138
Latest Training Loss:  12094783.0
Vocab size:  18053
Saved 77.61 s
For fun, and for to test the word embeddings....
Words most similar to *windows*.
    words          similarity
0   winxp  0.6862816214561462
1   vista  0.6787523031234741
2  ubuntu  0.6680481433868408
3      os  0.6254222989082336
4   linux  0.6235475540161133

Words most similar to *Mac*.
          words          similarity
0          imac   0.687383770942688
1          macs  0.6585511565208435
2     mavericks  0.6501476764678955
3  snow_leopard  0.6488965153694153
4           osx   0.646578311920166

Words most similar to *cat*.
    words          similarity
0     utp  0.6351267695426941
1    gnaw  0.5743775963783264
2  kitten  0.5678672790527344
3   kitty  0.5519692301750183
4  baluns  0.5317671895027161

Words most similar to *father*.
    words          similarity
0  mother  0.

# <br>
# Validation : ngrams

### After:

In [3]:
t = time()
print(time()-t,'s')
vocabW = np.array(sorted(list(w2v.wv.vocab.keys())))
print(time()-t,'s')
Wbigrams = np.array(list(set([str(gram) for gram in vocabW if str(gram).count('_') > 0])))
Wtrigrams = np.array(list(set([str(gram) for gram in vocabW if str(gram).count('_') > 1])))
print(time()-t,'s')
len(Wbigrams), len(Wtrigrams)

4.649162292480469e-05 s
0.01406240463256836 s
0.04240608215332031 s


(440, 5)

In [4]:
(616, 22) # 1, 50k
(532, 5) # 3 , 50k
(33, 0) # 3 10k
(440, 5) # 5, 40k

(440, 5)

In [5]:
Wbigrams

array(['howard_stern', 'daft_punk', 'homeworx_hw', 'lenovo_yoga',
       'weed_eater', 'ali_julia', 'god_forbid', 'supra_aural',
       'parrot_ziks', 'ogg_vorbis', 'bic_acoustech', 'time_set',
       'pete_sake', 'vault_ceilings', 'kinivo_bn', 'jabra_revo',
       'nintendo_ds', 'skull_crushers', 'deadbolt_banana', 'el_producto',
       'dell_xps', 'adversely_affect', 'mech_mod', 'pacific_rim',
       'vice_versa', 'ge_superadio', 'bang_olufsen', 'honda_civic',
       'winegard_lna', 'frank_sinatra', 'asus_xonar', 'schiit_asgard',
       'american_idol', 'lc_le', 'anodize_aluminum', 'as_li_tl',
       'braven_brv', 'toyota_tacoma', 'dd_wrt', 'roomba_roomba',
       'harmon_kardon', 'neodymium_magnets', 'str_dh', 'noontec_zoro',
       'yst_sw', 'slam_dunk', 'shoe_lace', 'faux_leather',
       'soundmatters_foxl', 'droid_razr', 'bq_cc', 'ibasso_dx',
       'isport_victory', 'honda_odyssey', 'bach_toccata', 'sc_hc',
       'lo_recomiendo', 'pvc_pipe', 'elliptical_trainer',
       'rocke

### Before: 

In [6]:
ngrams = []
for review in processed_corpus:
    for tok in review:
        if '_' in tok:
            ngrams.append(tok)
            
len(np.array(list(set(ngrams)))), np.array(list(set(ngrams)))

from utils import kw2counts
counts = kw2counts(processed_corpus)
ngram_counts = np.array([tup for tup in counts if '_' in tup[0]])
ngram_counts.shape, ngram_counts

((1710, 2), array([['skull_candy', '4827'],
        ['altec_lansing', '4221'],
        ['http_www', '4219'],
        ['timely_manner', '3492'],
        ['dr_dre', '3332'],
        ['tripp_lite', '3038'],
        ['swag_swag', '2987'],
        ['harman_kardon', '2767'],
        ['wal_mart', '2639'],
        ['mohu_leaf', '2264'],
        ['tx_nr', '2111'],
        ['boston_acoustics', '1726'],
        ['harmon_kardon', '1605'],
        ['ie_utf', '1570'],
        ['airport_express', '1516'],
        ['strain_relief', '1516'],
        ['co_workers', '1329'],
        ['li_ion', '1297'],
        ['turtle_beach', '1291'],
        ['lithium_ion', '1288'],
        ['raspberry_pi', '1278'],
        ['fiber_optic', '1219'],
        ['vice_versa', '1198'],
        ['ni_mh', '1184'],
        ['martin_logan', '1160'],
        ['college_student', '1145'],
        ['sol_republic', '1123'],
        ['sine_wave', '1104'],
        ['videoid_mo', '1093'],
        ['verizon_fios', '1080'],
        ['ciga

In [None]:
# # perform another 11 epochs of training
# t = time()
# w2v.train(processed_corpus, epochs=12, total_examples=w2v_gensim.corpus_count)
# w2v.save('saved_data_models/D11E_w2v_gensim13_nostem.model')
# print( time()-t,'s')