## Converting tokenized reviews to word embeddings

In [13]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

import os
import sys
np.set_printoptions(threshold=sys.maxsize)

from time import time

from gensim.models import Word2Vec
from utils import meaning_test
                          

t=time()

# READ IN REVIEWS
print('Loading Dataset...')
E_tokened = pd.read_json('data/Electronics_tokenized.json')
print('Dataset Loaded: ', round(time()-t,2),'s')

processed_corpus = E_tokened.tokened

print('Training Word Embeddings...', round(time()-t,2),'s')

w2v = Word2Vec(processed_corpus, # input tokenized text
                 min_count=5,   # Ignore words that appear less than this
                 size=300,      # Dimensionality of word embeddings
                 workers=99,     # Number of processors (parallelisation)
                 window=6,     # Context window for words during training
                 max_vocab_size= 40*1000, #
                 compute_loss=True,
                 iter=10)       

print('Trained', round(time()-t,2),'s')

print('# reviews: ', w2v.corpus_count)
print('Total words: ', w2v.corpus_total_words)
print('Latest Training Loss: ', w2v.get_latest_training_loss())
print('Vocab size: ', len(w2v.wv.vocab))

w2v.save('models/E_w2v_gensim10.model')
print('Saved', round(time()-t,2),'s')


print('For fun, and for to test the word embeddings....')
print('Words most similar to *windows*.')
print(meaning_test(['windows'], None,  w2v))
print()
print('Words most similar to *Mac*.')
print(meaning_test(['mac'], None,  w2v))
print()
print('Words most similar to *cat*.')
print(meaning_test('cat', None, w2v))
print()
print('Words most similar to *father*.')
print(meaning_test('father', None, w2v))
print()
print('father + mother - boy')
print(meaning_test(['father','mother'],['boy'],w2v))
print()
print("Thanks :D. You're ready to run the main program!")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loading Dataset...
Dataset Loaded:  62.4 s
Training Word Embeddings... 62.41 s
Trained 63.08 s
# reviews:  2836144
Total words:  81666138
Latest Training Loss:  12094783.0
Vocab size:  18053
Saved 63.08 s
For fun, and for to test the word embeddings....
Words most similar to *windows*.
    words          similarity
0   winxp  0.6862816214561462
1   vista  0.6787523031234741
2  ubuntu  0.6680481433868408
3      os  0.6254222989082336
4   linux  0.6235475540161133

Words most similar to *Mac*.
          words          similarity
0          imac   0.687383770942688
1          macs  0.6585511565208435
2     mavericks  0.6501476764678955
3  snow_leopard  0.6488965153694153
4           osx   0.646578311920166

Words most similar to *cat*.
    words          similarity
0     utp  0.6351267695426941
1    gnaw  0.5743775963783264
2  kitten  0.5678672790527344
3   kitty  0.5519692301750183
4  baluns  0.531767

# <br>
# EDA

In [7]:
# # LOAD A PRETRAINED MODEL TO SAVE TIME
# from time import time
# import numpy as np
# from gensim.models import Word2Vec
# w2v = Word2Vec.load('models/E_w2v_gensim10_2.model')
# E_tokened = pd.read_json('data/Electronics_tokenized.json')
# processed_corpus = E_tokened.tokened

## All ngrams before Word2Vec

In [17]:
from utils import kw2counts

ngrams = []
for review in processed_corpus:
    for tok in review:
        if '_' in tok:
            ngrams.append(tok)
            
len(np.array(list(set(ngrams)))), np.array(list(set(ngrams)))

(1616, array(['car_dvr', 'cm_cr_dp_cmt', 'nashville_tn', 'a_e', 'as_li_tl',
        'a_s', 'seeyqkz_a', 'um_rcr', 'infinity_primus', 'number_',
        'san_jose', 'southern_california', 'highres_', 'w___',
        'paul_frank', 'schiit_bifrost', 'fix_the_knob', 'dknight_magicbox',
        'muy_bien', 'speexrate_best', 'speaker_', 'sams_club', 'tek_pal',
        'cm_sw_su_dp', 'aag_m_pw_dp', 'cs_id', 'model_search', 'ni_cad',
        'mdrzx_blk', 'isport_victory', 'pin_', 'baofeng_uv', 'dot_clean',
        'j_hilbert', 'ya_aw_oh_pii', 'trails_', 'h_d_t_v', 'the_road',
        'boco_fw_wcb', 'ignore_db', 'bruno_mar', 'mdr_v', 'yyy_seq',
        'solitude_xcs', 'snow_leopard', 'play_record', 'four_hundred',
        'zadig_xp_v', 'superlux_hd', 'def_tech', 'c_name', 'a_id',
        'better_bought', 'pd_sim_sbs_pc_', 'needlenose_pliers',
        'bafghwdh_', 'jay_bird', 're_order', 'lepai_tripath', 'powerex_mh',
        'scotland_ace', 'x_', 'natalie_wilson_', 'kdl_hx', 'meaning_don',
    

In [18]:
counts = kw2counts(processed_corpus)
ngram_counts = np.array([tup for tup in counts if '_' in tup[0]])
ngram_counts.shape, ngram_counts

((1616, 2), array([['skull_candy', '4827'],
        ['altec_lansing', '4221'],
        ['http_www', '4219'],
        ['tripp_lite', '3038'],
        ['swag_swag', '2987'],
        ['harman_kardon', '2767'],
        ['wal_mart', '2639'],
        ['harmon_kardon', '1605'],
        ['co_workers', '1329'],
        ['li_ion', '1297'],
        ['lithium_ion', '1288'],
        ['raspberry_pi', '1278'],
        ['fiber_optic', '1219'],
        ['vice_versa', '1198'],
        ['ni_mh', '1184'],
        ['martin_logan', '1160'],
        ['sol_republic', '1123'],
        ['videoid_mo', '1093'],
        ['daisy_chain', '1081'],
        ['allen_wrench', '1047'],
        ['cambridge_soundworks', '1043'],
        ['cerwin_vega', '998'],
        ['stainless_steel', '912'],
        ['co_worker', '887'],
        ['blah_blah', '723'],
        ['curl_iron', '685'],
        ['hello_kitty', '685'],
        ['sr_', '682'],
        ['south_africa', '653'],
        ['utf_qid', '637'],
        ['antennaweb_org'

## All ngrams after Word2Vec - from ~1600 to 440

In [14]:
t = time()
print(time()-t,'s')
vocabW = np.array(sorted(list(w2v.wv.vocab.keys())))
print(time()-t,'s')
Wbigrams = np.array(list(set([str(gram) for gram in vocabW if str(gram).count('_') > 0])))
Wtrigrams = np.array(list(set([str(gram) for gram in vocabW if str(gram).count('_') > 1])))
print(time()-t,'s')
len(Wbigrams), len(Wtrigrams)

4.9114227294921875e-05 s
0.01564192771911621 s
0.04405784606933594 s


(440, 5)

In [15]:
Wbigrams

array(['oh_details_o', 'tivo_roamio', 'taiyo_yuden', 'soundmatters_foxl',
       'nvidia_gtx', 'tx_nr', 'honda_civic', 'as_li_tl', 'co_ax',
       'planar_magnetic', 'sewell_deadbolt', 'slimx_imp',
       'infinity_primus', 'nomad_iic', 'tornado_alley',
       'zombie_apocalypse', 'ultrasone_hfi', 'vsonic_gr', 'str_dh',
       'san_jose', 'restock_fee', 'southern_california',
       'thermal_compound', 'particle_board', 'hifimediy_sabre',
       'schiit_bifrost', 'dknight_magicbox', 'arduino_uno', 'muy_bien',
       'gordon_brothers', 'sams_club', 'tek_pal', 'jet_engines',
       'golf_cart', 'inkjet_printable', 'honda_odyssey', 'el_precio',
       'ali_julia', 'ni_cad', 'winegard_flatwave', 'singer_songwriter',
       'slam_dunk', 'injection_mold', 'isport_victory', 'ta_da',
       'harmon_kardon', 'fred_meyer', 'baofeng_uv', 'itty_bitty',
       'jurassic_park', 'otterbox_defender', 'cyber_acoustics',
       'lenovo_yoga', 'time_set', 'schiit_modi', 'fiber_optic',
       'salvation_a