# Gensim Word2Vec Tutorial
#### From https://kavita-ganesan.com/gensim-word2vec-tutorial-starter-code/

In [1]:
# intial imports 

import gzip
import gensim
import logging

In [3]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Data set from OpinRank
#### From kavita's PhD

In [6]:
# Taking a look at the file

input_file = './reviews_data.txt.gz'

with gzip.open (input_file, 'rb') as f:
        for i,line in enumerate (f):
            print(line)
            break

b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

#### Read in the file

In [7]:
def read_input(input_file):
    """This method reads the input file which is in gzip format"""

    logging.info("reading file {0}...this may take a while".format(input_file))
    with gzip.open(input_file, 'rb') as f:
        for i, line in enumerate(f):

            if (i % 10000 == 0):
                logging.info("read {0} reviews".format(i))
            # do some pre-processing and return list of words for each review
            # text. Preprocessing includes tokenization, lowercase
            yield gensim.utils.simple_preprocess(line)

In [8]:
# read in the reviews
documents = list(read_input(input_file))
logging.info("Complete")

2020-09-14 20:12:05,077 : INFO : reading file ./reviews_data.txt.gz...this may take a while
2020-09-14 20:12:05,078 : INFO : read 0 reviews
2020-09-14 20:12:06,726 : INFO : read 10000 reviews
2020-09-14 20:12:08,358 : INFO : read 20000 reviews
2020-09-14 20:12:10,248 : INFO : read 30000 reviews
2020-09-14 20:12:12,075 : INFO : read 40000 reviews
2020-09-14 20:12:14,150 : INFO : read 50000 reviews
2020-09-14 20:12:16,204 : INFO : read 60000 reviews
2020-09-14 20:12:18,013 : INFO : read 70000 reviews
2020-09-14 20:12:19,478 : INFO : read 80000 reviews
2020-09-14 20:12:21,028 : INFO : read 90000 reviews
2020-09-14 20:12:22,555 : INFO : read 100000 reviews
2020-09-14 20:12:24,031 : INFO : read 110000 reviews
2020-09-14 20:12:25,523 : INFO : read 120000 reviews
2020-09-14 20:12:27,062 : INFO : read 130000 reviews
2020-09-14 20:12:28,720 : INFO : read 140000 reviews
2020-09-14 20:12:30,256 : INFO : read 150000 reviews
2020-09-14 20:12:32,213 : INFO : read 160000 reviews
2020-09-14 20:12:33,7

#### Train Word2Vec model

In [10]:
# Instantiate the model
model = gensim.models.Word2Vec(documents,
                               size = 150, 
                               window = 10, 
                               min_count = 2, 
                               workers = 4)
# documents is the list of tokenized reviews
# size sets the dimensionaility of the word vectors
# window sets distance between current word and furthest related word
# min_count ignores words with frequency less than this -> kinda like a stopword removal
# threading

model.train(documents, total_examples = len(documents), epochs = 10)

2020-09-14 20:19:30,417 : INFO : collecting all words and their counts
2020-09-14 20:19:30,418 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-09-14 20:19:30,693 : INFO : PROGRESS: at sentence #10000, processed 1655714 words, keeping 25777 word types
2020-09-14 20:19:30,978 : INFO : PROGRESS: at sentence #20000, processed 3317863 words, keeping 35016 word types
2020-09-14 20:19:31,306 : INFO : PROGRESS: at sentence #30000, processed 5264072 words, keeping 47518 word types
2020-09-14 20:19:31,615 : INFO : PROGRESS: at sentence #40000, processed 7081746 words, keeping 56675 word types
2020-09-14 20:19:31,959 : INFO : PROGRESS: at sentence #50000, processed 9089491 words, keeping 63744 word types
2020-09-14 20:19:32,283 : INFO : PROGRESS: at sentence #60000, processed 11013726 words, keeping 76786 word types
2020-09-14 20:19:32,567 : INFO : PROGRESS: at sentence #70000, processed 12637528 words, keeping 83199 word types
2020-09-14 20:19:32,826 : INFO : PROG

2020-09-14 20:20:23,390 : INFO : EPOCH 2 - PROGRESS: at 29.53% examples, 1224192 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:20:24,391 : INFO : EPOCH 2 - PROGRESS: at 33.82% examples, 1223580 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:20:25,395 : INFO : EPOCH 2 - PROGRESS: at 38.16% examples, 1224019 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:20:26,395 : INFO : EPOCH 2 - PROGRESS: at 42.57% examples, 1224082 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:20:27,400 : INFO : EPOCH 2 - PROGRESS: at 46.94% examples, 1223191 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:20:28,401 : INFO : EPOCH 2 - PROGRESS: at 51.19% examples, 1223207 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:20:29,412 : INFO : EPOCH 2 - PROGRESS: at 55.34% examples, 1222929 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:20:30,412 : INFO : EPOCH 2 - PROGRESS: at 59.56% examples, 1222742 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:20:31,413 : INFO : EPOCH 2 - PROGRESS: at 63.96% examples, 1223609 words/s

2020-09-14 20:21:29,183 : INFO : EPOCH 4 - PROGRESS: at 96.58% examples, 1219840 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:21:29,967 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-09-14 20:21:29,975 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-09-14 20:21:29,976 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-09-14 20:21:29,980 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-09-14 20:21:29,981 : INFO : EPOCH - 4 : training on 41519358 raw words (30347059 effective words) took 24.9s, 1219979 effective words/s
2020-09-14 20:21:30,990 : INFO : EPOCH 5 - PROGRESS: at 3.94% examples, 1211382 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:21:31,997 : INFO : EPOCH 5 - PROGRESS: at 7.93% examples, 1220629 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:21:32,999 : INFO : EPOCH 5 - PROGRESS: at 11.39% examples, 1222358 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:21:34,001 : INFO : EPOCH 5 

2020-09-14 20:22:27,660 : INFO : EPOCH 2 - PROGRESS: at 29.50% examples, 1222514 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:22:28,661 : INFO : EPOCH 2 - PROGRESS: at 33.73% examples, 1219738 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:22:29,663 : INFO : EPOCH 2 - PROGRESS: at 37.91% examples, 1217283 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:22:30,667 : INFO : EPOCH 2 - PROGRESS: at 42.37% examples, 1217506 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:22:31,668 : INFO : EPOCH 2 - PROGRESS: at 46.79% examples, 1218578 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:22:32,675 : INFO : EPOCH 2 - PROGRESS: at 51.06% examples, 1218983 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:22:33,686 : INFO : EPOCH 2 - PROGRESS: at 55.14% examples, 1218454 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:22:34,689 : INFO : EPOCH 2 - PROGRESS: at 59.47% examples, 1219797 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:22:35,691 : INFO : EPOCH 2 - PROGRESS: at 63.75% examples, 1219412 words/s

2020-09-14 20:23:33,176 : INFO : EPOCH 4 - PROGRESS: at 89.21% examples, 1178064 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:23:34,178 : INFO : EPOCH 4 - PROGRESS: at 93.22% examples, 1177840 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:23:35,179 : INFO : EPOCH 4 - PROGRESS: at 97.17% examples, 1176327 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:23:35,852 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-09-14 20:23:35,859 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-09-14 20:23:35,860 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-09-14 20:23:35,862 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-09-14 20:23:35,862 : INFO : EPOCH - 4 : training on 41519358 raw words (30351140 effective words) took 25.8s, 1176596 effective words/s
2020-09-14 20:23:36,872 : INFO : EPOCH 5 - PROGRESS: at 3.82% examples, 1174530 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:23:37,872 : INFO : EPOCH 5

2020-09-14 20:24:33,536 : INFO : EPOCH 7 - PROGRESS: at 23.54% examples, 1148071 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:24:34,539 : INFO : EPOCH 7 - PROGRESS: at 27.54% examples, 1153748 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:24:35,540 : INFO : EPOCH 7 - PROGRESS: at 31.91% examples, 1157678 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:24:36,545 : INFO : EPOCH 7 - PROGRESS: at 35.91% examples, 1159731 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:24:37,552 : INFO : EPOCH 7 - PROGRESS: at 40.25% examples, 1164168 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:24:38,554 : INFO : EPOCH 7 - PROGRESS: at 44.73% examples, 1168518 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:24:39,556 : INFO : EPOCH 7 - PROGRESS: at 48.87% examples, 1169892 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:24:40,558 : INFO : EPOCH 7 - PROGRESS: at 52.89% examples, 1171891 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:24:41,562 : INFO : EPOCH 7 - PROGRESS: at 57.03% examples, 1172963 words/s

2020-09-14 20:25:38,337 : INFO : EPOCH 9 - PROGRESS: at 82.53% examples, 1198859 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:25:39,339 : INFO : EPOCH 9 - PROGRESS: at 86.50% examples, 1199123 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:25:40,343 : INFO : EPOCH 9 - PROGRESS: at 90.86% examples, 1199896 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:25:41,346 : INFO : EPOCH 9 - PROGRESS: at 94.91% examples, 1199395 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:25:42,352 : INFO : EPOCH 9 - PROGRESS: at 99.13% examples, 1199842 words/s, in_qsize 7, out_qsize 0
2020-09-14 20:25:42,537 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-09-14 20:25:42,545 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-09-14 20:25:42,549 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-09-14 20:25:42,553 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-09-14 20:25:42,554 : INFO : EPOCH - 9 : training on 41519358 

(303488934, 415193580)

In [11]:
model.save('checkpoint_model')

2020-09-14 20:27:24,155 : INFO : saving Word2Vec object under checkpoint_model, separately None
2020-09-14 20:27:24,156 : INFO : storing np array 'vectors' to checkpoint_model.wv.vectors.npy
2020-09-14 20:27:24,483 : INFO : not storing attribute vectors_norm
2020-09-14 20:27:24,484 : INFO : storing np array 'syn1neg' to checkpoint_model.trainables.syn1neg.npy
2020-09-14 20:27:24,769 : INFO : not storing attribute cum_table
2020-09-14 20:27:24,863 : INFO : saved checkpoint_model


In [12]:
model = gensim.models.Word2Vec.load('./checkpoint_model')

2020-09-14 20:28:27,160 : INFO : loading Word2Vec object from ./checkpoint_model
2020-09-14 20:28:27,924 : INFO : loading wv recursively from ./checkpoint_model.wv.* with mmap=None
2020-09-14 20:28:27,925 : INFO : loading vectors from ./checkpoint_model.wv.vectors.npy with mmap=None
2020-09-14 20:28:27,943 : INFO : setting ignored attribute vectors_norm to None
2020-09-14 20:28:27,944 : INFO : loading vocabulary recursively from ./checkpoint_model.vocabulary.* with mmap=None
2020-09-14 20:28:27,945 : INFO : loading trainables recursively from ./checkpoint_model.trainables.* with mmap=None
2020-09-14 20:28:27,945 : INFO : loading syn1neg from ./checkpoint_model.trainables.syn1neg.npy with mmap=None
2020-09-14 20:28:27,963 : INFO : setting ignored attribute cum_table to None
2020-09-14 20:28:27,963 : INFO : loaded ./checkpoint_model


#### Try out the model

In [13]:
word1 = 'dirty'
model.wv.most_similar(positive = word1)

2020-09-14 20:29:24,891 : INFO : precomputing L2-norms of word weight vectors


[('filthy', 0.860991358757019),
 ('unclean', 0.7903048992156982),
 ('stained', 0.7750240564346313),
 ('dusty', 0.7661241292953491),
 ('smelly', 0.7600818872451782),
 ('grubby', 0.7487136721611023),
 ('grimy', 0.7442931532859802),
 ('disgusting', 0.7237786054611206),
 ('dingy', 0.7185878753662109),
 ('mouldy', 0.7172408699989319)]

In [15]:
word2 = 'france'
model.wv.most_similar(word2)

[('canada', 0.6960472464561462),
 ('germany', 0.6532061100006104),
 ('spain', 0.6287896633148193),
 ('england', 0.6194642782211304),
 ('hawaii', 0.6143192052841187),
 ('mexico', 0.6083465814590454),
 ('rome', 0.5898810625076294),
 ('greece', 0.5866472721099854),
 ('russia', 0.5860298871994019),
 ('gaulle', 0.5834230184555054)]

In [16]:
pos = ['bed', 'sheet', 'pillow']
neg = ['couch']
model.wv.most_similar(positive=pos, negative = neg)

[('duvet', 0.7099647521972656),
 ('mattress', 0.6891525983810425),
 ('pillowcase', 0.6761701703071594),
 ('blanket', 0.6721630692481995),
 ('quilt', 0.6575330495834351),
 ('foam', 0.655534029006958),
 ('matress', 0.6552498936653137),
 ('sheets', 0.6460946798324585),
 ('pillows', 0.6417503356933594),
 ('quilts', 0.6067280769348145)]

In [18]:
model.wv.similarity(w1 = 'dirty', w2 = 'smelly')

0.7600818

In [19]:
model.wv.similarity(w1 = 'dirty', w2 = 'clean')

0.26938537

In [22]:
model.wv.doesnt_match(['cat','dog','france'])

'france'

In [23]:
model.wv.doesnt_match(['bed','pillow','duvet','shower'])

'shower'