### Load Gensim Library

In [1]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Collecting FuzzyTM>=0.4.0
  Downloading FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting pyfume
  Downloading pyFUME-0.2.25-py3-none-any.whl (67 kB)
     ---------------------------------------- 67.1/67.1 kB 3.8 MB/s eta 0:00:00
Collecting fst-pso
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting simpful
  Downloading simpful-2.11.0-py3-none-any.whl (32 kB)
Collecting miniful
  Downloading miniful-0.0.6.tar.gz (2.8 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fst-pso, miniful
  Building wheel for fst-pso (setup.py): started
  Building wheel for fst-pso (setup.py): finished with status 'done'
  Created wheel for fst-pso: filename=fst_pso-1.8.1-py3-none-any.whl size=20448 sha256=1101ed45816ecb4f7758c871b14e

In [2]:
import gensim
import warnings
warnings.filterwarnings('ignore')

In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

### Load Text Data

Data can be downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [4]:
# #This is needed only if you have uploaded data to Google drive
# from google.colab import drive
# drive.mount('/gdrive')

In [6]:
import pandas as pd

#change file path to point to where you have stored the zip file.
#df = pd.read_csv('/gdrive/My Drive/Statistical NLP AIML/unlabeledTrainData.tsv.zip', header=0, delimiter="\t", quoting=3)
df = pd.read_csv('unlabeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
print('Number of examples in Dataset: ', df.shape)
df.head()

Number of examples in Dataset:  (50000, 2)


Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


### Function to Clean up data

In [7]:
import re, string

def clean_str(string):
  """
  String cleaning before vectorization
  """
  try:
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)
    words = string.strip().lower().split()
    words = [w for w in words if len(w)>=1]
    return " ".join(words)
  except:
    return ""

### Clean the Data using routine above

In [8]:
df['clean_review'] = df['review'].apply(clean_str)
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


In [9]:
df['clean_review'][0].split(' ')

['watching',
 'time',
 'chasers',
 'it',
 'obvious',
 'that',
 'it',
 'was',
 'made',
 'by',
 'a',
 'bunch',
 'of',
 'friends',
 'maybe',
 'they',
 'were',
 'sitting',
 'around',
 'one',
 'day',
 'in',
 'film',
 'school',
 'and',
 'said',
 'hey',
 'let',
 's',
 'pool',
 'our',
 'money',
 'together',
 'and',
 'make',
 'a',
 'really',
 'bad',
 'movie',
 'or',
 'something',
 'like',
 'that',
 'what',
 'ever',
 'they',
 'said',
 'they',
 'still',
 'ended',
 'up',
 'making',
 'a',
 'really',
 'bad',
 'movie',
 'dull',
 'story',
 'bad',
 'script',
 'lame',
 'acting',
 'poor',
 'cinematography',
 'bottom',
 'of',
 'the',
 'barrel',
 'stock',
 'music',
 'etc',
 'all',
 'corners',
 'were',
 'cut',
 'except',
 'the',
 'one',
 'that',
 'would',
 'have',
 'prevented',
 'this',
 'film',
 's',
 'release',
 'life',
 's',
 'like',
 'that']

### Convert Review to a Word List

In [10]:
#List to hold all words in each review
documents = []

#Iterate over each review
for doc in df['clean_review']:
    documents.append(doc.split(' '))

print(len(documents))
print(documents[0])

50000
['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', 'or', 'something', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corners', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', 's', 'release', 'life', 's', 'like', 'that']


In [11]:
print(len(documents[0]))

90


In [12]:
len(documents[1])

82

### Build the Model

In [13]:
#Build the model
model = gensim.models.Word2Vec(documents, #Word list
                               min_count=5, #Ignore all words with total frequency lower than this
                               workers=6, #Number of CPU Cores
                               vector_size=300,  #Embedding size
                               window=5, #Maximum Distance between current and predicted word
                               epochs=10   #Number of iterations over the text corpus
                              )

2023-10-12 13:29:09,331 : INFO : collecting all words and their counts
2023-10-12 13:29:09,331 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-10-12 13:29:09,816 : INFO : PROGRESS: at sentence #10000, processed 2399440 words, keeping 51654 word types
2023-10-12 13:29:10,969 : INFO : PROGRESS: at sentence #20000, processed 4835846 words, keeping 69077 word types
2023-10-12 13:29:12,276 : INFO : PROGRESS: at sentence #30000, processed 7267977 words, keeping 81515 word types
2023-10-12 13:29:13,853 : INFO : PROGRESS: at sentence #40000, processed 9669772 words, keeping 91685 word types
2023-10-12 13:29:14,797 : INFO : collected 100479 word types from a corpus of 12084660 raw words and 50000 sentences
2023-10-12 13:29:14,797 : INFO : Creating a fresh vocabulary
2023-10-12 13:29:15,069 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 39730 unique words (39.54% of original 100479, drops 60749)', 'datetime': '2023-10-12T13:29:15.069434',

2023-10-12 13:29:59,061 : INFO : EPOCH 6 - PROGRESS: at 12.32% examples, 1071294 words/s, in_qsize 11, out_qsize 0
2023-10-12 13:30:00,067 : INFO : EPOCH 6 - PROGRESS: at 24.92% examples, 1091838 words/s, in_qsize 11, out_qsize 0
2023-10-12 13:30:01,068 : INFO : EPOCH 6 - PROGRESS: at 37.34% examples, 1098414 words/s, in_qsize 11, out_qsize 0
2023-10-12 13:30:02,074 : INFO : EPOCH 6 - PROGRESS: at 49.33% examples, 1091719 words/s, in_qsize 11, out_qsize 0
2023-10-12 13:30:03,086 : INFO : EPOCH 6 - PROGRESS: at 62.20% examples, 1101288 words/s, in_qsize 12, out_qsize 0
2023-10-12 13:30:04,085 : INFO : EPOCH 6 - PROGRESS: at 75.16% examples, 1109287 words/s, in_qsize 11, out_qsize 0
2023-10-12 13:30:05,099 : INFO : EPOCH 6 - PROGRESS: at 89.27% examples, 1125552 words/s, in_qsize 11, out_qsize 0
2023-10-12 13:30:05,871 : INFO : EPOCH 6: training on 12084660 raw words (8901439 effective words) took 7.8s, 1137486 effective words/s
2023-10-12 13:30:06,876 : INFO : EPOCH 7 - PROGRESS: at 13.

# Exploring the model

### How many words in the model

### Get an embedding for a word

In [14]:
model.wv['great']

array([ 0.9340439 ,  1.9505477 , -0.00736295,  0.6101585 , -0.25910702,
       -0.3886241 , -0.3265341 ,  0.12851553,  0.57017404,  0.11075079,
       -0.30122545, -0.43801922, -0.04450225, -0.69500464, -0.9065121 ,
       -0.28703508, -0.9132009 , -0.32726163,  0.29176927,  3.2657232 ,
       -0.58498156,  1.185632  ,  0.33988285,  0.22536707,  0.6383159 ,
       -1.2813121 ,  0.8919571 , -0.6227101 , -0.57263803,  0.38123977,
       -0.47405538,  0.93096864,  1.147282  , -0.11467692,  0.30518672,
       -1.3225602 ,  0.91773826,  0.8565585 , -0.18484499, -0.82134837,
       -0.62599397,  0.47096428,  1.2624942 ,  1.8096796 , -0.573747  ,
       -0.37386048,  2.104689  ,  1.0607342 , -0.26057446, -1.3498862 ,
        0.20708452,  1.7004768 , -0.16937563, -1.1061263 , -0.1249475 ,
       -0.3827398 ,  1.3890784 ,  0.27682608,  0.38839015,  1.1529162 ,
       -0.40237474, -0.20653063,  2.4107227 , -0.8926548 ,  1.1806209 ,
       -1.5164781 , -0.7382644 , -0.3429672 , -0.2161049 , -1.64

In [15]:
model.wv['amazing']

array([-0.34445724,  0.9562085 , -0.44072908, -0.64268327, -0.15911838,
       -0.18859108,  0.2466808 ,  0.6474967 , -0.15904725,  0.7457768 ,
        1.0872959 ,  1.1635351 , -0.62339896, -1.1653187 , -0.38547698,
       -1.1845851 , -0.38001308,  0.84345424,  0.6352596 ,  1.991727  ,
       -1.5551342 ,  3.1559398 , -1.603768  ,  0.7919243 ,  0.5348292 ,
       -1.5007799 ,  0.7222638 , -0.32318202,  1.3323349 , -1.481886  ,
        0.72972775,  1.0583136 ,  1.0538667 , -0.59651315,  0.39259914,
        0.56652516, -0.13532285,  0.49241596, -0.20394138,  0.10522198,
        0.1897618 ,  1.1883155 , -0.2698425 ,  0.37930578, -0.05560923,
       -0.05542226, -0.19074444,  1.2780597 , -1.2374945 , -0.66063184,
       -0.9318162 , -0.16811845,  0.9811387 , -0.3844254 , -0.7232468 ,
       -1.0048751 ,  1.27439   ,  0.2639003 ,  1.288603  ,  0.09151676,
       -0.02096545, -0.18231228,  1.7550949 ,  0.93110186,  0.07972166,
       -0.45739117, -0.5135323 ,  0.30266497,  0.43631485, -0.75

### Finding Words which have similar meaning

In [16]:
model.wv.most_similar('amazing')

[('incredible', 0.7554730176925659),
 ('awesome', 0.7383137345314026),
 ('outstanding', 0.6877096891403198),
 ('astounding', 0.6864120960235596),
 ('fantastic', 0.6601110696792603),
 ('exceptional', 0.6563505530357361),
 ('astonishing', 0.6429214477539062),
 ('wonderful', 0.6254343390464783),
 ('excellent', 0.6199328899383545),
 ('brilliant', 0.5963668823242188)]

In [17]:
model.wv.most_similar('delhi')

[('nursemaid', 0.5565349459648132),
 ('recruits', 0.5438565015792847),
 ('donegal', 0.5406115651130676),
 ('copenhagen', 0.5396826267242432),
 ('houghland', 0.5393993258476257),
 ('hampshire', 0.5371832847595215),
 ('bluesmobile', 0.5207542777061462),
 ('cornwall', 0.5193893909454346),
 ('dodgers', 0.5106973648071289),
 ('upstate', 0.5092759132385254)]

### Find the word which is not like others

### Saving the model

In [18]:
model.save('word2vec-movie-50')

2023-10-12 13:30:27,500 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'word2vec-movie-50', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-10-12T13:30:27.500396', 'gensim': '4.3.0', 'python': '3.10.9 | packaged by Anaconda, Inc. | (main, Mar  1 2023, 18:18:15) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'saving'}
2023-10-12 13:30:27,532 : INFO : storing np array 'vectors' to word2vec-movie-50.wv.vectors.npy
2023-10-12 13:30:27,614 : INFO : storing np array 'syn1neg' to word2vec-movie-50.syn1neg.npy
2023-10-12 13:30:27,693 : INFO : not storing attribute cum_table
2023-10-12 13:30:27,756 : INFO : saved word2vec-movie-50


In [19]:
#Load model from memory
model = gensim.models.Word2Vec.load('word2vec-movie-50')

2023-10-12 13:30:27,772 : INFO : loading Word2Vec object from word2vec-movie-50
2023-10-12 13:30:27,803 : INFO : loading wv recursively from word2vec-movie-50.wv.* with mmap=None
2023-10-12 13:30:27,803 : INFO : loading vectors from word2vec-movie-50.wv.vectors.npy with mmap=None
2023-10-12 13:30:27,850 : INFO : loading syn1neg from word2vec-movie-50.syn1neg.npy with mmap=None
2023-10-12 13:30:27,867 : INFO : setting ignored attribute cum_table to None
2023-10-12 13:30:28,306 : INFO : Word2Vec lifecycle event {'fname': 'word2vec-movie-50', 'datetime': '2023-10-12T13:30:28.306691', 'gensim': '4.3.0', 'python': '3.10.9 | packaged by Anaconda, Inc. | (main, Mar  1 2023, 18:18:15) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'loaded'}


1. Equation king + man = queen + ?
2. In this case there may not be enough data for this equation

In [20]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=10)

[('queen', 0.39654266834259033),
 ('princess', 0.3893895745277405),
 ('commoner', 0.3810257911682129),
 ('countess', 0.339566707611084),
 ('prince', 0.3393735885620117),
 ('marian', 0.3381807208061218),
 ('bathory', 0.33671075105667114),
 ('maria', 0.33297955989837646),
 ('margaret', 0.3319658041000366),
 ('kingdom', 0.33136773109436035)]

In [21]:
model.wv.most_similar(positive=['woman', 'hero'], negative=['man'], topn=5)

[('heroine', 0.6117101907730103),
 ('protagonist', 0.455975204706192),
 ('prostitute', 0.38125619292259216),
 ('girl', 0.3805661201477051),
 ('blonde', 0.37097153067588806)]

In [22]:
model.wv.most_similar(positive=['woman','father'], negative=['man'])

[('mother', 0.6745065450668335),
 ('daughter', 0.6645011901855469),
 ('sister', 0.6038464307785034),
 ('aunt', 0.5925003886222839),
 ('wife', 0.573835015296936),
 ('grandmother', 0.5609205365180969),
 ('parents', 0.5491791367530823),
 ('mom', 0.54607093334198),
 ('spouse', 0.5437285304069519),
 ('niece', 0.5226768851280212)]

In [23]:
model.wv['king'] + model.wv['man'] - model.wv['queen']

array([-1.1686342 ,  2.1411211 , -0.28883746,  1.4134662 ,  1.75978   ,
       -0.67134666, -0.5164577 , -0.6464579 ,  4.855745  , -1.0908258 ,
       -2.2079506 ,  0.03765923, -0.38606465,  1.0855123 , -0.97990924,
        1.2339976 ,  0.7728735 , -2.1012058 ,  0.3082955 ,  2.2649484 ,
        0.9564961 ,  1.2661722 , -2.858633  ,  2.789435  , -0.0116322 ,
       -0.11419445, -2.396998  ,  0.9578802 , -1.1859361 , -2.7283516 ,
       -0.44740504, -2.2148554 ,  0.12460163, -0.00696051, -0.47772583,
       -2.4385753 , -1.2205238 ,  0.7728479 ,  0.9760928 ,  0.26081502,
       -1.393869  ,  1.183715  , -1.1287129 ,  1.6850381 ,  3.0410743 ,
        1.4824986 ,  0.38870978,  0.903841  , -0.02771685, -0.63997734,
        0.38547158,  0.5966231 , -2.2306185 , -0.4893683 , -2.5367908 ,
       -1.7304775 ,  1.5513626 , -1.9188254 ,  2.3878534 ,  2.1518598 ,
        0.7305798 , -0.6909317 , -1.4011097 ,  0.47485578,  0.4206596 ,
       -3.1418552 , -1.1196798 , -0.29143214, -1.222918  ,  0.68