### Load Gensim Library

In [None]:
!pip install gensim --quiet

In [None]:
import gensim

In [None]:
import logging
# Configure the root logger to print INFO-level (and above) records in the form
# "<timestamp> : <level> : <message>".
# NOTE(review): presumably enabled so gensim's training progress is visible below — confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

### Load Text Data

Data can be downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [None]:
# Google Colab only: mount Google Drive at /content/drive so that files stored
# there (the dataset path below lives under this mount point) can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the zipped, tab-separated review file on the mounted Drive.
# Edit this to wherever you stored the download.
data_path = '/content/drive/MyDrive/0.Latest_DS_Course/NLP/NoteBooks/data/unlabeledTrainData.tsv.zip'


In [None]:
import csv

import pandas as pd

# Change data_path (set in the previous cell) to point to where you have stored the zip file.
# The file is tab-separated with a header row. csv.QUOTE_NONE (the named constant
# for the value 3) tells the parser to treat double quotes as ordinary characters,
# which is why the loaded `id` and `review` values still carry their surrounding quotes.
df = pd.read_csv(data_path, header=0, delimiter="\t", quoting=csv.QUOTE_NONE)

In [None]:

print('Number of examples in Dataset: ', df.shape)
df.head()

Number of examples in Dataset:  (50000, 2)


Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [None]:
df.shape

(50000, 2)

In [None]:
df.loc[0, 'review']

'"Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \\"Hey, let\'s pool our money together and make a really bad movie!\\" Or something like that. What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. All corners were cut, except the one that would have prevented this film\'s release. Life\'s like that."'

### Function to Clean up data

In [None]:
import re, string

def clean_str(string):
  """
  Normalise raw review text into lowercase, space-separated alphabetic tokens.

  Steps:
    1. Remove any line matched by the URL pattern below.
    2. Replace every character that is not an ASCII letter with a space, so
       digits, punctuation and markup characters all become separators
       (e.g. "<br />" leaves the token "br" behind).
    3. Lowercase the text and collapse whitespace runs into single spaces.

  Args:
    string: The raw text. The parameter name is kept for backward
      compatibility even though it shadows the ``string`` module inside
      this function.

  Returns:
    The cleaned text. Any input that is not a ``str`` (None, NaN from a
    missing cell, bytes, ...) yields the empty string.
  """
  # Explicit guard instead of a bare ``except:``: non-text input still maps to
  # "", but KeyboardInterrupt / SystemExit are no longer silently swallowed.
  if not isinstance(string, str):
    return ""

  # NOTE(review): this pattern requires the literal characters "<>" right after
  # "http(s)://", so it will almost never match real URLs. Kept unchanged to
  # preserve existing output; confirm the intended pattern.
  string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
  string = re.sub(r"[^A-Za-z]", " ", string)

  # split() with no argument discards leading/trailing whitespace and never
  # yields empty tokens, so no extra strip() or length filter is required.
  return " ".join(string.lower().split())

### Clean the Data using routine above

In [None]:
# Run clean_str over every raw review and store the result in a new
# 'clean_review' column, then preview the first rows.
df['clean_review'] = df['review'].apply(clean_str)
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


In [None]:
df.shape

(50000, 3)

In [None]:
# Write the frame to test.csv in the current working directory.
df.to_csv("test.csv")

### Convert Review to a Word List

In [None]:
# Tokenise every cleaned review: `documents` is a list with one list of words
# per review, which is the corpus format passed to Word2Vec below.
# split() is called without an argument so that an empty cleaned review becomes
# [] instead of [''], which would otherwise add an empty-string "word" to the corpus.
documents = [doc.split() for doc in df['clean_review']]

In [None]:
print(len(documents))

50000


In [None]:
print(documents[1])

['i', 'saw', 'this', 'film', 'about', 'years', 'ago', 'and', 'remember', 'it', 'as', 'being', 'particularly', 'nasty', 'i', 'believe', 'it', 'is', 'based', 'on', 'a', 'true', 'incident', 'a', 'young', 'man', 'breaks', 'into', 'a', 'nurses', 'home', 'and', 'rapes', 'tortures', 'and', 'kills', 'various', 'women', 'br', 'br', 'it', 'is', 'in', 'black', 'and', 'white', 'but', 'saves', 'the', 'colour', 'for', 'one', 'shocking', 'shot', 'br', 'br', 'at', 'the', 'end', 'the', 'film', 'seems', 'to', 'be', 'trying', 'to', 'make', 'some', 'political', 'statement', 'but', 'it', 'just', 'comes', 'across', 'as', 'confused', 'and', 'obscene', 'br', 'br', 'avoid']


In [None]:
documents[:2]

[['watching',
  'time',
  'chasers',
  'it',
  'obvious',
  'that',
  'it',
  'was',
  'made',
  'by',
  'a',
  'bunch',
  'of',
  'friends',
  'maybe',
  'they',
  'were',
  'sitting',
  'around',
  'one',
  'day',
  'in',
  'film',
  'school',
  'and',
  'said',
  'hey',
  'let',
  's',
  'pool',
  'our',
  'money',
  'together',
  'and',
  'make',
  'a',
  'really',
  'bad',
  'movie',
  'or',
  'something',
  'like',
  'that',
  'what',
  'ever',
  'they',
  'said',
  'they',
  'still',
  'ended',
  'up',
  'making',
  'a',
  'really',
  'bad',
  'movie',
  'dull',
  'story',
  'bad',
  'script',
  'lame',
  'acting',
  'poor',
  'cinematography',
  'bottom',
  'of',
  'the',
  'barrel',
  'stock',
  'music',
  'etc',
  'all',
  'corners',
  'were',
  'cut',
  'except',
  'the',
  'one',
  'that',
  'would',
  'have',
  'prevented',
  'this',
  'film',
  's',
  'release',
  'life',
  's',
  'like',
  'that'],
 ['i',
  'saw',
  'this',
  'film',
  'about',
  'years',
  'ago',
  'and

### Build the Model

In [None]:
#Build the model: train Word2Vec embeddings on the tokenised reviews.
# NOTE(review): training with several worker threads may give slightly different
# vectors on each run — confirm whether exact reproducibility is needed.
model = gensim.models.Word2Vec(documents, #Tokenised corpus: one list of words per review
                               min_count=10, #Ignore all words with total frequency lower than this
                               workers=4, #Number of worker threads used for training
                               vector_size=50,  #Embedding size (dimensions per word vector)
                               window=5, #Maximum distance to a context word, on the left and on the right
                               epochs=10   #Number of iterations over the text corpus
                              )

In [None]:
#documents[0]

# Exploring the model

### How many words in the model

In [None]:
#Model size
model.wv.vectors.shape

(28322, 50)

In [None]:
# Get the vocabulary (list of words)
vocabulary = model.wv.index_to_key

In [None]:
vocabulary

['the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'br',
 'it',
 'in',
 'i',
 'this',
 'that',
 's',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 'you',
 't',
 'on',
 'not',
 'he',
 'are',
 'his',
 'have',
 'be',
 'one',
 'all',
 'they',
 'at',
 'by',
 'who',
 'an',
 'from',
 'so',
 'like',
 'there',
 'or',
 'her',
 'just',
 'about',
 'out',
 'has',
 'if',
 'what',
 'some',
 'good',
 'can',
 'when',
 'more',
 'very',
 'she',
 'up',
 'no',
 'time',
 'even',
 'would',
 'their',
 'my',
 'which',
 'story',
 'only',
 'really',
 'see',
 'had',
 'were',
 'well',
 'me',
 'we',
 'than',
 'much',
 'bad',
 'been',
 'get',
 'people',
 'great',
 'into',
 'also',
 'do',
 'other',
 'first',
 'will',
 'him',
 'because',
 'most',
 'how',
 'don',
 'them',
 'make',
 'its',
 'made',
 'way',
 'could',
 'then',
 'too',
 'movies',
 'after',
 'any',
 'characters',
 'character',
 'films',
 'two',
 'think',
 'watch',
 'being',
 'plot',
 'many',
 'where',
 'never',
 'love',
 'seen',
 'little',
 'life',
 

In [None]:
len(vocabulary)

28322

### Get an embedding for a word

In [None]:
model.wv['music']

array([-4.213608  ,  2.9245765 ,  2.6493273 , -2.8945906 , -0.88977456,
        1.0671237 , -0.36196393,  1.368745  , -1.304022  , -1.7089509 ,
        0.365253  ,  2.1561697 , -0.08426645, -0.7854228 ,  0.65190005,
        0.3239958 ,  1.3258431 ,  3.7926674 , -3.2370896 ,  3.4149377 ,
       -1.0009738 ,  3.0579047 ,  0.17481916,  1.7609447 , -1.7085437 ,
       -1.240299  ,  1.5306859 , -2.9738276 , -3.1672673 ,  0.42668617,
        4.020117  , -0.16568936,  1.520159  , -0.11908525,  2.4441464 ,
       -0.06501248,  0.4855096 , -3.4305384 ,  0.16422455,  0.08144433,
       -0.36279386, -0.62882006,  0.1999356 ,  4.5418067 ,  0.19337748,
        2.2821465 , -0.06174718,  1.0918744 ,  3.0037332 ,  2.7700474 ],
      dtype=float32)

### Finding Words which have similar meaning

In [None]:
model.wv.most_similar('great', topn=15)

[('fantastic', 0.8881475329399109),
 ('terrific', 0.8710678815841675),
 ('wonderful', 0.8687665462493896),
 ('fine', 0.8461161255836487),
 ('good', 0.8281927704811096),
 ('brilliant', 0.8116652965545654),
 ('superb', 0.789010763168335),
 ('perfect', 0.7609360218048096),
 ('nice', 0.7528983354568481),
 ('amazing', 0.7489042282104492),
 ('remarkable', 0.7284606099128723),
 ('marvelous', 0.7283537983894348),
 ('spectacular', 0.726314902305603),
 ('fabulous', 0.71151202917099),
 ('decent', 0.706972062587738)]

### Find the word which is not like others

In [None]:
model.wv.doesnt_match("man woman child kitchen".split())

'kitchen'

### Saving the model

In [None]:
model.save('word2vec-movie-50')

In [None]:
!ls -l

total 141108
drwx------ 6 root root      4096 May 13 05:14 drive
drwxr-xr-x 1 root root      4096 May  9 13:41 sample_data
-rw-r--r-- 1 root root 132179650 May 13 05:14 test.csv
-rw-r--r-- 1 root root  12299752 May 13 05:17 word2vec-movie-50


In [None]:
#Load the model back from the 'word2vec-movie-50' file on disk
model = gensim.models.Word2Vec.load('word2vec-movie-50')

1. Equation king + man = queen + ?
2. In this case there may not be enough data for this equation

In [None]:
# Perform word vector arithmetic and find most similar words
model.wv.most_similar(positive=['king', 'man'], negative=['queen'])

[('soldier', 0.581389844417572),
 ('vendetta', 0.5601274371147156),
 ('prophecy', 0.5521265268325806),
 ('scientist', 0.5456058979034424),
 ('warlord', 0.5410327315330505),
 ('convict', 0.536520779132843),
 ('nemesis', 0.5364158749580383),
 ('buio', 0.52924644947052),
 ('enforcer', 0.5249879360198975),
 ('creator', 0.5235925316810608)]

In [None]:
"women" in vocabulary

True

In [None]:
model.wv['woman']

array([ 3.7913132 ,  2.0220149 ,  2.7254148 , -2.280917  , -0.37915996,
        1.0900831 , -2.3531177 , -2.0287616 , -3.39528   ,  2.1805832 ,
        1.4421312 , -2.7971406 ,  1.6335782 , -0.39160797, -3.6215632 ,
        1.9877985 , -1.9293661 , -0.1581749 ,  0.3875743 , -0.6434101 ,
        5.3861284 ,  2.2711701 ,  3.1644058 ,  1.2677376 ,  1.063832  ,
       -2.0012224 , -1.9722824 ,  2.1697876 , -1.9483765 , -1.5401108 ,
        2.5437267 ,  0.03221386,  1.1909721 , -3.0002198 ,  0.24454121,
       -1.7930827 , -0.9781645 , -0.1584473 ,  1.4208074 , -0.84618175,
        1.0327382 , -2.5854375 , -2.939921  , -2.1078942 ,  1.1326108 ,
       -2.2290213 ,  0.05068598, -2.2298274 , -1.4539225 , -0.98942035],
      dtype=float32)

In [None]:
model.wv.most_similar(positive=['woman', 'prince'], negative=['man'])


[('princess', 0.7902963161468506),
 ('queen', 0.7166551947593689),
 ('widow', 0.7011138200759888),
 ('maria', 0.6982204914093018),
 ('mistress', 0.6855611205101013),
 ('belle', 0.6851502656936646),
 ('aunt', 0.6662737131118774),
 ('nurse', 0.663651704788208),
 ('marian', 0.6589152216911316),
 ('daughter', 0.656615138053894)]

The **`most_similar()`** function in Word2Vec works by performing **vector arithmetic** on word embeddings (word vectors) in a continuous vector space. Here's how it works in simple terms:

### 1. **Word Embeddings in Word2Vec**
Word2Vec represents each word in the vocabulary as a high-dimensional vector (embedding). These vectors capture semantic meanings and relationships between words. For example, similar words like "king" and "queen" will have vector representations that are close to each other in the vector space.

### 2. **Vector Arithmetic (Word Analogies)**
Once words are represented as vectors, you can perform arithmetic operations on these vectors, like addition and subtraction. This can reveal relationships between words.

Let’s break down how `most_similar(positive=['king', 'man'], negative=['queen'])` works:

### 3. **What Happens Internally?**

- **`positive=['king', 'man']`**: The vectors for "king" and "man" are added together. This operation roughly means "combine the semantic concepts of 'king' and 'man'."
  
- **`negative=['queen']`**: The vector for "queen" is subtracted. This operation essentially means "remove the semantic meaning of 'queen' from the combination."

The final equation is:

$$
\text{Resulting Vector} = \text{Vec('king')} + \text{Vec('man')} - \text{Vec('queen')}
$$

### 4. **Finding the Nearest Vector (Answer)**
Word2Vec then looks for the word whose vector is closest (in terms of cosine similarity) to the resulting vector from the above arithmetic. Cosine similarity measures the angle between two vectors: the smaller the angle, the more similar the vectors.

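In this case, **Vec('king') + Vec('man') - Vec('queen')** does not form a clean analogy: Vec('king') - Vec('queen') already points roughly in the "male" direction, so adding Vec('man') pushes the result further that way rather than toward **Vec('woman')**. This is why, on this relatively small movie-review corpus, the query above returned loosely related words such as "soldier", "warlord" and "scientist" instead of a single obvious answer. The standard form of the analogy, shown next, behaves much better.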

### 5. **Analogy Example:**
Let’s break down a famous analogy:
```python
model.wv.most_similar(positive=['king', 'woman'], negative=['man'])
```
Here’s the intuition:
- **`Vec('king') - Vec('man') + Vec('woman')`**: This equation takes the vector for "king," subtracts the "man" aspect, and adds "woman."
- What remains is a concept similar to "queen" (the female counterpart of "king"), so the model returns "queen."

### 6. **How the Model Learns These Relationships**
Word2Vec learns word embeddings by training on large corpora of text, where it tries to predict a target word from its surrounding context words (CBOW) or predict the context words from a target word (Skip-Gram). Through this process, the model captures semantic and syntactic relationships between words.

For example:
- **Syntactic Relationships:** _"walking"_ is related to _"walk"_ as _"swimming"_ is related to _"swim"_.
- **Semantic Relationships:** _"Paris"_ is related to _"France"_ as _"Berlin"_ is related to _"Germany"_.

### Why This Works:
- Words with similar meanings or grammatical roles (like "queen" and "king") are often used in similar contexts, leading to similar vector representations.
- Arithmetic operations on vectors, like addition and subtraction, approximate the difference between these semantic concepts.
