## Word2vec using Gensim

**Author: Abhishek Dey**

## Installation

In [1]:
!pip3 install gensim

Defaulting to user installation because normal site-packages is not writeable
    extract-msg (<=0.29.*)
                 ~~~~~~~^

[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python3 -m pip install --upgrade pip


## Documents

In [2]:
# Six short example sentences; together they form the toy corpus used below.
d1, d2, d3, d4, d5, d6 = (
    "Apple and oranges are fruits",
    "iphone is a product of Apple",
    "Lion is roaming in the Jungle",
    "Where is the Tiger ?",
    "Samsung mobiles are popular in China",
    "India is one of the biggest exporter of Cocunut",
)

## Corpus

In [3]:
# Collect the six documents, in order, into one list.
corpus = [d1, d2, d3, d4, d5, d6]

In [4]:
# Echo the raw corpus as the cell output.
corpus

['Apple and oranges are fruits',
 'iphone is a product of Apple',
 'Lion is roaming in the Jungle',
 'Where is the Tiger ?',
 'Samsung mobiles are popular in China',
 'India is one of the biggest exporter of Cocunut']

## Text-preprocessing

In [5]:
from textacy import preprocessing as tp

In [6]:
def text_preprocessing(text):
    """Clean one document for tokenization.

    Applies, in this order: ``tp.remove.punctuation``,
    ``tp.normalize.whitespace``, and ``str.lower``.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    str
        The cleaned, lowercased document.
    """
    without_punctuation = tp.remove.punctuation(text)
    normalized = tp.normalize.whitespace(without_punctuation)
    return normalized.lower()

In [7]:
# Run every document in the corpus through text_preprocessing.
filtered_corpus = list(map(text_preprocessing, corpus))

In [8]:
# Print the cleaned documents one per line.
for cleaned_doc in filtered_corpus:
    print(cleaned_doc)

apple and oranges are fruits
iphone is a product of apple
lion is roaming in the jungle
where is the tiger
samsung mobiles are popular in china
india is one of the biggest exporter of cocunut


## Tokenization

In [9]:
from nltk import word_tokenize

In [15]:
# Split each cleaned document into a list of word tokens.
# NOTE(review): word_tokenize relies on NLTK's tokenizer data being installed
# (normally fetched with nltk.download); no download step is visible in this
# notebook, so confirm the data is available before running.
tokenized_corpus = list(map(word_tokenize, filtered_corpus))

In [16]:
# Print the token list of each document, one list per line.
for tokens in tokenized_corpus:
    print(tokens)

['apple', 'and', 'oranges', 'are', 'fruits']
['iphone', 'is', 'a', 'product', 'of', 'apple']
['lion', 'is', 'roaming', 'in', 'the', 'jungle']
['where', 'is', 'the', 'tiger']
['samsung', 'mobiles', 'are', 'popular', 'in', 'china']
['india', 'is', 'one', 'of', 'the', 'biggest', 'exporter', 'of', 'cocunut']


## Word2vec

In [17]:
import gensim
from gensim.models import Word2Vec

## Train a Word2Vec model on the tokenized corpus

In [18]:
# seed=1 is gensim's default, written out so the run is explicitly seeded.
# workers=1 because gensim documents that a fully deterministic run requires a
# single worker thread (multi-threaded training adds ordering jitter from OS
# thread scheduling).
# min_count=1 keeps every token, so no word is dropped from the vocabulary.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

## Get the word vector for 'samsung'

In [19]:
# Fetch the learned embedding for the token "samsung" from the model's word vectors.
vector1 = model.wv["samsung"]

In [20]:
# Show the embedding for "samsung" as the cell output.
vector1

array([-4.9735666e-03, -1.2833046e-03,  3.2806373e-03, -6.4140330e-03,
       -9.7015891e-03, -9.2602335e-03,  9.0206973e-03,  5.3716935e-03,
       -4.7882269e-03, -8.3296420e-03,  1.2939513e-03,  2.8780627e-03,
       -1.2452841e-03,  1.2708711e-03, -4.3213032e-03,  4.7913659e-03,
        1.4751852e-03,  8.8778250e-03, -9.9765137e-03, -5.2695703e-03,
       -9.1028418e-03, -3.4791828e-04, -7.8573059e-03,  5.0312411e-03,
       -6.3968552e-03, -5.9528374e-03,  5.0709103e-03, -8.1597688e-03,
        1.4552021e-03, -7.2395410e-03,  9.8624220e-03,  8.6337589e-03,
        1.7689526e-03,  5.7885037e-03,  4.5962157e-03, -5.9917830e-03,
        9.7569469e-03, -9.6822055e-03,  8.0492580e-03,  2.7563798e-03,
       -3.0551220e-03, -3.5618627e-03,  9.0719536e-03, -5.4409085e-03,
        8.1868721e-03, -6.0088872e-03,  8.3913766e-03, -5.5549381e-04,
        7.9425992e-03, -3.1549716e-03,  5.9792148e-03,  8.8043455e-03,
        2.5438380e-03,  1.3177490e-03,  5.0391923e-03,  8.0025224e-03,
      

In [21]:
# Number of components in the embedding; matches the vector_size=100 passed to Word2Vec.
len(vector1)

100

## Get the word vector for 'jungle'

In [22]:
# Fetch the learned embedding for the token "jungle" from the model's word vectors.
vector2 = model.wv["jungle"]

In [23]:
# Show the embedding for "jungle" as the cell output.
vector2

array([ 1.3325238e-03,  6.5408563e-03,  9.9846032e-03,  9.0624550e-03,
       -8.0153607e-03,  6.4913859e-03, -5.7147373e-03, -9.7157480e-04,
        4.8282265e-04,  6.5819337e-03,  4.4701435e-03,  4.6037268e-03,
        9.4831241e-03,  3.8266421e-04, -6.0387133e-03, -6.3301004e-03,
        6.4317896e-03, -5.2425815e-03, -2.8498126e-03,  4.0752841e-03,
       -2.2902358e-03, -6.0252128e-03, -2.3236549e-03,  1.2069190e-03,
        2.1833598e-03,  6.0837734e-03, -5.2140011e-03,  3.0779613e-03,
        7.2406759e-03,  2.1951138e-03,  5.3974902e-03, -4.8453333e-03,
        6.1526122e-03, -7.6012816e-03,  3.4928655e-03, -9.3218042e-03,
       -2.6043104e-03, -9.0731988e-03, -1.5882683e-03, -5.3647519e-03,
       -3.9439187e-03,  1.1536527e-03,  2.8003477e-03, -1.5263951e-03,
       -8.1705153e-03, -5.9180222e-03,  8.1929564e-04, -3.9462578e-03,
       -9.4304476e-03, -7.7497482e-04,  6.6332687e-03,  5.9788441e-03,
       -9.9172592e-03,  3.1185830e-03, -5.9873010e-03, -9.1818906e-03,
      

## Get words similar to 'india'

In [25]:
# Ask the model for the three words it scores as most similar to "india";
# the result is a list of (word, similarity) pairs.
similar_words = model.wv.most_similar("india", topn=3)

print(similar_words)

[('is', 0.21883949637413025), ('mobiles', 0.1747463196516037), ('popular', 0.16378773748874664)]


## Note

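* The words returned as most similar to 'india' are not semantically related to it, so the result is inaccurate.

* This is because word2vec requires a large amount of data to learn meaningful word relationships; a corpus of six short sentences is far too small.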