<a href="https://colab.research.google.com/github/arutraj/ML_Basics/blob/main/17_9_Word_Embeddings_in_Action_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word Embeddings in Action - Word2Vec

Steps to follow:

1. Get data
2. Clean text data
3. Tokenization
4. Prepare vocabulary
5. Download pre-trained embeddings
6. Get word vectors

In [1]:
# import required libraries
import numpy as np
import re

# 1. Get Data

In [2]:
#input text
text=['Building some bots for Wikipedia.',
      'Wikipedia is flooded with information!@#.',
      '&*^%There is an app for everthing.']

# 2. Text Cleaning

In [4]:
# cleaning
import re

def clean(text):
  #lower case
  text=text.lower()

  #remove punctuations
  text=re.sub('[^a-zA-Z]'," ",text)

  return text

In [5]:
#call the clean function
cleaned_text=[]

for i in text:
  cleaned_text.append(clean(i))

In [6]:
cleaned_text

['building some bots for wikipedia ',
 'wikipedia is flooded with information ',
 'there is an app for everthing ']

# 3. Tokenization

In [7]:
#tokenize the text
tokens=[]

for i in cleaned_text:
  tokens.append(i.split())

print(tokens)

[['building', 'some', 'bots', 'for', 'wikipedia'], ['wikipedia', 'is', 'flooded', 'with', 'information'], ['there', 'is', 'an', 'app', 'for', 'everthing']]


# 4. Vocabulary Preparation

In [8]:
#construct vocabulary
vocab=[]

for i in tokens:
  for j in i:
    if j not in vocab:
      vocab.append(j)

#remove duplicate token
vocab = list(set(vocab))

print(vocab)

['an', 'for', 'app', 'everthing', 'some', 'information', 'building', 'wikipedia', 'flooded', 'with', 'bots', 'is', 'there']


#5. Feature Representation (word2vec)

### Download Google's pre-trained Word2Vec


In [18]:
# download and extract word2vec embeddings
#! wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
! wget -c "https://radimrehurek.com/gensim/models/word2vec.html"
#! wget -c "https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/view?resourcekey=0-wjGZdNAUop6WykTtMip30g"
#! gunzip GoogleNews-vectors-negative300.bin.gz


--2024-06-19 21:38:27--  https://radimrehurek.com/gensim/models/word2vec.html
Resolving radimrehurek.com (radimrehurek.com)... 188.114.97.0, 188.114.96.0, 2a06:98c1:3121::, ...
Connecting to radimrehurek.com (radimrehurek.com)|188.114.97.0|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘word2vec.html’

word2vec.html           [ <=>                ] 133.55K  --.-KB/s    in 0.01s   

2024-06-19 21:38:27 (10.3 MB/s) - ‘word2vec.html’ saved [136752]



In [19]:
from gensim.models import KeyedVectors

# path of the downloaded model
#filename = 'GoogleNews-vectors-negative300.bin'
filename = 'word2vec.html'

# load into gensim
w2vec = KeyedVectors.load_word2vec_format(filename, binary=True)

ValueError: not enough values to unpack (expected 2, got 0)

Once you have executed the above code, your word2vec embeddings are finally installed and loaded.



<br>

Please note that the length of every vector of the pre-trained word2vec embeddings is 300.


In [None]:
# empty array of shape (no. of tokens X 300) to store word2vec features
wordvec_array = np.zeros((len(vocab), 300))

for i,j in enumerate(vocab):
  wordvec_array[i,:] = w2vec.wv.word_vec(j)

In [None]:
wordvec_array

array([[ 0.12597656,  0.19042969,  0.06982422, ...,  0.0612793 ,
         0.17285156, -0.07861328],
       [ 0.11572266, -0.29101562, -0.30664062, ..., -0.24609375,
        -0.17773438,  0.16113281],
       [ 0.41992188,  0.12011719, -0.06787109, ...,  0.00836182,
        -0.25976562,  0.0279541 ],
       ...,
       [ 0.09423828, -0.02282715,  0.05224609, ..., -0.046875  ,
         0.16113281, -0.19921875],
       [ 0.21875   , -0.12207031, -0.00296021, ..., -0.35351562,
        -0.25195312, -0.11621094],
       [-0.02490234,  0.02197266, -0.03540039, ...,  0.01080322,
        -0.01879883, -0.06884766]])