[All Resources for NLP](https://drive.google.com/drive/folders/1gAtmKXtB59sjqTuDl_Xj8S9Sp9DdjUH5?usp=sharing)


In [1]:
!pip install tensorflow_hub



In [2]:
!pip install tensorflow



## ***TEXT VECTORIZATION Example using CBOW and Skip-Gram models***

In [3]:
from gensim.models import Word2Vec

# Toy corpus: each inner list is one already-tokenized, lower-cased sentence.
sentences = [
    ["i", "love", "machine", "learning"],
    ["deep", "learning", "is", "fascinating"],
    ["word2vec", "is", "a", "technique", "in", "natural", "language", "processing"],
    ["building", "the", "deep", "learning", "models", "is", "easy"]
    # Add more sentences as needed
]

# CBOW variant (sg=0), kept for reference:
# cbow_model = Word2Vec(sentences, vector_size=100, window=2, sg=0, min_count=1)

# Skip-gram model (sg=1) producing 10-dimensional vectors with window=2.
# min_count=2 discards every word that occurs fewer than twice, which on this
# corpus leaves only "learning", "deep" and "is" in the vocabulary.
# seed and workers=1 are set explicitly so that re-running the cell reproduces
# the same vectors: the gensim documentation notes that a fixed seed alone is
# not enough because multi-threaded training is subject to scheduling jitter.
skipgram_model = Word2Vec(
    sentences,
    vector_size=10,
    window=2,
    sg=1,
    min_count=2,
    seed=1,
    workers=1,
)



In [4]:
# Example word for similarity comparison
target_word = "learning"

# Get similar words using CBOW model (disabled: cbow_model is commented out
# in the training cell, so these lines would raise NameError if enabled).
# similar_words_cbow = cbow_model.wv.most_similar(target_word, topn=2)
# print(f"Similar words to '{target_word}' (CBOW): {similar_words_cbow}")

# Get similar words using Skip-gram model.
# The result is a list of (word, similarity) pairs. topn=5 is only an upper
# bound: the model was trained with min_count=2, so the vocabulary holds just
# three words and at most two neighbours can be returned.
similar_words_skipgram = skipgram_model.wv.most_similar(target_word, topn=5)
print(f"Similar words to '{target_word}' (Skip-gram): {similar_words_skipgram}")


Similar words to 'learning' (Skip-gram): [('is', 0.5436006188392639), ('deep', 0.32937225699424744)]


In [6]:
### To get word vector of any particular word
# Index `wv` by the word itself; the result has vector_size (10) components.
A = skipgram_model.wv["learning"]
A.shape

(10,)

In [7]:
# Show the raw embedding values learned for "learning".
A

array([ 0.07380505, -0.01533471, -0.04536613,  0.06554051, -0.0486016 ,
       -0.01816018,  0.0287658 ,  0.00991874, -0.08285215, -0.09448818],
      dtype=float32)

In [10]:
# Vector of a second vocabulary word, used below to compare against A.
B = skipgram_model.wv["deep"]
B.shape

(10,)

In [11]:
import numpy as np
from numpy.linalg import norm
# Cosine similarity computed by hand: dot product of A and B divided by the
# product of their Euclidean norms. The value matches the score that
# most_similar reported for 'deep' relative to 'learning'.
print(np.dot(A,B)/(norm(A)*norm(B)))

0.32937223


### Loading TOKEN BASED TEXT EMBEDDING trained on English Wikipedia Corpus

In [12]:
import tensorflow_hub as hub
# Load the Wiki-words-250 token-based text embedding from TensorFlow Hub.
# `embed` is then called directly on a list of strings.
embed = hub.load("https://tfhub.dev/google/Wiki-words-250/2")
# Model page: https://tfhub.dev/google/Wiki-words-250/2

In [13]:
# Embed a one-element batch; the result is a (1, 250) tensor, i.e. one
# 250-dimensional row for the single input string.
v1 = embed(["king"])
v1.shape
# v1[0].shape

TensorShape([1, 250])

In [14]:
# Second one-element batch, to be compared against v1 below.
v2 = embed(["cat"])
v2.shape

TensorShape([1, 250])

In [15]:
# Manual NumPy alternative, kept for reference:
# from numpy import dot
# from numpy.linalg import norm
# cos_sim = np.dot(v1, v2.T)/(norm(v1)*norm(v2))

from scipy import spatial
# scipy's `cosine` is a *distance* (1 - cosine similarity), so subtract it
# from 1 to get the similarity. v1[0] / v2[0] pick the single row out of each
# (1, 250) batch so that 1-D vectors are compared.
result = 1 - spatial.distance.cosine(v1[0], v2[0])

result

0.020226946100592613

### ***TEXT VECTORIZATION Example using Pre-trained word vectors***
[GloVe [Global Vectors for Word Representation] word Vectors from Stanford NLP Group](https://nlp.stanford.edu/projects/glove/)

[FastText Word Embeddings](https://fasttext.cc/docs/en/crawl-vectors.html)

[Blog on Word Embeddings](https://medium.com/@hari4om/word-embedding-d816f643140)


In [16]:
from google.colab import drive

### Get glove.6B.zip from the GloVe project page: https://nlp.stanford.edu/projects/glove/
### Unzip it and upload any one of the 50/100/200/300-dimensional .txt files to your Drive.
# Then set glove_file_path below to the location of that uploaded file.

# Colab only: mount Google Drive under /content/drive so the file can be read.
drive.mount('/content/drive')
glove_file_path = '/content/drive/MyDrive/rep_Python/NLP/glove.6B.50d.txt'   ### For COLAB

# When running in a local Jupyter Notebook, point to the local copy instead:
# glove_file_path = 'D:/......../glove.6B.50d.txt'   ### For Jupyter Notebook


Mounted at /content/drive


In [18]:
# Load pre-trained word vectors (GloVe)
def load_word_vectors(file_path):
    """Read a GloVe-style text embedding file into a dictionary.

    Every non-blank line is expected to hold a token followed by its vector
    components, all separated by whitespace, e.g. ``king 0.1 -0.2 0.3``.

    Args:
        file_path: Path to the UTF-8 encoded embedding text file.

    Returns:
        dict mapping each word (str) to its vector as a list of floats.
        If a word appears on several lines, the last occurrence wins.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    word_vectors = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # Blank or whitespace-only lines (for example a stray empty line
            # at the end of the file) carry no entry; indexing values[0] on
            # them would raise IndexError, so skip them.
            if not values:
                continue
            word, *components = values
            word_vectors[word] = [float(component) for component in components]
    return word_vectors

In [19]:
### Get glove.6B.zip from the GloVe project page: https://nlp.stanford.edu/projects/glove/
### Unzip it and upload any one of the 50/100/200/300-dimensional .txt files to your Drive.
# Adjust glove_file_path (set in the Drive-mount cell) to point to your downloaded GloVe file.

# Parse the whole file into an in-memory {word: list-of-floats} dictionary.
word_vectors = load_word_vectors(glove_file_path)


In [20]:
# Check the size of the loaded word vectors
print("Number of word vectors:", len(word_vectors))
# Read the dimensionality from the first stored vector instead of from a
# hard-coded vocabulary entry ('word'), which would raise KeyError for any
# embedding file that does not contain that token. An empty dictionary
# reports a dimensionality of 0.
print("Vector dimensionality:", len(next(iter(word_vectors.values()), [])))


Number of word vectors: 400000
Vector dimensionality: 50


In [21]:
# Each vector is stored as a plain Python list of floats, not a NumPy array.
type(word_vectors['happy'])

list

In [22]:
# Example usage: look up one word, reporting a miss instead of raising KeyError
word = "example"
if word not in word_vectors:
    print(f"No vector found for '{word}'")
else:
    vector = word_vectors[word]
    print(f"Vector for '{word}': {vector}")


Vector for 'example': [0.51564, 0.56912, -0.19759, 0.0080456, 0.41697, 0.59502, -0.053312, -0.83222, -0.21715, 0.31045, 0.09352, 0.35323, 0.28151, -0.35308, 0.23496, 0.04429, 0.017109, 0.0063749, -0.01662, -0.69576, 0.019819, -0.52746, -0.14011, 0.21962, 0.13692, -1.2683, -0.89416, -0.1831, 0.23343, -0.058254, 3.2481, -0.48794, -0.01207, -0.81645, 0.21182, -0.17837, -0.02874, 0.099358, -0.14944, 0.2601, 0.18919, 0.15022, 0.18278, 0.50052, -0.025532, 0.24671, 0.10596, 0.13612, 0.0090427, 0.39962]
