[All Resources for NLP](https://drive.google.com/drive/folders/1gAtmKXtB59sjqTuDl_Xj8S9Sp9DdjUH5?usp=sharing)


In [1]:
!pip install tensorflow_hub



In [2]:
!pip install tensorflow



In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
import numpy as np
import pandas as pd

import tensorflow_hub as hub


In [4]:
# Load the Wiki-words-250 text-embedding module from TensorFlow Hub.
# Calling `embed` on a list of strings returns one 250-dimensional float32
# vector per string (see the (1, 250) outputs of the two cells that follow).
embed = hub.load("https://tfhub.dev/google/Wiki-words-250/2")
# Model page: https://tfhub.dev/google/Wiki-words-250/2

In [5]:
# Embed a single token; the result is a (1, 250) float32 tensor.
embed(["king"])

<tf.Tensor: shape=(1, 250), dtype=float32, numpy=
array([[-0.1343142 , -0.13684654,  0.07431408, -0.10650009, -0.02859968,
         0.04424024, -0.03567196, -0.06899981,  0.01616227,  0.07217974,
        -0.01046093, -0.01912977,  0.05004472,  0.08152357,  0.03152218,
        -0.02048703,  0.05789423,  0.01240441, -0.05209014,  0.10748381,
        -0.00459293, -0.02017592, -0.04480452, -0.03096171, -0.04350654,
         0.00444236,  0.00460669, -0.10705121, -0.14162003, -0.01844652,
        -0.02916938,  0.03193855,  0.05675149, -0.0682263 , -0.04352165,
         0.03420302,  0.0014838 ,  0.04176674, -0.07689109,  0.0669754 ,
        -0.07601002, -0.07946104, -0.11850064,  0.08176526,  0.10359441,
        -0.0693299 , -0.0799439 , -0.08720363, -0.02146907, -0.01452966,
        -0.05733249, -0.00535188, -0.07032717, -0.05828672,  0.02124319,
         0.00817431,  0.12562938, -0.17722268, -0.01333303,  0.03352974,
         0.07383418,  0.08609763,  0.01466759,  0.05075613,  0.028285  ,
 

In [6]:
# Embed a second token for comparison; again a (1, 250) float32 tensor.
embed(["queen"])

<tf.Tensor: shape=(1, 250), dtype=float32, numpy=
array([[-0.06867649, -0.14200377,  0.02866129, -0.05169869, -0.04283558,
         0.03471284,  0.00563643, -0.07633342, -0.04744494, -0.00961031,
         0.05438118, -0.01875446,  0.04598984,  0.11078782,  0.06182237,
        -0.07148245,  0.08874689,  0.06116664, -0.04689367,  0.05333443,
         0.02776343, -0.0017771 , -0.00322503, -0.03645062, -0.03717954,
         0.03179449,  0.01319127, -0.02359019, -0.10966538,  0.03994623,
         0.06102228,  0.0508691 ,  0.10350815, -0.12094358, -0.01933583,
        -0.03739669,  0.00683476,  0.02680918, -0.00433959,  0.02936172,
        -0.07294036, -0.09880833, -0.1596977 ,  0.01566896,  0.10767547,
        -0.0835154 , -0.13671345, -0.03192026, -0.02783204, -0.04368471,
        -0.00748212,  0.04610368, -0.0745914 , -0.02966405,  0.00425638,
        -0.02484322,  0.08340865, -0.16547503,  0.03614734,  0.08827798,
        -0.01716653,  0.03805317, -0.06583317,  0.02210291,  0.01156285,
 

In [7]:
# Toy training set: four short reviews, two per sentiment class.
movie_reviews_train = [
    {'review': text, 'sentiment': label}
    for text, label in (
        ('this is the best movie', 'positive'),
        ('i recommend you watch this movie', 'positive'),
        ('it was waste of money and time', 'negative'),
        ('the worst movie ever', 'negative'),
    )
]
df = pd.DataFrame(movie_reviews_train)
df.head()

Unnamed: 0,review,sentiment
0,this is the best movie,positive
1,i recommend you watch this movie,positive
2,it was waste of money and time,negative
3,the worst movie ever,negative


In [8]:
def get_max_length(df):
    """
    Return the largest token count found in any entry of df['review'].

    Tokens are obtained by splitting each review on single spaces. The
    result (0 when the column has no rows) is used as the fixed sequence
    length of the RNN input.
    """
    token_counts = (len(text.split(" ")) for text in df['review'])
    return max(token_counts, default=0)

In [9]:
# max_length is the fixed sequence length (in tokens) of every model input.
# It is read as a module-level global by get_padded_encoded_reviews, so this
# cell has to run before preprocess() is called.
max_length = get_max_length(df)
print(max_length)

7


In [10]:
def get_max_length(df):
    """
    Return the highest number of space-separated tokens among the reviews
    in df['review']; the value serves as the fixed input length of the RNN.

    NOTE: this repeats the definition from the earlier cell and shadows it
    with identical behaviour.
    """
    longest = 0
    for text in df['review']:
        longest = max(longest, len(text.split(" ")))
    return longest

def get_word2vec_enc(reviews):
    """
    Embed every review token by token.

    Each review is split on single spaces and the resulting token list is
    passed to the module-level `embed` callable. The per-review results are
    returned as a list in the same order as `reviews`, ready to be padded
    and stacked into the RNN input.
    """
    return [embed(sentence.split(" ")) for sentence in reviews]

def get_padded_encoded_reviews(encoded_reviews, max_len=None):
    """
    Bring every encoded review to exactly `max_len` rows.

    Short reviews get zero rows prepended; reviews longer than `max_len`
    keep only their last `max_len` rows, so the returned matrices always
    stack into one regular 3-D array.

    Parameters
    ----------
    encoded_reviews : iterable of 2-D array-likes
        One (n_tokens, embedding_dim) matrix per review.
    max_len : int, optional
        Target number of rows. When omitted, the module-level `max_length`
        (computed from the training data) is used, as before.

    Returns
    -------
    list of numpy.ndarray
        One (max_len, embedding_dim) array per review, in input order.
    """
    if max_len is None:
        # Backward-compatible default: the notebook-level sequence length.
        max_len = max_length

    padded_reviews_encoding = []
    for enc_review in encoded_reviews:
        enc_review = np.asarray(enc_review)
        n_tokens = enc_review.shape[0]
        if n_tokens > max_len:
            # Drop leading tokens, mirroring the fact that padding is also
            # applied at the front of the sequence.
            enc_review = enc_review[n_tokens - max_len:]
            n_tokens = max_len
        # Build the whole padding block at once; its width follows the input
        # instead of assuming a fixed embedding size.
        pad = np.zeros((max_len - n_tokens, enc_review.shape[1]))
        padded_reviews_encoding.append(np.concatenate((pad, enc_review), axis=0))
    return padded_reviews_encoding

def sentiment_encode(sentiment):
    """
    Map a sentiment label to its one-hot target vector.

    'positive' yields [1, 0]; every other value yields [0, 1].
    """
    return [1, 0] if sentiment == 'positive' else [0, 1]

def preprocess(df):
    """
    Turn a review DataFrame into numeric arrays for the model.

    Reads the 'review' and 'sentiment' columns of `df` and returns (X, Y):
    X stacks the zero-padded word-vector sequences of all reviews and
    Y stacks the one-hot sentiment targets.
    """
    # Features: embed each review, then bring all sequences to one length.
    review_vectors = get_word2vec_enc(df['review'].tolist())
    padded_vectors = get_padded_encoded_reviews(review_vectors)

    # Targets: one-hot encode every sentiment label.
    one_hot_labels = list(map(sentiment_encode, df['sentiment'].tolist()))

    return np.array(padded_vectors), np.array(one_hot_labels)

In [11]:
# Encode the training frame: train_X holds the padded word-vector sequences,
# train_Y the matching one-hot sentiment targets.
train_X, train_Y = preprocess(df)

In [12]:
# LSTM model
# Seed Python, NumPy and TensorFlow before any weights are created so that a
# Restart-and-Run-All starts from the same random state every time.
tf.keras.utils.set_random_seed(42)

# A single 32-unit LSTM layer reads the word-vector sequence; the 2-unit
# softmax layer outputs one probability per sentiment class.
model = Sequential()
model.add(LSTM(32))
model.add(Dense(2, activation='softmax'))

In [13]:
# categorical_crossentropy pairs with the one-hot targets produced by
# sentiment_encode and with the 2-way softmax output of the model.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [14]:
# Train for 50 epochs on the four-review toy training set.
print('Train...')
model.fit(train_X, train_Y,epochs=50)

Train...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x78a3387e21d0>

In [15]:
"""
movie_reviews_train = [
         {'review': 'this is the best movie', 'sentiment': 'positive'},
         {'review': 'i recommend you watch this movie', 'sentiment': 'positive'},
         {'review': 'it was waste of money and time', 'sentiment': 'negative'},
         {'review': 'the worst movie ever', 'sentiment': 'negative'}
    ]
"""
# Held-out reviews used to evaluate the trained model.
# NOTE(review): reviews are tokenised by a plain split on spaces, so 'movie,'
# (trailing comma) and 'It' (capitalised) reach the embedding unchanged, and
# 'it is not so good movie' is labelled positive — confirm both are intended.
movie_reviews_test = [
    {'review': text, 'sentiment': label}
    for text, label in (
        ('it is not so good movie', 'positive'),
        ('i suggest you see this movie', 'positive'),
        ('it was just throwing 20 dollars away', 'negative'),
        ('worse than any show', 'negative'),
        ('nice movie, so love it', 'positive'),
        ('It was so scaring', 'negative'),
    )
]
test_df = pd.DataFrame(movie_reviews_test)

# Encode the test frame with the same pipeline as the training data.
test_X, test_Y = preprocess(test_df)

# evaluate() returns the loss followed by the compiled metric (accuracy).
score, acc = model.evaluate(test_X, test_Y, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

1/1 - 1s - loss: 0.1548 - accuracy: 1.0000 - 586ms/epoch - 586ms/step
Test score: 0.15483935177326202
Test accuracy: 1.0


## ***TEXT VECTORIZATION Example using CBOW and Skip-Gram models***

In [16]:
from gensim.models import Word2Vec

# Toy corpus: every sentence is already split into lowercase tokens.
sentences = [
    ["i", "love", "machine", "learning"],
    ["deep", "learning", "is", "fascinating"],
    ["word2vec", "is", "a", "technique", "in", "natural", "language", "processing"],
    ["building", "the", "deep", "learning", "models", "is", "easy"]
    # Add more sentences as needed
]

# Train both architectures with identical hyper-parameters; only `sg` differs
# (sg=0 selects CBOW, sg=1 selects Skip-gram).
cbow_model = Word2Vec(sentences=sentences, sg=0, vector_size=100, window=5, min_count=1)
skipgram_model = Word2Vec(sentences=sentences, sg=1, vector_size=100, window=5, min_count=1)

# Word whose nearest neighbours are compared across the two models
target_word = "deep"

# Five most similar words according to the CBOW model
similar_words_cbow = cbow_model.wv.most_similar(target_word, topn=5)
print(f"Similar words to '{target_word}' (CBOW): {similar_words_cbow}")

# Five most similar words according to the Skip-gram model
similar_words_skipgram = skipgram_model.wv.most_similar(target_word, topn=5)
print(f"Similar words to '{target_word}' (Skip-gram): {similar_words_skipgram}")


Similar words to 'deep' (CBOW): [('word2vec', 0.19911722838878632), ('in', 0.17272016406059265), ('models', 0.17022426426410675), ('machine', 0.14595220983028412), ('love', 0.06408823281526566)]
Similar words to 'deep' (Skip-gram): [('word2vec', 0.1991163194179535), ('in', 0.17271742224693298), ('models', 0.17024536430835724), ('machine', 0.14595457911491394), ('love', 0.06408510357141495)]


In [17]:
# Look up the 100-dimensional CBOW vector learned for the token "deep".
A = cbow_model.wv["deep"]

In [18]:
# CBOW vector for "word2vec", the top neighbour of "deep" in the output above.
B = cbow_model.wv["word2vec"]

In [19]:
import numpy as np
from numpy.linalg import norm
# Cosine similarity of A and B computed by hand: dot product divided by the
# product of the two vector norms. The printed value agrees with the score
# most_similar reported for 'word2vec' in the CBOW output above.
print(np.dot(A,B)/(norm(A)*norm(B)))

0.19911724


### ***TEXT VECTORIZATION Example using Pre-trained word vectors***
[GloVe [Global Vectors for Word Representation] word Vectors from Stanford NLP Group](https://nlp.stanford.edu/projects/glove/)

[FastText Word Embeddings](https://fasttext.cc/docs/en/crawl-vectors.html)

[Blog on Word Embeddings](https://medium.com/@hari4om/word-embedding-d816f643140)


In [20]:
from google.colab import drive

# Download the GloVe archive from https://nlp.stanford.edu/projects/glove/glove.6B.zip,
# unzip it, and upload one of the 50/100/200/300-dimensional .txt files to your Drive.
# Then set glove_file_path below to the location of that file.

# Mount Google Drive so the uploaded file is reachable under /content/drive.
drive.mount('/content/drive')
glove_file_path = '/content/drive/MyDrive/rep_Python/NLP/glove.6B.50d.txt'   # Colab path

# When running in a local Jupyter Notebook instead, point to the local copy:
# glove_file_path = 'D:/......../glove.6B.50d.txt'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# Load pre-trained word vectors (GloVe)
def load_word_vectors(file_path):
    """
    Read GloVe-style text embeddings into a dictionary.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated float components. Blank lines are skipped.

    Parameters
    ----------
    file_path : str or path-like
        UTF-8 text file with one "<word> <v1> <v2> ..." entry per line.

    Returns
    -------
    dict
        Maps each word to its vector as a list of floats.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float.
    """
    word_vectors = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            if not values:
                # Blank line (e.g. an extra newline at the end of the file):
                # there is nothing to parse and values[0] would raise IndexError.
                continue
            word, *components = values
            word_vectors[word] = [float(component) for component in components]
    return word_vectors

In [22]:
# Parse the GloVe file referenced by glove_file_path (set earlier in the
# notebook) into a {word: vector} dictionary.

word_vectors = load_word_vectors(glove_file_path)


In [23]:
# Check the size of the loaded word vectors.
# 'word' is only used as a probe key here: the length of its vector is
# reported as the dimensionality of the embeddings.
print("Number of word vectors:", len(word_vectors))
print("Vector dimensionality:", len(word_vectors['word']))


Number of word vectors: 400000
Vector dimensionality: 50


In [24]:
# Example usage: fetch the vector of one word and report a miss if it is absent
word = "example"
try:
    vector = word_vectors[word]
except KeyError:
    print(f"No vector found for '{word}'")
else:
    print(f"Vector for '{word}': {vector}")


Vector for 'example': [0.51564, 0.56912, -0.19759, 0.0080456, 0.41697, 0.59502, -0.053312, -0.83222, -0.21715, 0.31045, 0.09352, 0.35323, 0.28151, -0.35308, 0.23496, 0.04429, 0.017109, 0.0063749, -0.01662, -0.69576, 0.019819, -0.52746, -0.14011, 0.21962, 0.13692, -1.2683, -0.89416, -0.1831, 0.23343, -0.058254, 3.2481, -0.48794, -0.01207, -0.81645, 0.21182, -0.17837, -0.02874, 0.099358, -0.14944, 0.2601, 0.18919, 0.15022, 0.18278, 0.50052, -0.025532, 0.24671, 0.10596, 0.13612, 0.0090427, 0.39962]
