<a href="https://colab.research.google.com/github/anshupandey/Working_with_Large_Language_models/blob/main/WWL_C4_Word2vec_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploring a pretrained word-embedding model: GloVe

In [10]:
import gensim.downloader as api

# Word whose embedding is inspected below.
word = 'python'
# Fetch the pretrained GloVe vectors through gensim's downloader.
# 'glove-wiki-gigaword-50' is one of several available variants; other dimensions/models can be used instead.
glove_model = api.load('glove-wiki-gigaword-50')
word_vector = glove_model[word]
# Print the vector's shape to confirm its dimensionality.
print(word_vector.shape)

(50,)


In [11]:
# Look up the embedding for "hello"; the bare expression shows the raw vector as the cell output.
glove_model['hello']

array([-0.38497 ,  0.80092 ,  0.064106, -0.28355 , -0.026759, -0.34532 ,
       -0.64253 , -0.11729 , -0.33257 ,  0.55243 , -0.087813,  0.9035  ,
        0.47102 ,  0.56657 ,  0.6985  , -0.35229 , -0.86542 ,  0.90573 ,
        0.03576 , -0.071705, -0.12327 ,  0.54923 ,  0.47005 ,  0.35572 ,
        1.2611  , -0.67581 , -0.94983 ,  0.68666 ,  0.3871  , -1.3492  ,
        0.63512 ,  0.46416 , -0.48814 ,  0.83827 , -0.9246  , -0.33722 ,
        0.53741 , -1.0616  , -0.081403, -0.67111 ,  0.30923 , -0.3923  ,
       -0.55002 , -0.68827 ,  0.58049 , -0.11626 ,  0.013139, -0.57654 ,
        0.048833,  0.67204 ], dtype=float32)

In [12]:
# Shape of a single word vector, i.e. the embedding dimensionality of the loaded model.
glove_model['hello'].shape

(50,)

In [13]:
# Embeddings for two related words ("king", "emperor") and one unrelated word ("table"),
# compared with cosine similarity in the following cells.
v1, v2, v3 = (glove_model[token] for token in ("king", "emperor", "table"))

In [14]:
import numpy as np

# Cosine similarity of v1 and v2: dot product divided by the product of the vector norms.
norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
similarity = np.dot(v1, v2) / norm_product
print(similarity)


0.7736248


In [15]:
# Same cosine-similarity computation, this time between v1 and v3.
norm_product = np.linalg.norm(v1) * np.linalg.norm(v3)
similarity = np.dot(v1, v3) / norm_product
print(similarity)

0.2848273


In [16]:
# Nearest neighbours of "stocks" in the embedding space, returned as (word, similarity) pairs.
glove_model.most_similar("stocks")

[('stock', 0.8653818368911743),
 ('markets', 0.8522835969924927),
 ('prices', 0.8431004285812378),
 ('market', 0.8400351405143738),
 ('traders', 0.8257467150688171),
 ('trading', 0.8112872838973999),
 ('investors', 0.8083530068397522),
 ('indexes', 0.7902355194091797),
 ('dealers', 0.7884277701377869),
 ('shares', 0.7868536114692688)]

In [17]:
# Nearest neighbours of "trading", returned as (word, similarity) pairs.
glove_model.most_similar("trading")

[('stock', 0.9012669920921326),
 ('exchange', 0.898104190826416),
 ('futures', 0.8487032651901245),
 ('trades', 0.8236047029495239),
 ('traded', 0.8166490793228149),
 ('stocks', 0.8112873435020447),
 ('market', 0.8051413893699646),
 ('prices', 0.7966799139976501),
 ('closing', 0.7950035929679871),
 ('closed', 0.7914804220199585)]

In [18]:
# Nearest neighbours of "amazing", returned as (word, similarity) pairs.
glove_model.most_similar("amazing")

[('incredible', 0.9189565181732178),
 ('fantastic', 0.8799790143966675),
 ('awesome', 0.8620665669441223),
 ('wonderful', 0.8537988662719727),
 ('terrific', 0.8482187390327454),
 ('marvelous', 0.8439217805862427),
 ('astonishing', 0.8103041052818298),
 ('remarkable', 0.8091045022010803),
 ('exciting', 0.79411780834198),
 ('unbelievable', 0.7916541695594788)]

## Text Classification: Sentiment analysis

In [19]:
import pandas as pd

In [20]:
# Load the labelled sentiment dataset directly from the course repository (requires network access).
url = "https://raw.githubusercontent.com/anshupandey/Working_with_Large_Language_models/main/sentimentdata.csv"
df = pd.read_csv(url)
# Display (rows, columns) as a quick sanity check on what was loaded.
df.shape

(20, 2)

In [21]:
# Split the frame into the raw documents (features) and their sentiment labels (targets).
x, y = df['document'], df['label']

### Vectorization using pretrained GloVe word vectors

In [22]:
import nltk

# Tokenizer models used by word_tokenize.
nltk.download('punkt')
# Recent NLTK releases make word_tokenize look up 'punkt_tab' instead of 'punkt';
# without it a fresh kernel fails with LookupError at tokenization time.
# On older releases that do not know this package the call just reports it and returns False.
nltk.download('punkt_tab')
# Stop-word lists used when filtering tokens; kept last so the cell still displays its result.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
# English stop words from NLTK, stored as a set so the per-token membership test in
# preprocess_text is a constant-time lookup.
stop_words = set(stopwords.words('english'))

In [24]:
def preprocess_text(text):
    """Lower-case and tokenize ``text``, keeping only alphabetic non-stop-word tokens.

    Args:
        text: Raw sentence or document string.

    Returns:
        List of tokens, in their original order, that consist solely of
        letters and are not in the global ``stop_words`` collection.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Drop punctuation, numbers and mixed tokens.
        if not token.isalpha():
            continue
        # Drop stop words.
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def get_sent_to_vec(sent):
    """Embed a sentence as the mean of the GloVe vectors of its known tokens.

    The sentence is tokenized with ``preprocess_text``; tokens missing from
    ``glove_model`` are skipped and do not count towards the average, so
    out-of-vocabulary words no longer shrink the resulting vector.

    Args:
        sent: Raw sentence string.

    Returns:
        1-D float array of length ``glove_model.vector_size``. It is all zeros
        when no token of the sentence is present in the embedding vocabulary.
    """
    # Take the dimensionality from the model instead of hard-coding 50, so a
    # different pretrained variant can be loaded without editing this function.
    sent_vec = np.zeros(glove_model.vector_size)
    n_known = 0
    for word in preprocess_text(sent):
        if word in glove_model:
            sent_vec += glove_model[word]
            n_known += 1
    # Divide by the number of vectors actually summed; avoid 0/0 for empty input.
    return sent_vec / n_known if n_known else sent_vec


In [37]:
# Documents and labels, then one sentence embedding per document stacked into a 2-D array.
x, y = df['document'], df['label']
xvec = np.array(list(map(get_sent_to_vec, x)))

### Sentiment Analysis with ML

In [38]:
from sklearn.neural_network import MLPClassifier

# Two hidden layers of 100 units each; fixed random_state for repeatable training.
model = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    max_iter=9000,
    random_state=5,
)
# Fit the classifier on the sentence embeddings and their labels.
model.fit(xvec, y)

In [39]:
# Probe: plainly positive sentence.
ip = "Noodles are good"
# predict() expects a 2-D array, so the single embedding is reshaped to one row.
ipvec = get_sent_to_vec(ip).reshape(1, -1)
model.predict(ipvec)

array(['Positive'], dtype='<U8')

In [40]:
# Probe: plainly negative sentence.
ip = "Noodles are bad"
# predict() expects a 2-D array, so the single embedding is reshaped to one row.
ipvec = get_sent_to_vec(ip).reshape(1, -1)
model.predict(ipvec)

array(['Negative'], dtype='<U8')

In [41]:
# Probe: negated positive sentence. Stop-word filtering in preprocess_text may
# remove "not" before embedding (verify against stop_words).
ip = "Noodles are not good"
ipvec = get_sent_to_vec(ip).reshape(1, -1)
model.predict(ipvec)

array(['Positive'], dtype='<U8')

In [42]:
# Probe: negated negative sentence. Stop-word filtering in preprocess_text may
# remove "not" before embedding (verify against stop_words).
ip = "Noodles are not bad"
ipvec = get_sent_to_vec(ip).reshape(1, -1)
model.predict(ipvec)

array(['Negative'], dtype='<U8')

In [43]:
# Probe: another negated negative sentence, using a different adjective.
ip = "Noodles are not poor"
ipvec = get_sent_to_vec(ip).reshape(1, -1)
model.predict(ipvec)

array(['Negative'], dtype='<U8')

In [44]:
# Probe: strongly positive adjective.
ip = "Noodles are amazing"
ipvec = get_sent_to_vec(ip).reshape(1, -1)
model.predict(ipvec)

array(['Positive'], dtype='<U8')

In [88]:
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Training documents and labels as plain NumPy arrays.
texts = x.values
labels = y.values

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# Use scikit-learn's English stop words but keep "not", so negation survives vectorization.
# Sorted for a deterministic list order.
# NOTE: this rebinds the global `stop_words` that preprocess_text reads; that function
# will use this list instead of the NLTK set if it is called after this cell.
stop_words = sorted(ENGLISH_STOP_WORDS - {"not"})


# Fit the TF-IDF vocabulary / IDF weights on the training documents.
vec = TfidfVectorizer(lowercase=True, stop_words=stop_words)
# fit() returns the fitted vectorizer itself, so `xtfidf` is the same object as `vec`
# (it is not a TF-IDF matrix); the name is kept for backward compatibility.
xtfidf = vec.fit(texts)
tfidf_feature_names = vec.get_feature_names_out()

# One GloVe vector per TF-IDF feature, in the same order as the vectorizer's columns.
# Features missing from the GloVe vocabulary get a zero vector instead of raising
# KeyError, so they contribute nothing to a sentence embedding.
_oov_vector = np.zeros(glove_model.vector_size, dtype=np.float32)
xwv = [
    glove_model[word] if word in glove_model else _oov_vector
    for word in tfidf_feature_names
]

In [89]:
def get_sent_to_vec(sent):
    """Embed a sentence as a TF-IDF-weighted combination of word vectors.

    This definition replaces the earlier plain-averaging ``get_sent_to_vec``
    from the point where this cell is executed.

    Args:
        sent: Raw sentence string; it is vectorized with the fitted ``vec``.

    Returns:
        1-D array with one entry per embedding dimension. Each row of ``xwv``
        is scaled by the sentence's TF-IDF weight for that feature and the
        scaled rows are averaged over the whole TF-IDF vocabulary (not just
        over the words present in the sentence).
    """
    # Row vector of TF-IDF weights for this sentence, flattened to 1-D.
    weights = vec.transform([sent]).toarray().ravel()
    # Column of weights broadcast against the (n_features, dim) word vectors.
    weighted_vectors = weights[:, None] * xwv
    return weighted_vectors.mean(axis=0)

In [114]:
# Re-embed every document with the current get_sent_to_vec and stack into a 2-D array.
xvec = np.array(list(map(get_sent_to_vec, x)))

# Logistic regression with default settings; this rebinds `model`.
model = LogisticRegression()
model.fit(xvec, y)

In [115]:
# Probe: plainly positive sentence.
ip = "Noodles are good"
# predict() expects a 2-D array, so the single embedding is reshaped to one row.
ipvec = get_sent_to_vec(ip).reshape(1, -1)
model.predict(ipvec)

array(['Positive'], dtype=object)

In [116]:
# Probe: plainly negative sentence.
ip = "Noodles are bad"
# predict() expects a 2-D array, so the single embedding is reshaped to one row.
ipvec = get_sent_to_vec(ip).reshape(1, -1)
model.predict(ipvec)

array(['Negative'], dtype=object)

In [117]:
# Probe: negated positive sentence.
ip = "Noodles are not good"
ipvec = get_sent_to_vec(ip).reshape(1, -1)
model.predict(ipvec)

array(['Negative'], dtype=object)

In [118]:
# Probe: negated negative sentence.
ip = "Noodles are not bad"
ipvec = get_sent_to_vec(ip).reshape(1, -1)
model.predict(ipvec)

array(['Negative'], dtype=object)