Part 1: Using Pretrained Word2Vec Model

In [8]:
import gensim.downloader as api

# Load the pretrained word2vec-google-news-300 model
model = api.load("word2vec-google-news-300")



In [9]:
# Pick 5 words and find similar words
words = ["happy", "movie", "actor", "love", "sad"]
similar_words = {word: model.most_similar(word) for word in words}

# Display similar words
for word, similar in similar_words.items():
    print(f"Words similar to '{word}':")
    for similar_word, similarity in similar:
        print(f"  {similar_word}: {similarity}")
    print()

Words similar to 'happy':
  glad: 0.7408890724182129
  pleased: 0.6632170677185059
  ecstatic: 0.6626912355422974
  overjoyed: 0.6599286794662476
  thrilled: 0.6514049172401428
  satisfied: 0.6437949538230896
  proud: 0.636042058467865
  delighted: 0.627237856388092
  disappointed: 0.6269949674606323
  excited: 0.6247665286064148

Words similar to 'movie':
  film: 0.8676770329475403
  movies: 0.8013108372688293
  films: 0.7363011837005615
  moive: 0.6830360889434814
  Movie: 0.6693680286407471
  horror_flick: 0.6577848792076111
  sequel: 0.6577793955802917
  Guy_Ritchie_Revolver: 0.650975227355957
  romantic_comedy: 0.6413198709487915
  flick: 0.6321909427642822

Words similar to 'actor':
  actress: 0.7930010557174683
  Actor: 0.7446156740188599
  thesp: 0.6954971551895142
  thespian: 0.6651668548583984
  actors: 0.6519852876663208
  funnyman: 0.635244607925415
  comedian_Dom_DeLuise: 0.6245246529579163
  entertainer: 0.6184110641479492
  Shakespearean_actor: 0.6067742705345154
  Oscar

In [11]:
# Analogies
analogies = [
    ("king", "man", "woman"),  # Expected result: 'queen'
    ("Paris", "France", "Germany"),  # Expected result: 'Berlin'
    ("big", "bigger", "small"),  # Expected result: 'smaller'
]

for a, b, c in analogies:
    result = model.most_similar(positive=[a, c], negative=[b])
    print(f"{a} - {b} + {c} = {result[0][0]} (similarity: {result[0][1]})")

king - man + woman = queen (similarity: 0.7118193507194519)
Paris - France + Germany = Berlin (similarity: 0.7644002437591553)
big - bigger + small = large (similarity: 0.6242177486419678)


Part 2: Movie Review Sentiment Classifier Using Word2Vec


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
# Cleaning the data

import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove punctuation, special characters, and stopwords
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

df['cleaned_review'] = df['review'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


In [5]:
import numpy as np

#Train Custom Models using Skip Gram and CBoW Vectors

from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


#Tokenize Reviews

tokenized_reviews = [review.split() for review in df['cleaned_review']]

# Split data
X_train, X_test, y_train, y_test = train_test_split(tokenized_reviews, df['sentiment'], test_size=0.2, random_state=42)

# Train Word2Vec models
sg_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4, sg=1)  # Skip-gram
cbow_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4, sg=0)  # CBOW

# Get average word vectors for each review
def get_average_word2vec(tokens_list, model, vector_size):
    # Remove out-of-vocabulary words
    tokens_list = [token for token in tokens_list if token in model.wv.index_to_key]
    if not tokens_list:
        return np.zeros(vector_size)
    return np.mean(model.wv[tokens_list], axis=0)

X_train_sg = [get_average_word2vec(review, sg_model, 100) for review in X_train]
X_test_sg = [get_average_word2vec(review, sg_model, 100) for review in X_test]

X_train_cbow = [get_average_word2vec(review, cbow_model, 100) for review in X_train]
X_test_cbow = [get_average_word2vec(review, cbow_model, 100) for review in X_test]

# Train logistic regression models
lr_sg = LogisticRegression(max_iter=1000)
lr_sg.fit(X_train_sg, y_train)
y_pred_sg = lr_sg.predict(X_test_sg)
accuracy_sg = accuracy_score(y_test, y_pred_sg)

lr_cbow = LogisticRegression(max_iter=1000)
lr_cbow.fit(X_train_cbow, y_train)
y_pred_cbow = lr_cbow.predict(X_test_cbow)
accuracy_cbow = accuracy_score(y_test, y_pred_cbow)

In [12]:
# Train Model Using Pretrained Word2Vec Vectors

import numpy as np

def get_average_pretrained_word2vec(tokens_list, model, vector_size):
    tokens_list = [token for token in tokens_list if token in model.key_to_index]
    if not tokens_list:
        return np.zeros(vector_size)
    return np.mean([model[token] for token in tokens_list], axis=0)

X_train_pretrained = [get_average_pretrained_word2vec(review, model, 300) for review in X_train]
X_test_pretrained = [get_average_pretrained_word2vec(review, model, 300) for review in X_test]

# Train logistic regression model
lr_pretrained = LogisticRegression(max_iter=1000)
lr_pretrained.fit(X_train_pretrained, y_train)
y_pred_pretrained = lr_pretrained.predict(X_test_pretrained)
accuracy_pretrained = accuracy_score(y_test, y_pred_pretrained)


In [13]:
# Report the metrics
metrics = {
    "Model": ["Skip-gram", "CBOW", "Pretrained Word2Vec"],
    "Accuracy": [accuracy_sg, accuracy_cbow, accuracy_pretrained]
}

metrics_df = pd.DataFrame(metrics)
print(metrics_df)


                 Model  Accuracy
0            Skip-gram    0.8679
1                 CBOW    0.8502
2  Pretrained Word2Vec    0.8500


Inference:

1. The Skip-gram model has the highest accuracy. This could mean that the Skip-gram model was able to generate more informative and discriminative word embeddings.

2. Despite being marginally less accurate, the CBOW model still demonstrates good performance, which means that it can also generate useful word embeddings.

3. The performance of the pretrained model indicates that while it provides a strong baseline, custom training on the specific dataset (as done with Skip-gram and CBOW) can yield better results.
