# Part 1

- Find similar words in the given text using the Word2Vec model.
- Come up with examples like "king - man + woman" = "queen" for word vectors.

In [None]:
# Install gensim and download the pretrained Google News word2vec model
import gensim.downloader as api
model = api.load('word2vec-google-news-300')
print("Model loaded successfully.")



## Similar Words

In [8]:
# Pick 5 words and find similar words for each
words = ['Naan', 'Kshitij', 'horizon', 'anime', 'Minecraft']
for word in words:
    print(f"Similar words to '{word}':")
    try:
        similar = model.most_similar(word, topn=5)
        for sim_word, score in similar:
            print(f"  {sim_word} ({score:.3f})")
    except KeyError:
        print(f"  '{word}' not found in vocabulary.")

Similar words to 'Naan':
  Paneer (0.697)
  Kadhai (0.677)
  Idli (0.665)
  Paratha (0.662)
  Chaat (0.660)
Similar words to 'Kshitij':
  Paneer (0.697)
  Kadhai (0.677)
  Idli (0.665)
  Paratha (0.662)
  Chaat (0.660)
Similar words to 'Kshitij':
  Gaurav (0.686)
  Nikhil (0.679)
  Pranay (0.669)
  Harshad (0.669)
  Rohit (0.668)
Similar words to 'horizon':
  Gaurav (0.686)
  Nikhil (0.679)
  Pranay (0.669)
  Harshad (0.669)
  Rohit (0.668)
Similar words to 'horizon':
  looming (0.514)
  Fireballs_lit (0.514)
  distant_speck (0.494)
  looms_ominously (0.470)
  clouds (0.468)
Similar words to 'anime':
  looming (0.514)
  Fireballs_lit (0.514)
  distant_speck (0.494)
  looms_ominously (0.470)
  clouds (0.468)
Similar words to 'anime':
  manga (0.809)
  animé (0.758)
  Anime (0.754)
  anime_manga (0.715)
  animes (0.702)
Similar words to 'Minecraft':
  manga (0.809)
  animé (0.758)
  Anime (0.754)
  anime_manga (0.715)
  animes (0.702)
Similar words to 'Minecraft':
  Mojang (0.713)
  Mark

## Word Vector Arithmetic

In [25]:
experiments = [
    (['Japan', 'Pasta'], ['Italy']),
    (['Sword', 'Paper'], ['Pen']),
    (['cat', 'beak'], ['bird'])
 ]
for positive, negative in experiments:
    try:
        result = model.most_similar(positive=positive, negative=negative, topn=3)
        print(f"Result for {positive} - {negative}:")
        for word, score in result:
            print(f"  {word} ({score:.3f})")
    except KeyError as e:
        print(f"  Word not found in vocabulary: {e}")
    print()

Result for ['Japan', 'Pasta'] - ['Italy']:
  Sushi (0.584)
  Teriyaki (0.575)
  Tofu (0.529)

Result for ['Sword', 'Paper'] - ['Pen']:
  Knight_Chronicles (0.424)
  Steel (0.401)
  Mage (0.383)

Result for ['cat', 'beak'] - ['bird']:
  paws (0.567)
  claws (0.541)
  hind_paws (0.534)



# Part 2

- Build a movie review sentiment classifier using WordVectors

In [None]:
# Imports
import pandas as pd
import os
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec, FastText

In [26]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\kshit\.cache\kagglehub\datasets\lakshmi25npathi\imdb-dataset-of-50k-movie-reviews\versions\1


In [None]:
# Load IMDB dataset (CSV file assumed in downloaded path)
data_path = os.path.join(path, 'IMDB Dataset.csv')
df = pd.read_csv(data_path)

print('Shape:', df.shape)
print('Columns:', df.columns.tolist())
print(df.head())
print('Label distribution:')
print(df['sentiment'].value_counts())

Shape: (50000, 2)
Columns: ['review', 'sentiment']
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Label distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


## Text Cleaning

In [None]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['clean_review'] = df['review'].apply(clean_text)
print(df[['review', 'clean_review']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kshit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                        clean_review  
0  one reviewers mentioned watching oz episode yo...  
1  wonderful little production br br filming tech...  
2  thought wonderful way spend time hot summer we...  
3  basically theres family little boy jake thinks...  
4  petter matteis love time money visually stunni...  


## Embedding retrieval

In [None]:
def get_review_vector(review, model):
    words = review.split()
    vectors = [model[word] for word in words if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

X = np.array([get_review_vector(text, model) for text in tqdm(df['clean_review'])])
y = (df['sentiment'] == 'positive').astype(int)

print('Feature shape:', X.shape)
print('Labels shape:', y.shape)

100%|██████████| 50000/50000 [00:08<00:00, 6036.38it/s]



Feature shape: (50000, 300)
Labels shape: (50000,)


## Pretrained Word2Vec Model - Logistic Regression

In [36]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8515
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      4961
           1       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



## Skipgram, CBOW, FastText

In [None]:
# Prepare tokenized reviews
tokenized_reviews = [text.split() for text in df['clean_review']]

# Train Skip-gram model
w2v_skipgram = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, sg=1, min_count=2, workers=4)

# Train CBOW model
w2v_cbow = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, sg=0, min_count=2, workers=4)

# Train FastText model
ft_model = FastText(sentences=tokenized_reviews, vector_size=100, window=5, min_count=2, workers=4)

def get_custom_vector(review, model):
    words = review.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

X_skipgram = np.array([get_custom_vector(text, w2v_skipgram) for text in tqdm(df['clean_review'])])
X_cbow = np.array([get_custom_vector(text, w2v_cbow) for text in tqdm(df['clean_review'])])
X_fasttext = np.array([get_custom_vector(text, ft_model) for text in tqdm(df['clean_review'])])

100%|██████████| 50000/50000 [00:07<00:00, 6647.73it/s]
  0%|          | 0/50000 [00:00<?, ?it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6911.04it/s]
  0%|          | 0/50000 [00:00<?, ?it/s]
100%|██████████| 50000/50000 [00:10<00:00, 4896.34it/s]



## Logistic Regression Evaluation on Custom Word2Vec Models

In [37]:
def train_and_evaluate(X, y, name):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))
    return acc

acc_skipgram = train_and_evaluate(X_skipgram, y, 'Skip-gram')
acc_cbow = train_and_evaluate(X_cbow, y, 'CBOW')
acc_fasttext = train_and_evaluate(X_fasttext, y, 'FastText')


Skip-gram Accuracy: 0.8781
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      4961
           1       0.88      0.88      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000


CBOW Accuracy: 0.8608
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      4961
           1       0.86      0.87      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000


CBOW Accuracy: 0.8608
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      4961
           1       0.86      0.87      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000


## Model Performance Summary

| Model         | Accuracy | F1 Score |
|---------------|----------|----------|
| Pre-trained W2V | 0.8515   | 0.85     |
| Skip-gram    | 0.8781   | 0.88     |
| CBOW         | 0.8608   | 0.86     |
| FastText     | 0.8481   | 0.85     |