In [3]:
%pip install scikit-learn
%pip install pandas  
%pip install numpy
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.



In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter
import matplotlib.pyplot as plt

# Load the processed reviews and report what was read.
# (A bare `df.head()` in the middle of a cell is never rendered -- only a
# cell's last expression is -- so the dead preview call is left out here.)
df = pd.read_csv('processed_reviews.csv')
print("Loaded processed reviews:")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Work on a copy so the frame as loaded from disk stays untouched.
odf = df.copy()
# Any missing value (NaN) in the column would be stringified by astype(str)
# as the literal token "nan" and end up in the vocabulary as if it were a
# real word, so missing reviews are blanked out first.
reviews_text = odf['Review'].fillna('').astype(str)

# Flat list of every whitespace-separated token across all reviews.
all_words = [word for review in reviews_text for word in review.split()]

# Distinct tokens, sorted so each word keeps the same position between runs.
vocabulary = sorted(set(all_words))
print(f"Total vocabulary size: {len(vocabulary)}")
print(f"First 20 words: {vocabulary[:20]}")

Loaded processed reviews:
Shape: (1000, 1)
Columns: ['Review']
Total vocabulary size: 169
First 20 words: ['acceptable', 'acre', 'actually', 'ad', 'amazing', 'ann', 'annoy', 'annoying', 'average', 'aweesome', 'awesoe', 'awesome', 'awesomee', 'awesomme', 'awesoome', 'awessome', 'awestme', 'awful', 'awmosme', 'awseome']


##### One hot encoding
One-hot encoding converts categorical data, such as words or characters, into a binary vector representation. Each unique category (word or character) is represented by a binary vector in which exactly one element is “hot” (set to 1) while all the others are “cold” (set to 0).


In [5]:
# Demonstrate the encoding on the real data: the first three reviews are
# checked against the first 20 entries of the vocabulary.
print("Applying to your review data:")

sample_reviews = reviews_text.head(3)
small_vocab = vocabulary[:20]

print(f"Using vocabulary: {small_vocab}")
print()

for review_no, review in enumerate(sample_reviews, start=1):
    words = review.split()
    # A slot is 1 when its vocabulary word occurs anywhere in the review
    # (repeats do not matter); every other slot stays 0.
    one_hot_vector = [1 if vocab_word in words else 0 for vocab_word in small_vocab]

    print(f"Review {review_no}: {review[:50]}...")
    print(f"One-hot vector: {one_hot_vector}")
    print(f"Words found: {sum(one_hot_vector)}")
    print()

Applying to your review data:
Using vocabulary: ['acceptable', 'acre', 'actually', 'ad', 'amazing', 'ann', 'annoy', 'annoying', 'average', 'aweesome', 'awesoe', 'awesome', 'awesomee', 'awesomme', 'awesoome', 'awessome', 'awestme', 'awful', 'awmosme', 'awseome']

Review 1: excellent experience service org...
One-hot vector: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Words found: 0

Review 2: horrible experience flight like fuck...
One-hot vector: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Words found: 0

Review 3: amazing experience service...
One-hot vector: [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Words found: 1



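I don't think one-hot encoding would be a good choice for the sentiment analysis project: as the output above shows, the vectors are almost entirely zeros, and they record only whether a word is present, not how often it occurs.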

## Index Based Encoding
Index-based encoding is a simple text encoding technique where each word in the vocabulary is assigned a unique integer index. Instead of using binary vectors like one-hot encoding, we represent text as sequences of these integer indices.

How it works:

Create a vocabulary dictionary mapping words to unique numbers
Convert each text into a sequence of numbers
Each word becomes its corresponding index number
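Example: with the vocabulary built above, the review "amazing experience service" becomes the index sequence [4, 41, 136].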

Advantages over One-Hot:

Memory efficient - stores integers instead of long binary vectors
Faster processing - smaller data size
Variable length - can handle texts of different lengths naturally
Disadvantages:

No semantic meaning - index 5 isn't "more" than index 2
Order dependency - models might think higher indices are more important
Padding needed - for fixed-length inputs in some models


In [6]:
print("Index Based Encoding")

# Map every vocabulary word to its position in the sorted vocabulary.
word_to_index = {word: index for index, word in enumerate(vocabulary)}

print("Sample word mappings:")
# Slice rather than index 0..9 so a vocabulary with fewer than ten words
# does not raise IndexError.
for word in vocabulary[:10]:
    print(f"'{word}' -> {word_to_index[word]}")

print()

# Encode the first 3 reviews. head(3) copes with fewer than three rows and
# matches how the other encoding cells pick their sample.
for i, review in enumerate(reviews_text.head(3)):
    # Words that have no entry in the mapping are skipped, not given a
    # placeholder index.
    indices = [word_to_index[word] for word in review.split() if word in word_to_index]

    print(f"Review {i+1}: {review[:30]}...")
    print(f"Indices: {indices[:5]}...")
    print(f"Total words: {len(indices)}")
    print()

Index Based Encoding
Sample word mappings:
'acceptable' -> 0
'acre' -> 1
'actually' -> 2
'ad' -> 3
'amazing' -> 4
'ann' -> 5
'annoy' -> 6
'annoying' -> 7
'average' -> 8
'aweesome' -> 9

Review 1: excellent experience service o...
Indices: [40, 41, 136, 106]...
Total words: 4

Review 2: horrible experience flight lik...
Indices: [64, 41, 50, 74, 54]...
Total words: 5

Review 3: amazing experience service...
Indices: [4, 41, 136]...
Total words: 3



### BAG OF WORDS (BOW)
Bag of Words is a text encoding technique that represents text as a collection (or "bag") of words, ignoring grammar and word order but keeping track of word frequency. It's called a "bag" because it treats text like throwing words into a bag - you know what words are there and how many times each appears, but not their order.

How it works:

Create a vocabulary from all documents
For each document, count how many times each word appears
Represent each document as a vector of word counts

Example:
Review 1: "good product good quality"
Review 2: "bad product terrible quality"

Vocabulary: ["bad", "good", "product", "quality", "terrible"]

Review 1 BOW: [0, 2, 1, 1, 0]  # good appears 2 times
Review 2 BOW: [1, 0, 1, 1, 1]  # each word appears 1 time

Advantages:

Captures word frequency - "very very good" vs "good" are different
Simple to understand and implement
Works well for many text classification tasks
Better than one-hot for sentiment analysis
Disadvantages:

Ignores word order - "good bad" vs "bad good" look the same
No semantic meaning - doesn't know "good" and "great" are similar
Sparse vectors - most values are 0 for large vocabularies

In [7]:
print("Bag of Words Encoding:")

# Same sample as the one-hot demo: three reviews, 20 vocabulary slots.
sample_reviews = reviews_text.head(3)
small_vocab = vocabulary[:20]

print(f"Vocabulary: {small_vocab}")
print()

for review_no, review in enumerate(sample_reviews, start=1):
    # Tally every token of the review once, then read the tally for each
    # vocabulary slot; a word that never occurs comes back as 0.
    token_counts = Counter(review.split())
    bow_vector = [token_counts[vocab_word] for vocab_word in small_vocab]

    print(f"Review {review_no}: {review[:40]}...")
    print(f"BOW vector: {bow_vector}")
    print(f"Total word count: {sum(bow_vector)}")
    print()

Bag of Words Encoding:
Vocabulary: ['acceptable', 'acre', 'actually', 'ad', 'amazing', 'ann', 'annoy', 'annoying', 'average', 'aweesome', 'awesoe', 'awesome', 'awesomee', 'awesomme', 'awesoome', 'awessome', 'awestme', 'awful', 'awmosme', 'awseome']

Review 1: excellent experience service org...
BOW vector: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Total word count: 0

Review 2: horrible experience flight like fuck...
BOW vector: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Total word count: 0

Review 3: amazing experience service...
BOW vector: [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Total word count: 1



### TF-IDF Encoding (Term Frequency-Inverse Document Frequency)
TF-IDF is a more sophisticated text encoding technique that considers both how frequently a word appears in a document (the TF part) and how rare or common that word is across all the documents (the IDF part). In effect, it helps identify the words that are important to a specific document.

How it works:
TF (Term Frequency): How often a word appears in a document
IDF (Inverse Document Frequency): How rare/common a word is across all documents
TF-IDF = TF × IDF: Words that appear frequently in one document but rarely in others get higher scores

Example:
Document 1: "good good product"
Document 2: "bad product"
Word "good": appears 2 times in doc1, 0 times in doc2 → high TF-IDF in doc1
Word "product": appears in both docs → lower IDF score

Advantages:

Identifies important words - down-weights common words like "the", "and"
Better for sentiment analysis - emphasizes distinctive words
Reduces noise - common words get lower weights
Disadvantages:

More complex than simple counting
Still ignores word order
Can be computationally expensive


In [8]:
print("TF-IDF using sklearn:")

from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings: keep at most 100 features and apply sklearn's
# built-in English stop-word list.
tfidf_options = {'max_features': 100, 'stop_words': 'english'}
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and IDF weights, and encode every review, in one pass.
tfidf_matrix = tfidf_vectorizer.fit_transform(reviews_text)

feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Feature names (first 10): {feature_names[:10]}")

# Densify only the first three rows; they are the ones printed below.
sample_tfidf = tfidf_matrix[:3].toarray()

for row_no in range(3):
    dense_row = sample_tfidf[row_no]
    print(f"Review {row_no + 1} TF-IDF vector (first 10 features):")
    print(f"{dense_row[:10]}")
    # Counted over the whole row, not just the ten values shown above.
    print(f"Non-zero elements: {np.count_nonzero(dense_row)}")
    print()

TF-IDF using sklearn:
TF-IDF matrix shape: (1000, 100)
Feature names (first 10): ['acceptable' 'actually' 'amazing' 'annoy' 'annoying' 'average' 'awesome'
 'awessome' 'awestme' 'awful']
Review 1 TF-IDF vector (first 10 features):
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Non-zero elements: 4

Review 2 TF-IDF vector (first 10 features):
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Non-zero elements: 5

Review 3 TF-IDF vector (first 10 features):
[0.         0.         0.70482456 0.         0.         0.
 0.         0.         0.         0.        ]
Non-zero elements: 3



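## Word2Vec Encoding:
Word2Vector (Word2Vec) is a neural network-based technique that converts words into dense vector representations where words with similar meanings have similar vector representations. Unlike previous methods, Word2Vec captures semantic relationships between words. Note that the code cells below do not train a neural Word2Vec model; they approximate the same idea by factorising a word co-occurrence matrix with truncated SVD.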

How it works:

Uses neural networks to learn word representations from large text corpora
Maps each word to a dense vector (typically 100-300 dimensions)
Words with similar contexts get similar vectors
Captures semantic relationships like "king - man + woman = queen"
Two main approaches:

CBOW (Continuous Bag of Words): Predicts target word from context words
Skip-gram: Predicts context words from target word
Advantages:

Captures semantic meaning - "good" and "great" have similar vectors
Dense vectors - no sparse data issues
Semantic relationships - can do word arithmetic
Transfer learning - can use pre-trained models
Disadvantages:

Requires large datasets for training
Fixed vocabulary - can't handle new words
No word order - still ignores sentence structure
Computationally expensive to train


In [11]:
# Uninstall everything and reinstall compatible versions
# NOTE(review): none of the visible cells imports gensim after this point --
# confirm this cell is still needed.
# The recorded pip output for this cell reports that thinc 8.3.6 requires
# numpy>=2.0.0,<3.0.0, so the numpy==1.24.3 pin leaves that package in
# conflict. pip also notes that the kernel may need a restart before the
# reinstalled packages are picked up.
%pip uninstall -y gensim numpy scipy
%pip install numpy==1.24.3 scipy==1.10.1 gensim==4.3.2

Found existing installation: gensim 4.3.2
Uninstalling gensim-4.3.2:
  Successfully uninstalled gensim-4.3.2
Found existing installation: numpy 1.24.3
Uninstalling numpy-1.24.3:
  Successfully uninstalled numpy-1.24.3
Found existing installation: scipy 1.13.1
Uninstalling scipy-1.13.1:
  Successfully uninstalled scipy-1.13.1
Note: you may need to restart the kernel to use updated packages.


You can safely remove it manually.
You can safely remove it manually.
You can safely remove it manually.


Collecting numpy==1.24.3
  Using cached numpy-1.24.3-cp310-cp310-win_amd64.whl.metadata (5.6 kB)
Collecting scipy==1.10.1
  Downloading scipy-1.10.1-cp310-cp310-win_amd64.whl.metadata (58 kB)
Collecting gensim==4.3.2
  Using cached gensim-4.3.2-cp310-cp310-win_amd64.whl.metadata (8.5 kB)
Using cached numpy-1.24.3-cp310-cp310-win_amd64.whl (14.8 MB)
Downloading scipy-1.10.1-cp310-cp310-win_amd64.whl (42.5 MB)
   ---------------------------------------- 0.0/42.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/42.5 MB ? eta -:--:--
   ---------------------------------------- 0.3/42.5 MB ? eta -:--:--
   ---------------------------------------- 0.3/42.5 MB ? eta -:--:--
    --------------------------------------- 1.0/42.5 MB 1.9 MB/s eta 0:00:23
   - -------------------------------------- 1.6/42.5 MB 2.1 MB/s eta 0:00:20
   - -------------------------------------- 1.8/42.5 MB 2.1 MB/s eta 0:00:20
   -- ------------------------------------- 3.1/42.5 MB 2.9 MB/s eta 0:00:14


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.


In [16]:
import numpy as np
from collections import defaultdict

# Build cooccurrence matrix
def build_cooccurrence_matrix(sentences, vocab, window_size=2):
    """Count how often vocabulary words appear near each other.

    Args:
        sentences: Iterable of strings; each one is split on whitespace.
        vocab: Sequence of words. The position of a word in this sequence is
            its row/column in the result.
        window_size: Number of tokens looked at on each side of a word.

    Returns:
        A ``len(vocab) x len(vocab)`` float array where entry ``[r, c]`` is
        the number of times word ``c`` occurred within ``window_size`` tokens
        of word ``r``. Tokens that are not in ``vocab`` are never counted,
        but they still occupy a position inside the window.
    """
    index_of = {term: position for position, term in enumerate(vocab)}
    size = len(vocab)
    matrix = np.zeros((size, size))

    for sentence in sentences:
        tokens = sentence.split()
        # Resolve every token to its vocabulary id once; None marks a token
        # that is outside the vocabulary.
        token_ids = [index_of.get(token) for token in tokens]

        for center, row in enumerate(token_ids):
            if row is None:
                continue

            # Clamp the context window to the sentence boundaries.
            first = max(0, center - window_size)
            stop = min(len(tokens), center + window_size + 1)

            for neighbour in range(first, stop):
                col = token_ids[neighbour]
                # Skip the word itself and any out-of-vocabulary neighbour.
                if neighbour == center or col is None:
                    continue
                matrix[row, col] += 1

    return matrix


# Keep the demo small: only the first 50 vocabulary entries are tracked, and
# only the first 100 reviews are scanned.
small_vocab = vocabulary[:50]
sample_sentences = reviews_text.head(100).tolist()

tracked_word_count = len(small_vocab)
print(f"Building cooccurrence matrix for {tracked_word_count} words...")
cooc_matrix = build_cooccurrence_matrix(
    sentences=sample_sentences,
    vocab=small_vocab,
)

print(f"Cooccurrence matrix shape: {cooc_matrix.shape}")

Building cooccurrence matrix for 50 words...
Cooccurrence matrix shape: (50, 50)


In [17]:
from sklearn.decomposition import TruncatedSVD

print("Creating word vectors using SVD:")

# Target dimensionality of the word vectors. The request is clamped to the
# matrix width so a smaller vocabulary cannot ask for more components than
# there are columns.
# NOTE: with a 50-column matrix this keeps all 50 dimensions, so no actual
# reduction takes place; lower the 50 to get compressed vectors.
n_components = min(50, cooc_matrix.shape[1])
# TruncatedSVD's default solver is randomized; fixing random_state makes the
# vectors (and every similarity computed from them) reproducible on re-run.
svd = TruncatedSVD(n_components=n_components, random_state=42)
word_vectors = svd.fit_transform(cooc_matrix)

print(f"Word vectors shape: {word_vectors.shape}")

# Row i of word_vectors is the vector for small_vocab[i].
word_to_vector = dict(zip(small_vocab, word_vectors))

# Show the first three dimensions for a few probe words.
print("\nSample word vectors:")
for word in ['good', 'bad', 'product', 'quality']:
    vector = word_to_vector.get(word)
    if vector is None:
        print(f"{word}: not in vocab")
    else:
        print(f"{word}: {vector[:3]}...")  # first 3 dimensions

Creating word vectors using SVD:
Word vectors shape: (50, 50)

Sample word vectors:
good: not in vocab
bad: [1.87918622 1.33824214 0.32252031]...
product: not in vocab
quality: not in vocab


In [None]:
# Calculate word similarities
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector (a NumPy array or any sequence ``np.dot`` accepts).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)`` as a float, or ``0.0``
        when the similarity is undefined because a vector has zero length.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # The product is zero when either norm is zero, and also when two tiny
    # norms underflow; both cases would otherwise divide by zero. Return a
    # float so the result type matches the normal path.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

print("Word similarities:")
query_word = 'awesome'
if query_word in word_to_vector:
    awesome_vec = word_to_vector[query_word]

    # Score every other word in the small vocabulary against the query word.
    similarities = [
        (word, cosine_similarity(awesome_vec, word_to_vector[word]))
        for word in small_vocab
        if word != query_word and word in word_to_vector
    ]

    # Highest similarity first.
    similarities.sort(key=lambda pair: pair[1], reverse=True)

    print("Words most similar to 'awesome':")
    for word, sim in similarities[:5]:
        print(f"{word}: {sim:.3f}")
else:
    # Without this branch the cell prints only the header when the query
    # word has no vector, which looks like the cell did nothing.
    print(f"{query_word}: not in vocab")

Word similarities:
