In [22]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Fetch the NLTK tokenizer data packages (a no-op when they are already present)
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# --- 1. Sample Data ---
# Four short sentences that serve as the toy corpus for every later section
corpus = [
    "The cat sat on the mat",
    "The dog sat on the log",
    "Cats and dogs are great pets",
    "The cat chased the mouse"
]

# Echo each document alongside its position in the corpus
print("Original Corpus:")
for doc_idx, sentence in enumerate(corpus):
    print(f"Doc {doc_idx}: {sentence}")

Original Corpus:
Doc 0: The cat sat on the mat
Doc 1: The dog sat on the log
Doc 2: Cats and dogs are great pets
Doc 3: The cat chased the mouse


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [23]:
# --- 2. Preprocessing ---
def preprocess(text):
    """Lowercase ``text``, strip non-letter characters, and tokenize it.

    Apostrophes are deleted outright, so "don't" becomes "dont". Every other
    character that is neither an ASCII letter nor whitespace is replaced by a
    space instead of being deleted, so words separated only by punctuation or
    digits stay apart: "cat,dog" yields "cat" and "dog", not "catdog".

    Args:
        text: The raw document string.

    Returns:
        The tokens produced by ``word_tokenize`` on the cleaned text.
    """
    # Convert to lowercase
    text = text.lower()
    # Drop straight and curly apostrophes so contractions remain one token
    text = re.sub(r"['’]", '', text)
    # Replace remaining non-letters with a space; matching a-z is enough
    # because the text has already been lowercased
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Tokenize
    tokens = word_tokenize(text)
    return tokens

# Run every document through preprocess(); each result is a list of tokens
tokenized_corpus = list(map(preprocess, corpus))
# The sklearn vectorizers take whole strings, so glue each token list back together
clean_corpus = list(map(" ".join, tokenized_corpus))

print("\nPreprocessed Data:", clean_corpus)


Preprocessed Data: ['the cat sat on the mat', 'the dog sat on the log', 'cats and dogs are great pets', 'the cat chased the mouse']


In [24]:
# --- 3. Bag-of-Words: Count Occurrence ---
# Vectorizer with scikit-learn's default settings
count_vectorizer = CountVectorizer()

# Learn the vocabulary and build the document-term count matrix in one call
count_matrix = count_vectorizer.fit_transform(clean_corpus)

# Wrap the counts in a labelled DataFrame: one row per document, one column per term
bow_row_labels = [f"Doc {i}" for i in range(len(corpus))]
bow_terms = count_vectorizer.get_feature_names_out()
df_bow_count = pd.DataFrame(count_matrix.toarray(), index=bow_row_labels, columns=bow_terms)

print("\n--- Bag of Words (Count Occurrence) ---")
display(df_bow_count) # Use print(df_bow_count) if not in Jupyter


--- Bag of Words (Count Occurrence) ---


Unnamed: 0,and,are,cat,cats,chased,dog,dogs,great,log,mat,mouse,on,pets,sat,the
Doc 0,0,0,1,0,0,0,0,0,0,1,0,1,0,1,2
Doc 1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,2
Doc 2,1,1,0,1,0,0,1,1,0,0,0,0,1,0,0
Doc 3,0,0,1,0,1,0,0,0,0,0,1,0,0,0,2


In [25]:
# --- 4. Bag-of-Words: Normalized ---
count_vectorizer_norm = CountVectorizer()
# fit_transform returns the counts as a sparse matrix
sparse_matrix = count_vectorizer_norm.fit_transform(clean_corpus)

# Switch to a dense NumPy array so plain broadcasting can be used below
dense_matrix = sparse_matrix.toarray()

# Total counted terms per document, kept as an (n_rows, 1) column so that
# the division broadcasts across each row
row_sums = dense_matrix.sum(axis=1, keepdims=True)

# Division-by-zero guard: a row with no counted terms gets a divisor of 1,
# which leaves that all-zero row unchanged
row_sums = np.where(row_sums == 0, 1, row_sums)

# Each count divided by its document total gives the relative frequency
normalized_matrix = dense_matrix / row_sums

# Label rows by document and columns by vocabulary term
norm_row_labels = [f"Doc {i}" for i in range(len(corpus))]
norm_terms = count_vectorizer_norm.get_feature_names_out()
df_bow_norm = pd.DataFrame(normalized_matrix, index=norm_row_labels, columns=norm_terms)

print("\n--- Bag of Words (Normalized / Term Frequency) ---")
display(df_bow_norm) # or print(df_bow_norm)


--- Bag of Words (Normalized / Term Frequency) ---


Unnamed: 0,and,are,cat,cats,chased,dog,dogs,great,log,mat,mouse,on,pets,sat,the
Doc 0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.166667,0.0,0.166667,0.333333
Doc 1,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.166667,0.333333
Doc 2,0.166667,0.166667,0.0,0.166667,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.166667,0.0,0.0
Doc 3,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.4


In [26]:
# --- 5. TF-IDF ---
# Vectorizer with scikit-learn's default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and weights, then transform the corpus in one call
tfidf_matrix = tfidf_vectorizer.fit_transform(clean_corpus)

# Dense, labelled view of the weights: one row per document, one column per term
tfidf_row_labels = [f"Doc {i}" for i in range(len(corpus))]
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), index=tfidf_row_labels, columns=tfidf_terms)

print("\n--- TF-IDF Matrix ---")
display(df_tfidf)


--- TF-IDF Matrix ---


Unnamed: 0,and,are,cat,cats,chased,dog,dogs,great,log,mat,mouse,on,pets,sat,the
Doc 0,0.0,0.0,0.371891,0.0,0.0,0.0,0.0,0.0,0.0,0.471697,0.0,0.371891,0.0,0.371891,0.602156
Doc 1,0.0,0.0,0.0,0.0,0.0,0.453012,0.0,0.0,0.453012,0.0,0.0,0.35716,0.0,0.35716,0.578303
Doc 2,0.408248,0.408248,0.0,0.408248,0.0,0.0,0.408248,0.408248,0.0,0.0,0.0,0.0,0.408248,0.0,0.0
Doc 3,0.0,0.0,0.38238,0.0,0.485001,0.0,0.0,0.0,0.0,0.0,0.485001,0.0,0.0,0.0,0.61914


In [27]:
# --- 6. Word2Vec Embeddings ---
# Word2Vec requires a list of token lists (not strings)
print("\n--- Word2Vec Embeddings ---")

# Fixed seed so the embeddings can be regenerated on a fresh kernel
W2V_SEED = 1

# Train the model
# min_count=1 ensures even rare words in our small dataset are kept
# vector_size=10 creates a 10-dimensional vector for each word
# workers=1 + seed: multi-threaded training applies updates in an order that
#   depends on OS thread scheduling, so the vectors (and the similarity
#   ranking printed below) can differ between runs; a single worker with a
#   fixed seed keeps "Restart & Run All" repeatable
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for reproducibility
#   across interpreter launches — confirm whether that is needed here
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=10,
    window=5,
    min_count=1,
    workers=1,
    seed=W2V_SEED,
)

# Access the vector for a specific word (e.g., 'cat')
word = 'cat'
if word in model.wv:
    vector = model.wv[word]
    print(f"Vector for '{word}':\n{vector}")

    # Find most similar words (Note: In such a tiny dataset, similarity won't be very semantic)
    # in a real large dataset, 'dog' would be close to 'cat'
    similar = model.wv.most_similar(word)
    print(f"\nWords most similar to '{word}': {similar}")
else:
    print(f"Word '{word}' not in vocabulary.")

# Inspect the learned vocabulary
print(f"\nVocabulary: {list(model.wv.index_to_key)}")


--- Word2Vec Embeddings ---
Vector for 'cat':
[-0.07510829 -0.00928377  0.09537202 -0.07321425 -0.02331843 -0.01937828
  0.08079723 -0.05931142  0.00044919 -0.04753539]

Words most similar to 'cat': [('great', 0.2941974997520447), ('are', 0.2070571780204773), ('and', 0.19901864230632782), ('the', 0.10485928505659103), ('chased', 0.0927044004201889), ('log', 0.057584166526794434), ('sat', -0.1054532453417778), ('pets', -0.11386623233556747), ('dog', -0.19235801696777344), ('on', -0.2113596796989441)]

Vocabulary: ['the', 'on', 'sat', 'cat', 'mouse', 'chased', 'pets', 'great', 'are', 'dogs', 'and', 'cats', 'log', 'dog', 'mat']
