In [1]:
pip install scikit-learn gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 27.9/27.9 MB 27.0 MB/s eta 0:00:00
Installing collected packages: gensim
Successfully installed gensim-4.4.0


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

In [3]:
# Toy corpus: three short documents, one string per document.
# It is the shared input for the vectorizer examples in this notebook.
corpus = [
    "The cat sat on the mat",
    "The dog sat on the log",
    "Cats and dogs are great",
]

In [4]:
# --- Count Occurrence ---
# Learn the vocabulary from the corpus and build the document-term count
# matrix in a single fit_transform pass.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(corpus)

# Dense, column-labelled view: one row per document, one column per
# vocabulary token.
tokens = vectorizer.get_feature_names_out()
df_bow = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=tokens,
)

In [5]:
# Heading followed by the count table, each on its own line.
print("--- Bag of Words (Count Occurrence) ---", df_bow, sep="\n")

--- Bag of Words (Count Occurrence) ---
   and  are  cat  cats  dog  dogs  great  log  mat  on  sat  the
0    0    0    1     0    0     0      0    0    1   1    1    2
1    0    0    0     0    1     0      0    1    0   1    1    2
2    1    1    0     1    0     1      1    0    0   0    0    0


In [6]:
# --- Normalized Count Occurrence ---
# Turn each document's raw counts into relative frequencies by dividing every
# row by that row's total token count.
bow_array = bow_matrix.toarray()

# keepdims=True keeps the totals as a column so they broadcast across each row.
row_totals = bow_array.sum(axis=1, keepdims=True)

# A document with no in-vocabulary tokens has a row total of 0; a plain
# division would yield NaN (and a RuntimeWarning) for that row. Such rows are
# left as all zeros instead; every other row is divided exactly as before.
normalized_bow = np.divide(
    bow_array,
    row_totals,
    out=np.zeros(bow_array.shape, dtype=float),
    where=row_totals != 0,
)

df_norm_bow = pd.DataFrame(normalized_bow, columns=tokens)

In [7]:
# Blank line, heading, then the row-normalised table.
print("\n--- Normalized Bag of Words ---", df_norm_bow, sep="\n")


--- Normalized Bag of Words ---
   and  are       cat  cats       dog  dogs  great       log       mat  \
0  0.0  0.0  0.166667   0.0  0.000000   0.0    0.0  0.000000  0.166667   
1  0.0  0.0  0.000000   0.0  0.166667   0.0    0.0  0.166667  0.000000   
2  0.2  0.2  0.000000   0.2  0.000000   0.2    0.2  0.000000  0.000000   

         on       sat       the  
0  0.166667  0.166667  0.333333  
1  0.166667  0.166667  0.333333  
2  0.000000  0.000000  0.000000  


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer (default settings) on the same corpus and
# transform it into a weighted document-term matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Dense, column-labelled view for display.
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

print("\n--- TF-IDF Matrix ---", df_tfidf, sep="\n")


--- TF-IDF Matrix ---
        and       are       cat      cats       dog      dogs     great  \
0  0.000000  0.000000  0.427554  0.000000  0.000000  0.000000  0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.427554  0.000000  0.000000   
2  0.447214  0.447214  0.000000  0.447214  0.000000  0.447214  0.447214   

        log       mat        on       sat       the  
0  0.000000  0.427554  0.325166  0.325166  0.650331  
1  0.427554  0.000000  0.325166  0.325166  0.650331  
2  0.000000  0.000000  0.000000  0.000000  0.000000  


In [9]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

In [12]:
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

In [13]:
# Download the NLTK tokenizer resources before any tokenisation is attempted.
# 'punkt_tab' is fetched alongside 'punkt'; an earlier note in this cell
# recorded that its absence had caused an error.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Small lower-cased corpus used as training sentences for Word2Vec.
data = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are great pets",
    "i love my cat and my dog",
]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [14]:
# Preprocessing: Word2Vec expects a list of lists (tokenized sentences)
tokenized_data = [word_tokenize(sentence.lower()) for sentence in data]

# Initialize and Train Model
# Training is stochastic. A single worker thread removes thread-scheduling
# jitter, and the seed is stated explicitly so the embeddings printed below can
# be reproduced on re-run.
# NOTE(review): seed=1 is believed to match gensim's documented default, and
# gensim's docs also mention PYTHONHASHSEED for reproducibility across
# interpreter launches - confirm against the installed version.
model = Word2Vec(
    sentences=tokenized_data,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

In [15]:
# Show the tokenised sentences, then a short slice of one learned embedding.
print(f"Tokenized Data: {tokenized_data}")
print("Vector for 'cat':", model.wv["cat"][:5])  # first 5 components only

Tokenized Data: [['the', 'cat', 'sat', 'on', 'the', 'mat'], ['the', 'dog', 'sat', 'on', 'the', 'log'], ['cats', 'and', 'dogs', 'are', 'great', 'pets'], ['i', 'love', 'my', 'cat', 'and', 'my', 'dog']]
Vector for 'cat': [ 0.00813227 -0.00445733 -0.00106836  0.00100636 -0.00019111]
