In [1]:
!pip install scikit-learn gensim nltk

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Toy corpus: three short sentences with overlapping vocabulary.
corpus = ["I love NLP", "I love coding", "I love NLP and coding"]

# Learn the vocabulary and build the document-term count matrix in one pass.
# With default settings CountVectorizer lowercases the text and only keeps
# tokens of two or more characters, which is why "I" gets no column.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# One row per sentence, one column per vocabulary term.
df_bow = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
print("--- Bag of Words (Count) ---", df_bow, sep="\n")

--- Bag of Words (Count) ---
   and  coding  love  nlp
0    0       0     1    1
1    0       1     1    0
2    1       1     1    1


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# use_idf=False switches off IDF weighting and norm='l1' rescales each row to
# sum to 1, so every entry is a term's share of its sentence's counted tokens.
tf_vectorizer = TfidfVectorizer(norm='l1', use_idf=False)
X_norm = tf_vectorizer.fit_transform(corpus)

# One row per sentence, one column per vocabulary term.
df_norm = pd.DataFrame(
    data=X_norm.toarray(),
    columns=tf_vectorizer.get_feature_names_out(),
)
print("\n--- Normalized Counts (Term Frequency) ---", df_norm, sep="\n")


--- Normalized Counts (Term Frequency) ---
    and  coding  love   nlp
0  0.00    0.00  0.50  0.50
1  0.00    0.50  0.50  0.00
2  0.25    0.25  0.25  0.25


In [14]:
# IDF weighting on: a term that occurs in every sentence ("love") ends up with
# a smaller weight than terms that occur in fewer sentences. The vectorizer's
# default L2 row normalisation is left in place.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# One row per sentence, one column per vocabulary term.
df_tfidf = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print("\n--- TF-IDF Matrix ---", df_tfidf, sep="\n")


--- TF-IDF Matrix ---
        and    coding      love       nlp
0  0.000000  0.000000  0.613356  0.789807
1  0.000000  0.789807  0.613356  0.000000
2  0.631745  0.480458  0.373119  0.480458


In [15]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
# Fetch the tokenizer data that word_tokenize relies on.
nltk.download('punkt')
nltk.download('punkt_tab')

# Redefined here so this cell can run without the earlier vectorizer cells.
corpus = [
    "I love NLP",
    "I love coding",
    "I love NLP and coding"
]

# Lowercase each sentence, then split it into word tokens.
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]
print("Tokens:", tokenized_corpus)

# Train a tiny Word2Vec model on the tokenized sentences.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=10,  # length of each word vector
    window=2,
    min_count=1,     # keep every token, even those seen only once
    seed=1,          # gensim's default seed, stated explicitly
    # gensim documents that a deterministic run needs a single worker thread
    # (multi-threaded training introduces ordering jitter); reproducibility
    # across interpreter launches may additionally need PYTHONHASHSEED fixed.
    workers=1,
)
word_vector = model.wv['coding']
print("\nVector for 'coding':", word_vector)

# NOTE(review): with only three sentences of training data this score is
# unlikely to be meaningful; treat it as an API illustration only.
similarity = model.wv.similarity('nlp', 'coding')
print(f"Similarity between 'nlp' and 'coding': {similarity:.4f}")

Tokens: [['i', 'love', 'nlp'], ['i', 'love', 'coding'], ['i', 'love', 'nlp', 'and', 'coding']]

Vector for 'coding': [ 0.07311766  0.05070262  0.06757693  0.00762866  0.06350891 -0.03405366
 -0.00946401  0.05768573 -0.07521638 -0.03936104]
Similarity between 'nlp' and 'coding': -0.1055


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
