# NLP Assignment 2: Bag of Words, TF-IDF, and Word2Vec

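This notebook demonstrates three ways of turning text into numeric features, using a small three-sentence corpus: Bag-of-Words (raw counts and L1-normalized counts) and TF-IDF with scikit-learn, and Word2Vec embeddings with gensim (tokenized with NLTK).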

In [6]:

import nltk

# Fetch the 'punkt' tokenizer package into the local nltk_data directory.
# The call is the last expression of the cell, so its return value is echoed
# as the cell result.
nltk.download("punkt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Sample Corpus

In [7]:

# Toy corpus used by every section of this notebook: three short English
# sentences with overlapping words, so shared and unique terms are both present.
corpus = [
    "I love natural language processing",
    "Natural language processing is fun",
    "I love learning new AI techniques",
]

print(corpus)


['I love natural language processing', 'Natural language processing is fun', 'I love learning new AI techniques']


## Bag of Words (Count Occurrence)

In [8]:

from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus and count how often each term occurs in
# each document; fit_transform does both and returns a sparse document-term matrix.
# Note: with its default settings the vectorizer lowercases the text and keeps
# only tokens of two or more alphanumeric characters, so "I" never appears in
# the vocabulary.
count_vectorizer = CountVectorizer()
bow_counts = count_vectorizer.fit_transform(corpus)

# Matrix columns follow the order of the printed vocabulary; rows follow `corpus`.
print(f"Vocabulary: {count_vectorizer.get_feature_names_out()}")
print(f"Count Occurrence Matrix:\n {bow_counts.toarray()}")


Vocabulary: ['ai' 'fun' 'is' 'language' 'learning' 'love' 'natural' 'new' 'processing'
 'techniques']
Count Occurrence Matrix:
 [[0 0 0 1 0 1 1 0 1 0]
 [0 1 1 1 0 0 1 0 1 0]
 [1 0 0 0 1 1 0 1 0 1]]


## Bag of Words (Normalized Count Occurrence)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

# Step 1: raw term counts. A vectorizer dedicated to this section is fitted so
# the cell is independent of the vectorizer created in the previous section.
count_vectorizer_for_norm = CountVectorizer()
raw_bow_counts = count_vectorizer_for_norm.fit_transform(corpus)

# Step 2: L1-normalize along axis=1 (one row per document), rescaling each
# document's counts so they sum to 1, i.e. relative term frequencies.
bow_normalized = normalize(raw_bow_counts, axis=1, norm="l1")

print(f"Vocabulary: {count_vectorizer_for_norm.get_feature_names_out()}")
print(f"Normalized BoW Matrix:\n {bow_normalized.toarray()}")

Vocabulary: ['ai' 'fun' 'is' 'language' 'learning' 'love' 'natural' 'new' 'processing'
 'techniques']
Normalized BoW Matrix:
 [[0.   0.   0.   0.25 0.   0.25 0.25 0.   0.25 0.  ]
 [0.   0.2  0.2  0.2  0.   0.   0.2  0.   0.2  0.  ]
 [0.2  0.   0.   0.   0.2  0.2  0.   0.2  0.   0.2 ]]


## TF-IDF

In [10]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the corpus and transform it in one step. Terms that
# occur in several documents receive a lower weight than terms unique to one
# document. With scikit-learn's defaults each document row is L2-normalized.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Matrix columns follow the order of the printed vocabulary; rows follow `corpus`.
print(f"Vocabulary: {tfidf_vectorizer.get_feature_names_out()}")
print(f"TF-IDF Matrix:\n {tfidf_matrix.toarray()}")


Vocabulary: ['ai' 'fun' 'is' 'language' 'learning' 'love' 'natural' 'new' 'processing'
 'techniques']
TF-IDF Matrix:
 [[0.         0.         0.         0.5        0.         0.5
  0.5        0.         0.5        0.        ]
 [0.         0.51741994 0.51741994 0.3935112  0.         0.
  0.3935112  0.         0.3935112  0.        ]
 [0.46735098 0.         0.         0.         0.46735098 0.35543247
  0.         0.46735098 0.         0.46735098]]


## Word2Vec Embeddings

In [13]:
!pip install gensim
import nltk
nltk.download('punkt_tab')
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus]

w2v_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=50,
    window=5,
    min_count=1,
    workers=2
)

print("Word Vector for 'language':\n", w2v_model.wv['language'])



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Word Vector for 'language':
 [-0.01631583  0.0089916  -0.00827415  0.00164907  0.01699724 -0.00892435
  0.009035   -0.01357392 -0.00709698  0.01879702 -0.00315531  0.00064274
 -0.00828126 -0.01536538 -0.00301602  0.00493959 -0.00177605  0.01106732
 -0.00548595  0.00452013  0.01091159  0.01669191 -0.00290748 -0.01841629
  0.0087411   0.00114357  0.01488382 -0.00162657 -0.00527683 -0.01750602
 -0.00171311  0.00565313  0.01080286  0.01410531 -0.01140624  0.00371764
  0.01217773 -0.0095961  -0.00621452  0.01359526  0.00326295  0.00037983
  0.00694727  0.00043555  0.01923765  0.01012121 -0.01783478 -0.01408312
  0.00180291  0.01278507]
