In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Toy corpus: two short English sentences that serve as the documents to vectorize.
corpus = [
    "I am loving the NLP class, but sometimes it feels confusing!!!",
    "NLP is a fascinating field — it deals with text, speech, and language understanding.",
]

In [9]:
# Echo the raw documents before any vectorization is applied.
print(corpus)

['I am loving the NLP class, but sometimes it feels confusing!!!', 'NLP is a fascinating field — it deals with text, speech, and language understanding.']


In [10]:
# Bag-of-words vectorizer with default settings. With these defaults the vocabulary
# printed further down comes out lowercased ('nlp') and without the one-letter
# words "I" and "a".
vectorizer = CountVectorizer()

In [11]:
# Learn the vocabulary from the corpus and build the document-term count matrix
# in a single call; X is densified with .toarray() when it is displayed.
X = vectorizer.fit_transform(corpus)

In [12]:
# Show the learned feature names: one entry per column of the count matrix.
print("Vocabulary:\n", vectorizer.get_feature_names_out())

Vocabulary:
 ['am' 'and' 'but' 'class' 'confusing' 'deals' 'fascinating' 'feels'
 'field' 'is' 'it' 'language' 'loving' 'nlp' 'sometimes' 'speech' 'text'
 'the' 'understanding' 'with']


In [13]:
# Convert the matrix to a dense array for display:
# one row per document, one column per vocabulary term.
print("\nBag of Words Matrix:\n", X.toarray())


Bag of Words Matrix:
 [[1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 0 0 1 0 0]
 [0 1 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 0 1 1]]


In [14]:
# =====================================================
# Q.2 - Build the vocabulary and the Bag-of-Words matrix
# =====================================================

from sklearn.feature_extraction.text import CountVectorizer

# The two documents to vectorize.
corpus = [
    "I am loving the NLP class, but sometimes it feels confusing!!!",
    "NLP is a fascinating field — it deals with text, speech, and language understanding.",
]
print("Original Corpus:\n", corpus)

# Default CountVectorizer: tokenizes, lowercases and strips punctuation.
# Its default token pattern also keeps only tokens of two or more characters,
# which is why "I" and "a" never appear in the vocabulary.
vectorizer = CountVectorizer()

# Learn the vocabulary and count term occurrences per document in one call.
X = vectorizer.fit_transform(corpus)

# Feature names are the unique terms, one per column of X.
vocab = vectorizer.get_feature_names_out()
print("\nVocabulary:\n", vocab)

# Dense view of the counts (rows = documents, columns = vocabulary terms).
# sep="\n" reproduces the blank line that a separate heading print would emit.
print("\nBag-of-Words Representation:\n", X.toarray(), sep="\n")


Original Corpus:
 ['I am loving the NLP class, but sometimes it feels confusing!!!', 'NLP is a fascinating field — it deals with text, speech, and language understanding.']

Vocabulary:
 ['am' 'and' 'but' 'class' 'confusing' 'deals' 'fascinating' 'feels'
 'field' 'is' 'it' 'language' 'loving' 'nlp' 'sometimes' 'speech' 'text'
 'the' 'understanding' 'with']

Bag-of-Words Representation:

[[1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 0 0 1 0 0]
 [0 1 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 0 1 1]]


In [15]:
# =====================================================
# Q.2 (extended) - TF-IDF weights for the same corpus
# =====================================================

from sklearn.feature_extraction.text import TfidfVectorizer

# The two documents to vectorize.
corpus = [
    "I am loving the NLP class, but sometimes it feels confusing!!!",
    "NLP is a fascinating field — it deals with text, speech, and language understanding.",
]
print("Original Corpus:\n", corpus)

# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and produce the weighted document-term matrix in one call.
X = vectorizer.fit_transform(corpus)

# Feature names are the unique terms, one per column of X.
vocab = vectorizer.get_feature_names_out()
print("\nVocabulary:\n", vocab)

# Dense matrix rounded to 3 decimals for readability. Terms that occur in both
# documents ('it', 'nlp') receive the smallest non-zero weight in each row.
# sep="\n" reproduces the blank line that a separate heading print would emit.
print("\nTF-IDF Representation (rounded):\n", X.toarray().round(3), sep="\n")


Original Corpus:
 ['I am loving the NLP class, but sometimes it feels confusing!!!', 'NLP is a fascinating field — it deals with text, speech, and language understanding.']

Vocabulary:
 ['am' 'and' 'but' 'class' 'confusing' 'deals' 'fascinating' 'feels'
 'field' 'is' 'it' 'language' 'loving' 'nlp' 'sometimes' 'speech' 'text'
 'the' 'understanding' 'with']

TF-IDF Representation (rounded):

[[0.333 0.    0.333 0.333 0.333 0.    0.    0.333 0.    0.    0.237 0.
  0.333 0.237 0.333 0.    0.    0.333 0.    0.   ]
 [0.    0.301 0.    0.    0.    0.301 0.301 0.    0.301 0.301 0.214 0.301
  0.    0.214 0.    0.301 0.301 0.    0.301 0.301]]


In [16]:
# =====================================================
# Q.2 (extension) - TF-IDF example at full precision
# =====================================================

from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus of two documents.
corpus = [
    "I am loving the NLP class, but sometimes it feels confusing!!!",
    "NLP is a fascinating field — it deals with text, speech, and language understanding.",
]
print("Original Corpus:\n", corpus)

# TF-IDF vectorizer with default settings, bound to its own name `tfidf`.
tfidf = TfidfVectorizer()

# Learn the vocabulary and produce the weighted document-term matrix in one call.
X = tfidf.fit_transform(corpus)

# Feature names are the unique terms, one per column of X.
vocab = tfidf.get_feature_names_out()
print("\nVocabulary (Features):\n", vocab)

# Dense matrix printed without rounding (rows = documents, columns = terms).
# sep="\n" reproduces the blank line that a separate heading print would emit.
print("\nTF-IDF Representation (Matrix):\n", X.toarray(), sep="\n")


Original Corpus:
 ['I am loving the NLP class, but sometimes it feels confusing!!!', 'NLP is a fascinating field — it deals with text, speech, and language understanding.']

Vocabulary (Features):
 ['am' 'and' 'but' 'class' 'confusing' 'deals' 'fascinating' 'feels'
 'field' 'is' 'it' 'language' 'loving' 'nlp' 'sometimes' 'speech' 'text'
 'the' 'understanding' 'with']

TF-IDF Representation (Matrix):

[[0.33310232 0.         0.33310232 0.33310232 0.33310232 0.
  0.         0.33310232 0.         0.         0.23700504 0.
  0.33310232 0.23700504 0.33310232 0.         0.         0.33310232
  0.         0.        ]
 [0.         0.30134034 0.         0.         0.         0.30134034
  0.30134034 0.         0.30134034 0.30134034 0.21440614 0.30134034
  0.         0.21440614 0.         0.30134034 0.30134034 0.
  0.30134034 0.30134034]]
