In [None]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Fetch the NLTK data packages used by the cells below. Re-running this cell
# is cheap: a package that is already up to date is not downloaded again.
#
# Both the legacy resource names and the names looked up by newer NLTK
# releases ('punkt_tab', 'averaged_perceptron_tagger_eng') are requested, so
# word_tokenize / pos_tag work on a fresh environment regardless of the
# installed NLTK version.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
for resource in NLTK_RESOURCES:
    # quiet=True suppresses the per-package progress log, which otherwise
    # writes the absolute local nltk_data path into the saved cell output.
    nltk.download(resource, quiet=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yashg_t6wet39\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yashg_t6wet39\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yashg_t6wet39\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\yashg_t6wet39\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
 # Step 1: Sample document
# The sample text is assembled from adjacent string literals, one sentence per
# line; the resulting string is a single paragraph with single spaces between
# sentences.
doc = (
    "This is a sample document for demonstrating text preprocessing. "
    "It includes multiple sentences and some common words like is, a, for. "
    "We will perform tokenization, POS tagging, stop words removal, stemming, and lemmatization on this document. "
    "The goal is to understand the basic steps involved in preparing text data for further analysis."
)
print(f"Original Document:\n {doc}")
 
 # Step 2: Tokenization
# Split the raw text into tokens; punctuation marks come out as their own
# tokens and are filtered out in a later step.
tokens = word_tokenize(doc)
print(f"\nTokenized Words:\n {tokens}")

# Step 3: POS tagging -- yields one (token, tag) pair per token, in order.
pos_tags = nltk.pos_tag(tokens)
print(f"\nPOS Tags:\n {pos_tags}")

 # Step 4: Stopwords Removal
# A set gives constant-time membership checks for the stop-word lookup.
stop_words = set(stopwords.words('english'))

# Keep purely alphabetic tokens (this drops punctuation) whose lowercase form
# is not an English stop word; the original casing of kept tokens is preserved.
filtered_tokens = [
    token
    for token in tokens
    if token.isalpha() and token.lower() not in stop_words
]
print(f"\nTokens after Stopword Removal:\n {filtered_tokens}")

# Step 5: Stemming
# Apply the Porter stemmer to every token that survived stop-word removal.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_tokens))
print(f"\nStemmed Words:\n {stemmed}")

 # Step 6: Lemmatization
lemmatizer = WordNetLemmatizer()


def _wordnet_pos(treebank_tag):
    """Map a tag produced by nltk.pos_tag to a one-letter WordNet POS code.

    Tags starting with 'J', 'V' or 'R' map to 'a' (adjective), 'v' (verb) and
    'r' (adverb) respectively. Every other tag falls back to 'n' (noun), which
    is also the default POS that WordNetLemmatizer.lemmatize uses.
    """
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


# Without a POS hint the lemmatizer treats every word as a noun, so verb forms
# are returned unchanged. Reuse the tags from Step 3 instead.
#
# filtered_tokens is an order-preserving subset of tokens, and whether a token
# was kept depends only on the token text. pos_tags holds one (token, tag)
# pair per token, so selecting the pairs whose token was kept yields tags
# aligned one-to-one with filtered_tokens.
kept_words = set(filtered_tokens)
filtered_tags = [tag for word, tag in pos_tags if word in kept_words]

lemmatized = [
    lemmatizer.lemmatize(word, pos=_wordnet_pos(tag))
    for word, tag in zip(filtered_tokens, filtered_tags)
]
print("\nLemmatized Words:\n", lemmatized)

Original Document:
 This is a sample document for demonstrating text preprocessing. It includes multiple sentences and some common words like is, a, for. We will perform tokenization, POS tagging, stop words removal, stemming, and lemmatization on this document. The goal is to understand the basic steps involved in preparing text data for further analysis.

Tokenized Words:
 ['This', 'is', 'a', 'sample', 'document', 'for', 'demonstrating', 'text', 'preprocessing', '.', 'It', 'includes', 'multiple', 'sentences', 'and', 'some', 'common', 'words', 'like', 'is', ',', 'a', ',', 'for', '.', 'We', 'will', 'perform', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', 'on', 'this', 'document', '.', 'The', 'goal', 'is', 'to', 'understand', 'the', 'basic', 'steps', 'involved', 'in', 'preparing', 'text', 'data', 'for', 'further', 'analysis', '.']

POS Tags:
 [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('document'

In [3]:
 # Step 7: TF and TF-IDF
 # Sample corpus
corpus = [
    "This is a sample document for demonstrating text preprocessing.",
    "It includes multiple sentences and some common words like is, a, for.",
    "We will perform tokenization, POS tagging, stop words removal, stemming, and lemmatization on this document.",
    "The goal is to understand the basic steps involved in preparing text data for further analysis.",
]

# Term Frequency (TF): raw term counts, one row per corpus entry and one
# column per vocabulary term learned by the vectorizer.
cv = CountVectorizer()
X_tf = cv.fit_transform(corpus)
tf_df = pd.DataFrame(data=X_tf.toarray(), columns=cv.get_feature_names_out())
print(f"\nTerm Frequency (TF):\n {tf_df}")

# TF-IDF (term frequency weighted by inverse document frequency), computed on
# the same corpus with a separately fitted vectorizer.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(corpus)
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
)
print(f"\nTF-IDF:\n {tfidf_df}")


Term Frequency (TF):
    analysis  and  basic  common  data  demonstrating  document  for  further  \
0         0    0      0       0     0              1         1    1        0   
1         0    1      0       1     0              0         0    1        0   
2         0    1      0       0     0              0         1    0        0   
3         1    0      1       0     1              0         0    1        1   

   goal  ...  tagging  text  the  this  to  tokenization  understand  we  \
0     0  ...        0     1    0     1   0             0           0   0   
1     0  ...        0     0    0     0   0             0           0   0   
2     0  ...        1     0    0     1   0             1           0   1   
3     1  ...        0     1    2     0   1             0           1   0   

   will  words  
0     0      0  
1     0      1  
2     1      1  
3     0      0  

[4 rows x 40 columns]

TF-IDF:
    analysis       and     basic    common      data  demonstrating  document 