In [1]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /home/anku/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/anku/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anku/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/anku/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
document = """Natural Language Processing enables machines to understand human language. 
It includes text pre-processing techniques such as tokenization, stop word removal, 
stemming, and lemmatization. These techniques improve performance of machine learning models."""


In [4]:
tokens = word_tokenize(document)
print("Tokens:", tokens)


Tokens: ['Natural', 'Language', 'Processing', 'enables', 'machines', 'to', 'understand', 'human', 'language', '.', 'It', 'includes', 'text', 'pre-processing', 'techniques', 'such', 'as', 'tokenization', ',', 'stop', 'word', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', '.', 'These', 'techniques', 'improve', 'performance', 'of', 'machine', 'learning', 'models', '.']


In [5]:
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)


POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('enables', 'VBZ'), ('machines', 'NNS'), ('to', 'TO'), ('understand', 'VB'), ('human', 'JJ'), ('language', 'NN'), ('.', '.'), ('It', 'PRP'), ('includes', 'VBZ'), ('text', 'JJ'), ('pre-processing', 'JJ'), ('techniques', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('tokenization', 'NN'), (',', ','), ('stop', 'VB'), ('word', 'NN'), ('removal', 'NN'), (',', ','), ('stemming', 'VBG'), (',', ','), ('and', 'CC'), ('lemmatization', 'NN'), ('.', '.'), ('These', 'DT'), ('techniques', 'NNS'), ('improve', 'VBP'), ('performance', 'NN'), ('of', 'IN'), ('machine', 'NN'), ('learning', 'NN'), ('models', 'NNS'), ('.', '.')]


In [6]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]
print("After Stop Word Removal:", filtered_tokens)


After Stop Word Removal: ['Natural', 'Language', 'Processing', 'enables', 'machines', 'understand', 'human', 'language', 'includes', 'text', 'techniques', 'tokenization', 'stop', 'word', 'removal', 'stemming', 'lemmatization', 'techniques', 'improve', 'performance', 'machine', 'learning', 'models']


In [7]:
stemmer = PorterStemmer()
stemmed = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Words:", stemmed)


Stemmed Words: ['natur', 'languag', 'process', 'enabl', 'machin', 'understand', 'human', 'languag', 'includ', 'text', 'techniqu', 'token', 'stop', 'word', 'remov', 'stem', 'lemmat', 'techniqu', 'improv', 'perform', 'machin', 'learn', 'model']


In [8]:
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Words:", lemmatized)


Lemmatized Words: ['Natural', 'Language', 'Processing', 'enables', 'machine', 'understand', 'human', 'language', 'includes', 'text', 'technique', 'tokenization', 'stop', 'word', 'removal', 'stemming', 'lemmatization', 'technique', 'improve', 'performance', 'machine', 'learning', 'model']


In [10]:
cv = CountVectorizer()
tf_matrix = cv.fit_transform([document])
tf_df = pd.DataFrame(tf_matrix.toarray(), columns=cv.get_feature_names_out())
print("Term Frequency:\n", tf_df)


Term Frequency:
    and  as  enables  human  improve  includes  it  language  learning  \
0    1   1        1      1        1         1   1         2         1   

   lemmatization  ...  stemming  stop  such  techniques  text  these  to  \
0              1  ...         1     1     1           2     1      1   1   

   tokenization  understand  word  
0             1           1     1  

[1 rows x 29 columns]


In [11]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([document])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print("TF-IDF:\n", tfidf_df)


TF-IDF:
         and        as   enables     human   improve  includes        it  \
0  0.162221  0.162221  0.162221  0.162221  0.162221  0.162221  0.162221   

   language  learning  lemmatization  ...  stemming      stop      such  \
0  0.324443  0.162221       0.162221  ...  0.162221  0.162221  0.162221   

   techniques      text     these        to  tokenization  understand  \
0    0.324443  0.162221  0.162221  0.162221      0.162221    0.162221   

       word  
0  0.162221  

[1 rows x 29 columns]
