In [1]:
import nltk
import string
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK resources (only needed once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\avart\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\avart\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\avart\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\avart\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Define a sample document
document = """
Natural language processing (NLP) is a field of artificial intelligence (AI) that enables machines to understand and respond to human language.
It involves tasks such as tokenization, POS tagging, stop word removal, stemming, and lemmatization.
"""
print(document)



Natural language processing (NLP) is a field of artificial intelligence (AI) that enables machines to understand and respond to human language.
It involves tasks such as tokenization, POS tagging, stop word removal, stemming, and lemmatization.



In [3]:
# Tokenize into words
tokens = word_tokenize(document)
print("Tokens:", tokens)


Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', '(', 'AI', ')', 'that', 'enables', 'machines', 'to', 'understand', 'and', 'respond', 'to', 'human', 'language', '.', 'It', 'involves', 'tasks', 'such', 'as', 'tokenization', ',', 'POS', 'tagging', ',', 'stop', 'word', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', '.']


In [4]:
# POS Tagging
pos_tags = nltk.pos_tag(tokens)
print("POS Tags:", pos_tags)


POS Tags: [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('field', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('(', '('), ('AI', 'NNP'), (')', ')'), ('that', 'IN'), ('enables', 'VBZ'), ('machines', 'NNS'), ('to', 'TO'), ('understand', 'VB'), ('and', 'CC'), ('respond', 'VB'), ('to', 'TO'), ('human', 'JJ'), ('language', 'NN'), ('.', '.'), ('It', 'PRP'), ('involves', 'VBZ'), ('tasks', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('tokenization', 'NN'), (',', ','), ('POS', 'NNP'), ('tagging', 'NN'), (',', ','), ('stop', 'VB'), ('word', 'NN'), ('removal', 'NN'), (',', ','), ('stemming', 'VBG'), (',', ','), ('and', 'CC'), ('lemmatization', 'NN'), ('.', '.')]


In [5]:
# Remove stopwords and punctuation
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word not in string.punctuation]
print("Filtered Tokens:", filtered_tokens)


Filtered Tokens: ['Natural', 'language', 'processing', 'NLP', 'field', 'artificial', 'intelligence', 'AI', 'enables', 'machines', 'understand', 'respond', 'human', 'language', 'involves', 'tasks', 'tokenization', 'POS', 'tagging', 'stop', 'word', 'removal', 'stemming', 'lemmatization']


In [6]:
# Apply Porter Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Tokens:", stemmed_tokens)


Stemmed Tokens: ['natur', 'languag', 'process', 'nlp', 'field', 'artifici', 'intellig', 'ai', 'enabl', 'machin', 'understand', 'respond', 'human', 'languag', 'involv', 'task', 'token', 'po', 'tag', 'stop', 'word', 'remov', 'stem', 'lemmat']


In [7]:
# Apply WordNet Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Tokens:", lemmatized_tokens)


Lemmatized Tokens: ['Natural', 'language', 'processing', 'NLP', 'field', 'artificial', 'intelligence', 'AI', 'enables', 'machine', 'understand', 'respond', 'human', 'language', 'involves', 'task', 'tokenization', 'POS', 'tagging', 'stop', 'word', 'removal', 'stemming', 'lemmatization']


In [12]:
# Create sample corpus (multiple documents)
corpus = [
    document,
    "Artificial intelligence includes machine learning and deep learning.",
    "NLP is useful for chatbots, search engines, and virtual assistants."
]

# TF-IDF Calculation
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Display TF-IDF matrix as DataFrame
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", tfidf_df)


TF-IDF Matrix:
          ai       and  artificial        as  assistants  chatbots      deep  \
0  0.167345  0.197673    0.127270  0.167345    0.000000  0.000000  0.000000   
1  0.000000  0.202513    0.260772  0.000000    0.000000  0.000000  0.342884   
2  0.000000  0.202513    0.000000  0.000000    0.342884  0.342884  0.000000   

    enables   engines     field  ...      such   tagging     tasks      that  \
0  0.167345  0.000000  0.167345  ...  0.167345  0.167345  0.167345  0.167345   
1  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000  0.000000   
2  0.000000  0.342884  0.000000  ...  0.000000  0.000000  0.000000  0.000000   

         to  tokenization  understand    useful   virtual      word  
0  0.334689      0.167345    0.167345  0.000000  0.000000  0.167345  
1  0.000000      0.000000    0.000000  0.000000  0.000000  0.000000  
2  0.000000      0.000000    0.000000  0.342884  0.342884  0.000000  

[3 rows x 42 columns]
