# Install the required library

In [62]:
!pip install nltk scikit-learn




# Import the required library

In [33]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

# Download the required resources

In [34]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vinod\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinod\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vinod\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vinod\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Sample document

In [35]:
text = "Text analytics is the process of converting unstructured text data into meaningful insights."

# Tokenization

In [70]:
tokens = word_tokenize(text)
print("Tokens:", tokens)

Tokens: ['Text', 'analytics', 'is', 'the', 'process', 'of', 'converting', 'unstructured', 'text', 'data', 'into', 'meaningful', 'insights', '.']


# POS tagging

In [37]:
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)

POS Tags: [('Text', 'NN'), ('analytics', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('converting', 'VBG'), ('unstructured', 'JJ'), ('text', 'NN'), ('data', 'NNS'), ('into', 'IN'), ('meaningful', 'JJ'), ('insights', 'NNS'), ('.', '.')]


# Stop word Removal

In [38]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("Filtered Tokens:", filtered_tokens)

Filtered Tokens: ['Text', 'analytics', 'process', 'converting', 'unstructured', 'text', 'data', 'meaningful', 'insights', '.']


# Stemming

In [39]:
stemmer = PorterStemmer()
stemmed = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Words:", stemmed)

Stemmed Words: ['text', 'analyt', 'process', 'convert', 'unstructur', 'text', 'data', 'meaning', 'insight', '.']


# Lemmatization

In [40]:
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Words:", lemmatized)

Lemmatized Words: ['Text', 'analytics', 'process', 'converting', 'unstructured', 'text', 'data', 'meaningful', 'insight', '.']


# TF-IDF Calculation

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
corpus = [
    "Text analytics is the process of converting unstructured text data into meaningful insights.",
    "Data analytics helps in business decisions.",
    "Natural language processing is a part of text analytics."
]

In [43]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [44]:
import pandas as pd

In [45]:
df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print(df_tfidf)

   analytics  business  converting      data  decisions     helps        in  \
0   0.174944  0.000000    0.296205  0.225272   0.000000  0.000000  0.000000   
1   0.266075  0.450504    0.000000  0.342620   0.450504  0.450504  0.450504   
2   0.239447  0.000000    0.000000  0.000000   0.000000  0.000000  0.000000   

   insights      into        is  language  meaningful   natural        of  \
0  0.296205  0.296205  0.225272  0.000000    0.296205  0.000000  0.225272   
1  0.000000  0.000000  0.000000  0.000000    0.000000  0.000000  0.000000   
2  0.000000  0.000000  0.308332  0.405419    0.000000  0.405419  0.308332   

       part   process  processing      text       the  unstructured  
0  0.000000  0.296205    0.000000  0.450544  0.296205      0.296205  
1  0.000000  0.000000    0.000000  0.000000  0.000000      0.000000  
2  0.405419  0.000000    0.405419  0.308332  0.000000      0.000000  
