Text Analytics

1. Extract Sample document and apply following document preprocessing methods:
Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse Document
Frequency.


In [2]:
import nltk # The Natural Language Toolkit library, which provides functions for text processing.
from nltk import pos_tag # Function to perform Part-of-Speech tagging.
from nltk.tokenize import TreebankWordTokenizer # A tokenizer that splits text into words.
from nltk.stem import PorterStemmer,WordNetLemmatizer # A stemmer that reduces words to their base or root form (e.g., "running" → "run") A lemmatizer that reduces words to their dictionary form (e.g., "running" → "run").
from nltk.corpus import stopwords # A list of common words (like "the", "is") that are generally ignored in text analysis.
from sklearn.feature_extraction.text import TfidfVectorizer # Converts a collection of text documents to a matrix of Term Frequencies and Inverse Document Frequencies (TF-IDF).

In [3]:
# nltk.download(): Downloads necessary datasets and models.

nltk.download('stopwords') # stopwords: Commonly used words in text that should be removed.
nltk.download('punkt') # punkt: A tokenizer used by NLTK to split text into words.
nltk.download('wordnet') # wordnet: A lexical database for the English language.
nltk.download('averaged_perceptron_tagger_eng') # averaged_perceptron_tagger_eng: A part-of-speech tagger for English.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [4]:
document = "My name is  Aditya Rajendra Gaikwad. We can process the text with the natural language processing is better Aditya"
# document: A sample text that will be processed. It contains a sentence with a name, common words, and repeated words to showcase text preprocessing steps.

In [5]:
tokenizer = TreebankWordTokenizer() # Initializes the tokenizer, which splits text into words (tokens) based on whitespace and punctuation.
tokens = tokenizer.tokenize(document) # Tokenizes the sample document into individual words and punctuation marks.
print(tokens)

['My', 'name', 'is', 'Aditya', 'Rajendra', 'Gaikwad.', 'We', 'can', 'process', 'the', 'text', 'with', 'the', 'natural', 'language', 'processing', 'is', 'better', 'Aditya']


In [6]:
stp_words = set(stopwords.words('english')) # stopwords.words('english'): Retrieves a list of common English stop words (e.g., "the", "is", "in"). set(stp_words): Converts the list of stopwords into a set for faster lookup.
print(stp_words)

{'been', 'was', 'be', 'does', "we'll", 'by', "shouldn't", 'on', 'against', 'where', "it'll", "shan't", "he'd", 'their', "they've", 'd', 'this', 'you', 'yourself', 'myself', 'yours', 'with', 'own', 'ourselves', 'mightn', 'weren', 'his', 'isn', 'did', 'doing', 'such', 'those', "hadn't", 'were', 'why', 'between', "doesn't", 'itself', "haven't", 'he', 'through', 'ours', 'any', 'had', 'him', 'before', "they'll", 'whom', 'further', 'from', "they'd", 'wouldn', 'then', 'again', 'off', 'there', "you'll", 'won', "weren't", 'so', 'haven', 'shouldn', "wasn't", 'it', 'that', 'am', 'now', 'of', "isn't", 'when', 'a', 'wasn', 'hadn', 'your', 'very', "i'd", 'up', 'o', 'same', "didn't", 'she', 'what', 're', "we'd", "won't", 'will', 'until', 'not', 'more', 'doesn', "you'd", 'nor', "i'm", "should've", "mightn't", 'yourselves', "she'll", "we're", 'because', 'm', "aren't", "he'll", "couldn't", 'down', 'having', 'needn', 'themselves', 'these', "she'd", 'are', "wouldn't", 'couldn', 'if', 'or', 'himself', "it'

In [7]:
pos = pos_tag(tokens) # pos_tag(tokens): Tags each word with its part of speech (e.g., noun, verb, adjective) based on the word's usage in the sentence.
print(pos)

[('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Aditya', 'NNP'), ('Rajendra', 'NNP'), ('Gaikwad.', 'NNP'), ('We', 'PRP'), ('can', 'MD'), ('process', 'VB'), ('the', 'DT'), ('text', 'NN'), ('with', 'IN'), ('the', 'DT'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('is', 'VBZ'), ('better', 'RBR'), ('Aditya', 'NNP')]


In [8]:
filtered = [word for word in tokens if word.lower() not in stp_words] # filtered: Creates a list of tokens that are not stopwords by checking if the word (converted to lowercase) is not in the stopwords list.
print(filtered)

['name', 'Aditya', 'Rajendra', 'Gaikwad.', 'process', 'text', 'natural', 'language', 'processing', 'better', 'Aditya']


In [9]:
stemmer = PorterStemmer() # PorterStemmer(): Initializes the stemmer.
stemmed_tokens = [stemmer.stem(word) for word in filtered] # stemmer.stem(word): Applies the Porter stemming algorithm to reduce words to their root form. stemmed_tokens: A list of tokens after stemming.
print(stemmed_tokens)

['name', 'aditya', 'rajendra', 'gaikwad.', 'process', 'text', 'natur', 'languag', 'process', 'better', 'aditya']


In [10]:
lemma = WordNetLemmatizer() # WordNetLemmatizer(): Initializes the lemmatizer.
lemma_tokens = [lemma.lemmatize(word) for word in filtered] # lemma.lemmatize(word): Applies the WordNet lemmatization algorithm to reduce words to their base form. lemma_tokens: A list of lemmatized tokens.
print(lemma_tokens)

['name', 'Aditya', 'Rajendra', 'Gaikwad.', 'process', 'text', 'natural', 'language', 'processing', 'better', 'Aditya']


In [11]:
process_tokens = " ".join(lemma_tokens) # " ".join(lemma_tokens): Joins the lemmatized tokens back into a single string, forming the preprocessed document.
print(process_tokens)

name Aditya Rajendra Gaikwad. process text natural language processing better Aditya


In [12]:
documents = [process_tokens] # documents: A list containing the preprocessed document (in this case, only one document). This is used for computing the TF-IDF values.

In [13]:
vector = TfidfVectorizer() # TfidfVectorizer(): Initializes the TF-IDF vectorizer, which converts a collection of text documents into a matrix of TF-IDF features.
tfidf_mat = vector.fit_transform(documents) # vector.fit_transform(documents): Computes the TF-IDF matrix for the provided documents.
name = vector.get_feature_names_out() # vector.get_feature_names_out(): Retrieves the list of feature names (words) from the TF-IDF model.
value = tfidf_mat.toarray() # tfidf_mat.toarray(): Converts the TF-IDF matrix into a NumPy array.
for word,val in zip(name,value[0]):
    print(f"feature {word} : value {val:.4f}")

    #zip(name, value[0]): Pairs each word (feature) with its corresponding TF-IDF value. print(f"feature {word} : value {val:.4f}"): Prints each feature (word) and its corresponding TF-IDF value, rounded to 4 decimal places. This helps understand the significance of each word in the document.

feature aditya : value 0.5547
feature better : value 0.2774
feature gaikwad : value 0.2774
feature language : value 0.2774
feature name : value 0.2774
feature natural : value 0.2774
feature process : value 0.2774
feature processing : value 0.2774
feature rajendra : value 0.2774
feature text : value 0.2774
