In [None]:
# Download the 'punkt' resource into a project-local folder and make sure NLTK
# can find it there afterwards.
# A custom download_dir is NOT searched automatically: it has to be added to
# nltk.data.path, otherwise the files are downloaded but never used.
import os

import nltk

NLTK_DATA_DIR = os.path.abspath('./nltk_data')
if NLTK_DATA_DIR not in nltk.data.path:
    # Guard keeps the search path free of duplicates when the cell is re-run.
    nltk.data.path.append(NLTK_DATA_DIR)
nltk.download('punkt', download_dir=NLTK_DATA_DIR)

In [None]:
# Sample sentence (opening line of Pride and Prejudice), lowercased in one step
# so that later comparisons against lowercase word lists are case-consistent.
text = "It is a truth universally acknowledged, that a single man in possession of a good fortune,  must be in want of a wife.".lower()
print(text)

In [None]:
# Same sample sentence as the cell above (opening line of Pride and Prejudice, 1813).
# Re-assigning it here resets `text` to the lowercased sentence before the
# punctuation-removal step below.
text = "It is a truth universally acknowledged, that a single man in possession of a good fortune,  must be in want of a wife.".lower()
print(text)

In [None]:
# The standard-library `string` module exposes ready-made character sets.
# Show the ASCII punctuation characters that the next cell strips from the text.
import string

print(string.punctuation)

In [None]:
# Drop every character listed in string.punctuation from `text`.
# str.maketrans("", "", chars) builds a table that deletes `chars`; translate applies it.
text_p = text.translate(str.maketrans("", "", string.punctuation))
print(text_p)

In [None]:
# Fetch the 'punkt' and 'punkt_tab' resources into NLTK's default data location.
# The tokenizer functions imported in the next cell rely on downloaded data like this.
import nltk

nltk.download("punkt")
nltk.download("punkt_tab")

In [7]:
# Import word_tokenize and sent_tokenize from NLTK's tokenize module
# word_tokenize splits text into individual words
# sent_tokenize splits text into sentences
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Tokenize the punctuation-free text two ways:
#   words  -> list of word tokens (word_tokenize)
#   words1 -> list of sentences   (sent_tokenize)
# NOTE(review): text_p has had its punctuation removed, so sentence boundaries are
# presumably gone and words1 will likely hold a single sentence -- confirm whether
# sent_tokenize was meant to run on the original `text` instead.
words = word_tokenize(text_p)
words1 = sent_tokenize(text_p)
print(words, words1, sep="\n")

In [None]:
# Self-contained version of the pipeline so far:
# strip punctuation -> tokenize -> remove English stopwords.
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Download every resource this cell uses. 'punkt_tab' is included alongside
# 'punkt' to match the earlier download cell, so the tokenizers work here too.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sample text. Note: unlike the earlier cells it is NOT lowercased here, so the
# tokens keep their original capitalisation.
text = "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife."

# Remove punctuation
text_p = "".join(char for char in text if char not in string.punctuation)

# Tokenize into words and sentences
# NOTE(review): sentences are computed from the punctuation-free text, so
# sentence boundaries are presumably lost -- confirm this is intended.
words = word_tokenize(text_p)
sentences = sent_tokenize(text_p)

# Remove stopwords. The stopword list is compared against word.lower() so that
# capitalised tokens such as "It" are filtered as well.
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]

# Print results
print("Words:", words)
print("Filtered Words:", filtered_words)


In [None]:
# Filter stopwords out of the tokenized word list.
# The comparison is done on word.lower(): `words` was built from text that still
# has its original capitalisation, so a case-sensitive check would let
# capitalised stopwords such as "It" slip through.
filtered_words = [word for word in words if word.lower() not in stop_words]
print(filtered_words)

In [None]:
# Stem each remaining word with NLTK's Porter stemmer.
# Stemming trims word endings to reach a common root,
# e.g. "running" -> "run", "jumps" -> "jump".
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
stemmed = list(map(porter.stem, filtered_words))
print(stemmed)

In [None]:
# Fetch the 'averaged_perceptron_tagger' and 'averaged_perceptron_tagger_eng'
# resources; the part-of-speech tagging cell below needs tagger data to run.
import nltk

nltk.download("averaged_perceptron_tagger")
nltk.download("averaged_perceptron_tagger_eng")

In [None]:
# Part-of-speech tagging: pos_tag (imported from the top-level nltk package)
# pairs every word with a grammatical tag, e.g. "run" -> "VB" (verb),
# "cat" -> "NN" (noun).
# `pos` ends up as a list of (word, tag) tuples for the stopword-filtered words.
from nltk import pos_tag

pos = pos_tag(filtered_words)
print(pos)

In [None]:
# Lemmatization reduces a word to its dictionary form (lemma), e.g.
# "better" -> "good" (adjective), "running" -> "run" (verb), "mice" -> "mouse" (noun).
# WordNetLemmatizer.lemmatize() only produces those results when it is told the
# word's part of speech; without a `pos` argument every word is treated as a noun
# and verbs/adjectives come back unchanged. So each word is tagged first and the
# tag is translated into the WordNet POS constant that lemmatize() expects.
# Requires the tagger data downloaded in the earlier cell.
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')


def _to_wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag (e.g. 'VBD', 'JJ') to a WordNet POS constant.

    Tags starting with J/V/R map to adjective/verb/adverb; everything else
    (including an empty tag) falls back to noun, which is lemmatize()'s default.
    """
    prefix_to_pos = {"J": wordnet.ADJ, "V": wordnet.VERB, "R": wordnet.ADV}
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)


lemmatizer = WordNetLemmatizer()
lemmatized = [
    lemmatizer.lemmatize(word, pos=_to_wordnet_pos(tag))
    for word, tag in pos_tag(filtered_words)
]
print("Lemmatized Words:", lemmatized)


In [None]:
# TfidfVectorizer implements TF-IDF (Term Frequency-Inverse Document Frequency),
# a numerical statistic for how important a word is to a document in a collection.
# It combines two metrics:
# 1. Term Frequency (TF): how often a word appears in a document
# 2. Inverse Document Frequency (IDF): how rare the word is across all documents
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the corpus: a list of documents. Here it holds a single document made by
# joining the filtered words with spaces.
# NOTE: with only one document every term gets the same IDF, so the scores below
# only reflect (normalised) term frequency. Add more documents to see IDF at work.
corpus = [" ".join(filtered_words)]

# Initialize the vectorizer with its default settings, which include:
# - lowercase=True: converts all characters to lowercase before tokenizing
# - stop_words=None: NO stopword removal is applied by the vectorizer itself
#   (the words were already filtered above)
# - token_pattern=r'(?u)\b\w\w+\b': keeps tokens of two or more word characters
vectorizer = TfidfVectorizer()

# fit_transform() learns the vocabulary and IDF weights from the corpus (fit)
# and converts the corpus into a TF-IDF matrix (transform) in one call.
tfidf_matrix = vectorizer.fit_transform(corpus)

# The result is a sparse matrix; toarray() turns it into a dense numpy array,
# which is easier to read for a corpus this small.
print("TF-IDF Representation:")
print(tfidf_matrix.toarray())

# The feature names are the vocabulary terms, in the same order as the matrix columns.
print("Feature Names (Words):")
print(vectorizer.get_feature_names_out())



In [None]:
# What is TF-IDF?
# TF-IDF = Term Frequency × Inverse Document Frequency

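# TF (Term Frequency):
# How frequently a word appears in a document, relative to the document's length.
# Example: In the sentence "Data is important in data science", TF for "data" = 2/6
# (case-insensitive: "Data" and "data" are counted together; the sentence has 6 words).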

# IDF (Inverse Document Frequency):
# Measures how rare a word is across all documents in the corpus.
# Words that appear in few documents get higher scores than words that appear in almost every document, like "the", "is", etc.

# TF-IDF highlights important and unique words in a document.