In [None]:
import nltk
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample document
document = "Tokenization is the process of breaking down a document into individual words or tokens. POS tagging assigns a part of speech tag to each token. Stop words are commonly used words that are often removed. Stemming reduces words to their root or base form. Lemmatization finds the base or dictionary form of a word."

# Tokenization
tokens = word_tokenize(document)

# POS Tagging
pos_tags = pos_tag(tokens)

# Stop words removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

# Print the results
print("Original Document:\n", document)
print("\nTokenization:\n", tokens)
print("\nPOS Tagging:\n", pos_tags)
print("\nStop Words Removal:\n", filtered_tokens)
print("\nStemming:\n", stemmed_tokens)
print("\nLemmatization:\n", lemmatized_tokens)

# Calculate TF-IDF representation of documents.
# NOTE: the corpus holds a single document, so every term appears in the same
# number of documents and the IDF factor cannot distinguish between terms;
# add more documents to the list to get discriminative weights.
documents = [document]  # List of documents
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Fetch the vocabulary terms in matrix-column order.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and only fall back to the old accessor on releases
# that predate it. list() keeps feature_names a plain list in both branches.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Print the TF-IDF representation: one row per document, one column per term.
tfidf_representation = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
print("\nTF-IDF Representation:")
print(tfidf_representation)
