In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from math import log
import pandas as pd

# Fetch the NLTK data packages this script relies on, in a fixed order
for resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Sample corpus: three short documents used by every step below
documents = [
    "Natural Language Processing helps computers understand human language.",
    "Document processing includes various tasks such as tokenization and lemmatization.",
    "Stemming is a technique used in text preprocessing.",
]

# Preprocessing Functions
def preprocess_document(text):
    """Run one document through the tokenize / tag / filter / stem / lemmatize steps.

    Stemming is applied only to the tokens that survive stop-word removal,
    while POS tagging and lemmatization are applied to every token of the
    original text (stop words and punctuation included).

    Args:
        text: Raw document string.

    Returns:
        dict with the keys ``"tokens"``, ``"pos_tags"``, ``"filtered_tokens"``,
        ``"stemmed_tokens"`` and ``"lemmatized_tokens"``, each holding the
        result of the corresponding step.
    """
    def get_wordnet_pos(treebank_tag):
        """Map a POS tag to the one-letter code passed to the lemmatizer."""
        prefix_to_code = (
            ('J', 'a'),  # Adjective
            ('V', 'v'),  # Verb
            ('N', 'n'),  # Noun
            ('R', 'r'),  # Adverb
        )
        for prefix, code in prefix_to_code:
            if treebank_tag.startswith(prefix):
                return code
        return 'n'  # Anything else is treated as a noun

    # Tokenize, then tag the full (unfiltered) token sequence
    words = word_tokenize(text)
    tagged = pos_tag(words)

    # Drop stop words; the comparison is case-insensitive, the kept token is not altered
    english_stops = set(stopwords.words("english"))
    kept = []
    for word in words:
        if word.lower() not in english_stops:
            kept.append(word)

    # Stem the surviving tokens
    porter = PorterStemmer()
    stems = list(map(porter.stem, kept))

    # Lemmatize every tagged token using its mapped POS code
    wordnet = WordNetLemmatizer()
    lemmas = []
    for word, tag in tagged:
        lemmas.append(wordnet.lemmatize(word, pos=get_wordnet_pos(tag)))

    return {
        "tokens": words,
        "pos_tags": tagged,
        "filtered_tokens": kept,
        "stemmed_tokens": stems,
        "lemmatized_tokens": lemmas,
    }

# Run the preprocessing pipeline over the whole corpus, keeping document order
processed_documents = list(map(preprocess_document, documents))

# Manually calculate Term Frequency (TF)
def calculate_tf(document):
    """Compute the relative frequency of each token in a document.

    The document is split with ``word_tokenize``; tokens are compared exactly
    as produced (no case folding, punctuation tokens are counted too).

    Args:
        document: Raw document string.

    Returns:
        dict mapping each distinct token to ``occurrences / total tokens``,
        in order of first appearance. Empty when the document has no tokens.
    """
    words = word_tokenize(document)
    total = len(words)

    # Tally how often each token occurs
    occurrences = {}
    for token in words:
        if token in occurrences:
            occurrences[token] += 1
        else:
            occurrences[token] = 1

    # Normalise every tally by the document length
    frequencies = {}
    for token, seen in occurrences.items():
        frequencies[token] = seen / total
    return frequencies

# Calculate Inverse Document Frequency (IDF)
def calculate_idf(documents):
    """Compute a smoothed inverse document frequency for every token in a corpus.

    Each document is split with ``word_tokenize``; a token is counted once per
    document regardless of how often it occurs there. The IDF uses symmetric
    add-one smoothing:

        idf(t) = log((1 + N) / (1 + df(t))) + 1

    where ``N`` is the number of documents and ``df(t)`` the number of
    documents containing ``t``. Smoothing only the denominator
    (``log(N / (1 + df))``) yields zero or negative weights for tokens that
    occur in most or all documents; this form is always >= 1, so such tokens
    keep a small positive weight instead of flipping the sign of TF-IDF.

    Args:
        documents: Sequence of raw document strings.

    Returns:
        dict mapping each distinct token to its IDF as a float. Empty when
        there are no documents or no tokens.
    """
    doc_total = len(documents)  # N: total number of documents

    # df(t): number of documents in which each token appears at least once
    doc_freq = {}
    for doc in documents:
        for word in set(word_tokenize(doc)):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    # Since df <= N, the ratio is >= 1 and the logarithm is never negative
    return {
        word: log((1 + doc_total) / (1 + count)) + 1
        for word, count in doc_freq.items()
    }

# Show the result of every preprocessing stage for the first document
first_result = processed_documents[0]
print("Original Document:")
print(documents[0])

stage_reports = (
    ("\nTokenized:", "tokens"),
    ("\nPOS Tags:", "pos_tags"),
    ("\nFiltered (Stopwords Removed):", "filtered_tokens"),
    ("\nStemmed Tokens:", "stemmed_tokens"),
    ("\nLemmatized Tokens:", "lemmatized_tokens"),
)
for heading, key in stage_reports:
    print(heading)
    print(first_result[key])

# Term Frequency (TF) table, one section per document
print("\nTerm Frequency (TF) for each document:")
for doc_number, doc_text in enumerate(documents, start=1):
    tf_values = calculate_tf(doc_text)
    print(f"\nDocument {doc_number}:")
    for term in tf_values:
        print(f"{term}: {tf_values[term]:.4f}")

# Inverse Document Frequency (IDF), computed once over the whole corpus
idf_values = calculate_idf(documents)
print("\nInverse Document Frequency (IDF):")
for term in idf_values:
    print(f"{term}: {idf_values[term]:.4f}")

# Final TF-IDF score per term: the document's TF times the corpus-wide IDF
print("\nTF-IDF for each document:")
for doc_number, doc_text in enumerate(documents, start=1):
    tf_values = calculate_tf(doc_text)
    print(f"\nDocument {doc_number}:")
    for term, term_tf in tf_values.items():
        # A term missing from the IDF table contributes a score of 0
        score = term_tf * idf_values.get(term, 0)
        print(f"{term}: {score:.4f}")


Original Document:
Natural Language Processing helps computers understand human language.

Tokenized:
['Natural', 'Language', 'Processing', 'helps', 'computers', 'understand', 'human', 'language', '.']

POS Tags:
[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('helps', 'VBZ'), ('computers', 'NNS'), ('understand', 'VBP'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]

Filtered (Stopwords Removed):
['Natural', 'Language', 'Processing', 'helps', 'computers', 'understand', 'human', 'language', '.']

Stemmed Tokens:
['natur', 'languag', 'process', 'help', 'comput', 'understand', 'human', 'languag', '.']

Lemmatized Tokens:
['Natural', 'Language', 'Processing', 'help', 'computer', 'understand', 'human', 'language', '.']

Term Frequency (TF) for each document:

Document 1:
Natural: 0.1111
Language: 0.1111
Processing: 0.1111
helps: 0.1111
computers: 0.1111
understand: 0.1111
human: 0.1111
language: 0.1111
.: 0.1111

Document 2:
Document: 0.0909
processing: 0.0909
includes: 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user5\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user5\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user5\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user5\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
