In [3]:
pip install nltk



In [10]:
import nltk
import re
import numpy as np
import pandas as pd

# Fetch the NLTK resources used below (tokenizer model, POS tagger, stop-word
# list, WordNet and its multilingual add-on); packages already present are
# reported as up-to-date.
for nltk_resource in (
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
    'omw-1.4',
):
    nltk.download(nltk_resource)

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Define a function to map POS tags from nltk's format to WordNet format
def get_wordnet_pos(nltk_pos_tag):
    """Translate an NLTK POS tag into the one-letter POS code lemmatize() accepts.

    The tag is matched on its leading character: 'J' -> 'a' (adjective),
    'V' -> 'v' (verb), 'N' -> 'n' (noun), 'R' -> 'r' (adverb).
    Returns None for any tag that starts with something else.
    """
    prefix_to_wordnet = (
        ('J', 'a'),  # adjective
        ('V', 'v'),  # verb
        ('N', 'n'),  # noun
        ('R', 'r'),  # adverb
    )
    for tag_prefix, wordnet_letter in prefix_to_wordnet:
        if nltk_pos_tag.startswith(tag_prefix):
            return wordnet_letter
    return None

def preprocess_document(text):
    """
    Run a text string through a basic NLP preprocessing pipeline.

    Steps, in order:
     - Tokenization (word_tokenize)
     - POS tagging of the complete token sequence (pos_tag)
     - Stop-word removal (English list, matched case-insensitively)
     - Stemming of the remaining tokens (PorterStemmer)
     - Lemmatization of the remaining tokens (WordNetLemmatizer)

    Only stop words are filtered out; every other token produced by the
    tokenizer is passed on to stemming and lemmatization.

    Parameters:
        text: the raw document as a string.

    Returns:
        A dictionary with the result of each step:
         - 'tokens':            list of tokens from the tokenizer
         - 'pos_tags':          list of (token, tag) pairs for all tokens
         - 'tokens_no_stop':    tokens with stop words removed
         - 'tokens_stemmed':    stem of each token in 'tokens_no_stop'
         - 'tokens_lemmatized': lemma of each token in 'tokens_no_stop'
    """
    results = {}

    # --- Tokenization ---
    tokens = word_tokenize(text)
    results['tokens'] = tokens

    # --- POS Tagging ---
    # Tag the full sentence once; the tagger relies on neighbouring words,
    # so this is the only pass that sees grammatical context.
    pos_tags = pos_tag(tokens)
    results['pos_tags'] = pos_tags

    # --- Stop Words Removal ---
    stop_words = set(stopwords.words('english'))
    # Filter the (token, tag) pairs rather than the bare tokens so each
    # surviving token keeps the tag it received in its original context.
    tagged_no_stop = [
        (token, tag) for token, tag in pos_tags
        if token.lower() not in stop_words
    ]
    tokens_no_stop = [token for token, _ in tagged_no_stop]
    results['tokens_no_stop'] = tokens_no_stop

    # --- Stemming ---
    stemmer = PorterStemmer()
    results['tokens_stemmed'] = [stemmer.stem(token) for token in tokens_no_stop]

    # --- Lemmatization ---
    # Reuse the in-context tags instead of re-tagging the stop-word-stripped
    # list, which would feed the tagger an ungrammatical sequence.
    lemmatizer = WordNetLemmatizer()
    tokens_lemmatized = []
    for token, tag in tagged_no_stop:
        wn_tag = get_wordnet_pos(tag)
        if wn_tag is None:
            # No WordNet equivalent for this tag: use the lemmatizer's default POS.
            tokens_lemmatized.append(lemmatizer.lemmatize(token))
        else:
            tokens_lemmatized.append(lemmatizer.lemmatize(token, pos=wn_tag))
    results['tokens_lemmatized'] = tokens_lemmatized

    return results

# Short two-sentence document used to demonstrate the pipeline
sample_doc = (
    "Natural Language Processing (NLP) is an exciting field that explores the intersection "
    "of linguistics and computer science. Through techniques such as tokenization, stemming, "
    "and lemmatization, machines can understand human language."
)

# Run the full preprocessing pipeline on it
processed = preprocess_document(sample_doc)

# Show the outcome of every stage: a heading followed by the stored result
report_sections = (
    ("Original Tokens:", 'tokens'),
    ("\nPOS Tags:", 'pos_tags'),
    ("\nTokens after Stop Words Removal:", 'tokens_no_stop'),
    ("\nTokens after Stemming:", 'tokens_stemmed'),
    ("\nTokens after Lemmatization:", 'tokens_lemmatized'),
)
for section_heading, result_key in report_sections:
    print(section_heading)
    print(processed[result_key])

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Original Tokens:
['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'an', 'exciting', 'field', 'that', 'explores', 'the', 'intersection', 'of', 'linguistics', 'and', 'computer', 'science', '.', 'Through', 'techniques', 'such', 'as', 'tokenization', ',', 'stemming', ',', 'and', 'lemmatization', ',', 'machines', 'can', 'understand', 'human', 'language', '.']

POS Tags:
[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('an', 'DT'), ('exciting', 'JJ'), ('field', 'NN'), ('that', 'WDT'), ('explores', 'VBZ'), ('the', 'DT'), ('intersection', 'NN'), ('of', 'IN'), ('linguistics', 'NNS'), ('and', 'CC'), ('computer', 'NN'), ('science', 'NN'), ('.', '.'), ('Through', 'IN'), ('techniques', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('tokenization', 'NN'), (',', ','), ('stemming', 'VBG'), (',', ','), ('and', 'CC'), ('lemmatization', 'NN'), (',', ','), ('machines', 'NNS'), ('can', 'MD'), ('understand', 'VB'), ('human', 'JJ'), (

In [11]:
# TF-IDF weights a term by how often it occurs in one document relative to the
# whole collection, so it needs a corpus rather than a single text.
# Small sample corpus: each string below is one raw, unprocessed document.
corpus = [
    "Natural language processing is an interdisciplinary field that deals with many aspects of language and computer science.",
    "Machine learning techniques have dramatically improved the field of natural language processing.",
    "In this class, we explore the foundations of NLP along with the principles of data science."
]

def clean_document_for_tfidf(text):
    """Preprocess `text` and return its lemmatized tokens joined by single spaces.

    The document is passed through preprocess_document(); only the
    'tokens_lemmatized' result is used, concatenated into one string.
    """
    return " ".join(preprocess_document(text)['tokens_lemmatized'])

# Preprocess every raw document into a space-joined string of lemmas
clean_corpus = list(map(clean_document_for_tfidf, corpus))

# Build the TF-IDF representation of the cleaned documents with scikit-learn's TfidfVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(clean_corpus)

# Show the learned vocabulary (one term per matrix column), then the dense
# matrix with one row per document
print("\nTF-IDF Feature Names:", vectorizer.get_feature_names_out(), sep="\n")
print("\nTF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")


TF-IDF Feature Names:
['along' 'aspect' 'class' 'computer' 'data' 'deal' 'dramatically'
 'explore' 'field' 'foundation' 'improve' 'interdisciplinary' 'language'
 'learn' 'machine' 'many' 'natural' 'nlp' 'principle' 'processing'
 'science' 'technique']

TF-IDF Matrix:
[[0.         0.32229243 0.         0.32229243 0.         0.32229243
  0.         0.         0.2451117  0.         0.         0.32229243
  0.4902234  0.         0.         0.32229243 0.2451117  0.
  0.         0.2451117  0.2451117  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.36977238 0.         0.28122142 0.         0.36977238 0.
  0.28122142 0.36977238 0.36977238 0.         0.28122142 0.
  0.         0.28122142 0.         0.36977238]
 [0.36325471 0.         0.36325471 0.         0.36325471 0.
  0.         0.36325471 0.         0.36325471 0.         0.
  0.         0.         0.         0.         0.         0.36325471
  0.36325471 0.         0.27626457 0.        ]]
