In [1]:
sample_document = """
This is a sample document for demonstrating text preprocessing.
It includes multiple sentences and some common words like is, a, for.
We will perform tokenization, POS tagging, stop words removal, stemming, and lemmatization on this document.
The goal is to understand the basic steps involved in preparing text data for further analysis.
"""

In [2]:
import nltk
from nltk.tokenize import word_tokenize

#nltk.download('punkt') 
#nltk.download('punkt_tab')
tokens = word_tokenize(sample_document.lower()) 
print("Tokens:", tokens)

Tokens: ['this', 'is', 'a', 'sample', 'document', 'for', 'demonstrating', 'text', 'preprocessing', '.', 'it', 'includes', 'multiple', 'sentences', 'and', 'some', 'common', 'words', 'like', 'is', ',', 'a', ',', 'for', '.', 'we', 'will', 'perform', 'tokenization', ',', 'pos', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'and', 'lemmatization', 'on', 'this', 'document', '.', 'the', 'goal', 'is', 'to', 'understand', 'the', 'basic', 'steps', 'involved', 'in', 'preparing', 'text', 'data', 'for', 'further', 'analysis', '.']


In [None]:
# Download the Averaged Perceptron Tagger data
#nltk.download('averaged_perceptron_tagger_eng')
pos_tags = nltk.pos_tag(tokens)
print("POS Tags:", pos_tags)

POS Tags: [('this', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('document', 'NN'), ('for', 'IN'), ('demonstrating', 'VBG'), ('text', 'JJ'), ('preprocessing', 'NN'), ('.', '.'), ('it', 'PRP'), ('includes', 'VBZ'), ('multiple', 'JJ'), ('sentences', 'NNS'), ('and', 'CC'), ('some', 'DT'), ('common', 'JJ'), ('words', 'NNS'), ('like', 'IN'), ('is', 'VBZ'), (',', ','), ('a', 'DT'), (',', ','), ('for', 'IN'), ('.', '.'), ('we', 'PRP'), ('will', 'MD'), ('perform', 'VB'), ('tokenization', 'NN'), (',', ','), ('pos', 'NN'), ('tagging', 'NN'), (',', ','), ('stop', 'VB'), ('words', 'NNS'), ('removal', 'JJ'), (',', ','), ('stemming', 'VBG'), (',', ','), ('and', 'CC'), ('lemmatization', 'NN'), ('on', 'IN'), ('this', 'DT'), ('document', 'NN'), ('.', '.'), ('the', 'DT'), ('goal', 'NN'), ('is', 'VBZ'), ('to', 'TO'), ('understand', 'VB'), ('the', 'DT'), ('basic', 'JJ'), ('steps', 'NNS'), ('involved', 'VBN'), ('in', 'IN'), ('preparing', 'VBG'), ('text', 'NN'), ('data', 'NNS'), ('for', 'IN'), ('fu

In [None]:
from nltk.corpus import stopwords

# nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token not in stop_words]
print("Filtered Tokens (Stop Words Removed):", filtered_tokens)

Filtered Tokens (Stop Words Removed): ['sample', 'document', 'demonstrating', 'text', 'preprocessing', '.', 'includes', 'multiple', 'sentences', 'common', 'words', 'like', ',', ',', '.', 'perform', 'tokenization', ',', 'pos', 'tagging', ',', 'stop', 'words', 'removal', ',', 'stemming', ',', 'lemmatization', 'document', '.', 'goal', 'understand', 'basic', 'steps', 'involved', 'preparing', 'text', 'data', 'analysis', '.']


In [None]:
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()
stemmed_tokens = [porter_stemmer.stem(token) for token in filtered_tokens]
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['sampl', 'document', 'demonstr', 'text', 'preprocess', '.', 'includ', 'multipl', 'sentenc', 'common', 'word', 'like', ',', ',', '.', 'perform', 'token', ',', 'po', 'tag', ',', 'stop', 'word', 'remov', ',', 'stem', ',', 'lemmat', 'document', '.', 'goal', 'understand', 'basic', 'step', 'involv', 'prepar', 'text', 'data', 'analysi', '.']


In [None]:
from nltk.stem import WordNetLemmatizer

# nltk.download('wordnet') 
# nltk.download('omw-1.4') 

wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [wordnet_lemmatizer.lemmatize(token, pos='n') if tag.startswith('N') else \
                     wordnet_lemmatizer.lemmatize(token, pos='v') if tag.startswith('V') else \
                     wordnet_lemmatizer.lemmatize(token, pos='a') if tag.startswith('J') else \
                     wordnet_lemmatizer.lemmatize(token, pos='r') if tag.startswith('R') else \
                     wordnet_lemmatizer.lemmatize(token)
                     for token, tag in pos_tags if token not in stop_words] # Apply lemmatization based on POS tag
print("Lemmatized Tokens:", lemmatized_tokens)

Lemmatized Tokens: ['sample', 'document', 'demonstrate', 'text', 'preprocessing', '.', 'include', 'multiple', 'sentence', 'common', 'word', 'like', ',', ',', '.', 'perform', 'tokenization', ',', 'po', 'tagging', ',', 'stop', 'word', 'removal', ',', 'stem', ',', 'lemmatization', 'document', '.', 'goal', 'understand', 'basic', 'step', 'involve', 'prepare', 'text', 'data', 'analysis', '.']


In [None]:
corpus = [
"This is a sample document for demonstrating text preprocessing.",
"It includes multiple sentences and some common words like is, a, for.",
"We will perform tokenization, POS tagging, stop words removal, stemming, and lemmatization on this document.",
"The goal is to understand the basic steps involved in preparing text data for further analysis."
]

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [t for t in tokens if t not in stop_words and t not in string.punctuation]  
    tokens = [lemmatizer.lemmatize(t) for t in tokens]  
    return tokens

# Process the corpus
processed_corpus = [preprocess(doc) for doc in corpus]
print("Processed Corpus:", processed_corpus)

Processed Corpus: [['sample', 'document', 'demonstrating', 'text', 'preprocessing'], ['includes', 'multiple', 'sentence', 'common', 'word', 'like'], ['perform', 'tokenization', 'po', 'tagging', 'stop', 'word', 'removal', 'stemming', 'lemmatization', 'document'], ['goal', 'understand', 'basic', 'step', 'involved', 'preparing', 'text', 'data', 'analysis']]


In [None]:
def calculate_tf(document):
    tf_dict = {}
    for term in document:
        tf_dict[term] = tf_dict.get(term, 0) + 1
    return tf_dict

tf_representations = [calculate_tf(doc) for doc in processed_corpus]
print("Term Frequency (TF) for each document:", tf_representations)

Term Frequency (TF) for each document: [{'sample': 1, 'document': 1, 'demonstrating': 1, 'text': 1, 'preprocessing': 1}, {'includes': 1, 'multiple': 1, 'sentence': 1, 'common': 1, 'word': 1, 'like': 1}, {'perform': 1, 'tokenization': 1, 'po': 1, 'tagging': 1, 'stop': 1, 'word': 1, 'removal': 1, 'stemming': 1, 'lemmatization': 1, 'document': 1}, {'goal': 1, 'understand': 1, 'basic': 1, 'step': 1, 'involved': 1, 'preparing': 1, 'text': 1, 'data': 1, 'analysis': 1}]


In [None]:
import math

def calculate_idf(corpus):
    N = len(corpus)
    idf_dict = {}
    all_terms = set([term for doc in corpus for term in doc])
    for term in all_terms:
        df = sum(1 for doc in corpus if term in doc)
        idf_dict[term] = math.log(N / (df + 1)) # Adding 1 to avoid division by zero
    return idf_dict

idf_values = calculate_idf(processed_corpus)
print("Inverse Document Frequency (IDF) for each term:", idf_values)

Inverse Document Frequency (IDF) for each term: {'basic': 0.6931471805599453, 'includes': 0.6931471805599453, 'analysis': 0.6931471805599453, 'goal': 0.6931471805599453, 'lemmatization': 0.6931471805599453, 'understand': 0.6931471805599453, 'word': 0.28768207245178085, 'multiple': 0.6931471805599453, 'data': 0.6931471805599453, 'preprocessing': 0.6931471805599453, 'common': 0.6931471805599453, 'stop': 0.6931471805599453, 'perform': 0.6931471805599453, 'step': 0.6931471805599453, 'po': 0.6931471805599453, 'removal': 0.6931471805599453, 'preparing': 0.6931471805599453, 'involved': 0.6931471805599453, 'stemming': 0.6931471805599453, 'sentence': 0.6931471805599453, 'text': 0.28768207245178085, 'sample': 0.6931471805599453, 'like': 0.6931471805599453, 'demonstrating': 0.6931471805599453, 'tokenization': 0.6931471805599453, 'document': 0.28768207245178085, 'tagging': 0.6931471805599453}


In [None]:
def calculate_tfidf(tf_dict, idf_values):
    tfidf_dict = {}
    for term, tf in tf_dict.items():
        tfidf_dict[term] = tf * idf_values.get(term, 0)
    return tfidf_dict

tfidf_representations = [calculate_tfidf(tf, idf_values) for tf in tf_representations]
print("TF-IDF representation for each document:", tfidf_representations)

TF-IDF representation for each document: [{'sample': 0.6931471805599453, 'document': 0.28768207245178085, 'demonstrating': 0.6931471805599453, 'text': 0.28768207245178085, 'preprocessing': 0.6931471805599453}, {'includes': 0.6931471805599453, 'multiple': 0.6931471805599453, 'sentence': 0.6931471805599453, 'common': 0.6931471805599453, 'word': 0.28768207245178085, 'like': 0.6931471805599453}, {'perform': 0.6931471805599453, 'tokenization': 0.6931471805599453, 'po': 0.6931471805599453, 'tagging': 0.6931471805599453, 'stop': 0.6931471805599453, 'word': 0.28768207245178085, 'removal': 0.6931471805599453, 'stemming': 0.6931471805599453, 'lemmatization': 0.6931471805599453, 'document': 0.28768207245178085}, {'goal': 0.6931471805599453, 'understand': 0.6931471805599453, 'basic': 0.6931471805599453, 'step': 0.6931471805599453, 'involved': 0.6931471805599453, 'preparing': 0.6931471805599453, 'text': 0.28768207245178085, 'data': 0.6931471805599453, 'analysis': 0.6931471805599453}]
