In [35]:
# Import libraries
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import contractions
import pickle
from nltk.tokenize.treebank import TreebankWordTokenizer

In [36]:
# Fetch the NLTK data packages used by this notebook.
# nltk.download() only reports "already up-to-date" when a package is present,
# so re-running this cell is harmless.
for package in ('punkt', 'stopwords'):
    fetched = nltk.download(package)
fetched  # status returned by the last download call

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [37]:
# Step 1: Load the messy data
# The file is read as UTF-8 because the text column contains emoji.
raw_csv_path = 'D:/Nitte_NLP/nlp-basics/Day-1/2_Text-preprocessong/text_preprocessing.csv'
df = pd.read_csv(raw_csv_path, encoding='utf-8')

# Print the loaded frame (all rows, not just the head)
print(df)

                                                text
0   Hey THERE!!! How's everything going? 😃👍 #excited
1  I'm loving this new phone... It's AMAZING!!! P...
2  She said, 'I'll be there at 5pm!!' But she arr...
3  Let's test NLP's ability to clean texts: email...
4  Best day everrrrr!!! Gonna remember it forever...
5  Can't believe it's already 2025... Time flies! 🕒🚀
6         Ewwww, that was soooo gross 🤢🤮 #neveragain
7  Visit us at www.awesome-place.com or call 1800...
8  OMG!!! 😱😱 Such an unexpected turn of events......
9  Happy birthdayyyyyyy!!!! 🎂🎉🎈 Wishing you lots ...


In [38]:
# Step 2: Lowercasing
# The original 'text' column is kept; the lower-cased copy is stored next to it.
lowered_text = df['text'].str.lower()
df['text_lower'] = lowered_text
df  # last expression: render the frame with the new column

Unnamed: 0,text,text_lower
0,Hey THERE!!! How's everything going? 😃👍 #excited,hey there!!! how's everything going? 😃👍 #excited
1,I'm loving this new phone... It's AMAZING!!! P...,i'm loving this new phone... it's amazing!!! p...
2,"She said, 'I'll be there at 5pm!!' But she arr...","she said, 'i'll be there at 5pm!!' but she arr..."
3,Let's test NLP's ability to clean texts: email...,let's test nlp's ability to clean texts: email...
4,Best day everrrrr!!! Gonna remember it forever...,best day everrrrr!!! gonna remember it forever...
5,Can't believe it's already 2025... Time flies! 🕒🚀,can't believe it's already 2025... time flies! 🕒🚀
6,"Ewwww, that was soooo gross 🤢🤮 #neveragain","ewwww, that was soooo gross 🤢🤮 #neveragain"
7,Visit us at www.awesome-place.com or call 1800...,visit us at www.awesome-place.com or call 1800...
8,OMG!!! 😱😱 Such an unexpected turn of events......,omg!!! 😱😱 such an unexpected turn of events......
9,Happy birthdayyyyyyy!!!! 🎂🎉🎈 Wishing you lots ...,happy birthdayyyyyyy!!!! 🎂🎉🎈 wishing you lots ...


In [39]:
# Step 3: Expand contractions
# contractions.fix rewrites short forms, e.g. "how's" -> "how is" (see the printed sample).
expanded_text = df['text_lower'].apply(contractions.fix)
df['text_no_contractions'] = expanded_text
print("\nAfter Expanding Contractions:")
print(df['text_no_contractions'].head())


After Expanding Contractions:
0    hey there!!! how is everything going? 😃👍 #excited
1    i am loving this new phone... it is amazing!!!...
2    she said, 'i will be there at 5pm!!' but she a...
3    let us test nlp's ability to clean texts: emai...
4    best day everrrrr!!! going to remember it fore...
Name: text_no_contractions, dtype: object


In [40]:
# Step 4: Remove punctuation
# The deletion table is built once, outside the per-row lambda, instead of once per row.
# string.punctuation only lists ASCII punctuation, so non-ASCII marks are left in place.
# Characters are deleted (not replaced by a space), so "nlp's" becomes "nlps".
punctuation_table = str.maketrans('', '', string.punctuation)
df['text_no_punct'] = df['text_no_contractions'].apply(lambda text: text.translate(punctuation_table))
print("\nAfter Removing Punctuation:")
print(df['text_no_punct'].head())


After Removing Punctuation:
0         hey there how is everything going 😃👍 excited
1    i am loving this new phone it is amazing price...
2    she said i will be there at 5pm but she arrive...
3    let us test nlps ability to clean texts emails...
4    best day everrrrr going to remember it forever...
Name: text_no_punct, dtype: object


In [41]:
# Step 5: Remove digits
# Each run of digits is deleted outright (no space is inserted), so "5pm" becomes "pm".
digit_run = re.compile(r'\d+')
df['text_no_digits'] = df['text_no_punct'].apply(lambda text: digit_run.sub('', text))
print("\nAfter Removing Digits:")
print(df['text_no_digits'].head())


After Removing Digits:
0         hey there how is everything going 😃👍 excited
1    i am loving this new phone it is amazing price  🤳
2    she said i will be there at pm but she arrived...
3    let us test nlps ability to clean texts emails...
4    best day everrrrr going to remember it forever...
Name: text_no_digits, dtype: object


In [50]:
# Step 6: Remove URLs and emojis
# 1. Emoji pattern: a single character class covering the listed Unicode blocks.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002700-\U000027BF"  # dingbats
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002300-\U000023FF"  # miscellaneous technical
    "\uFE0F"                 # variation selector-16, otherwise left behind as an invisible token
    "]+",
    flags=re.UNICODE,
)

# 2. URL pattern
url_pattern = re.compile(r'https?://\S+|www\.\S+')

# A URL can only be recognised while its punctuation ("://", ".") is still present.
# 'text_no_digits' has already lost all punctuation, so matching against it never
# succeeds. URLs are therefore removed from the contraction-expanded text, and the
# punctuation/digit stripping is re-applied afterwards so that 'text_no_urls' stays
# free of punctuation and digits as before.
_punctuation_table = str.maketrans('', '', string.punctuation)
_digit_run = re.compile(r'\d+')


def strip_urls(text):
    """Remove URLs from `text`, then delete ASCII punctuation and digit runs.

    Args:
        text: a lower-cased, contraction-expanded string that still has its punctuation.

    Returns:
        The string with URLs, ASCII punctuation and digits removed.
    """
    without_urls = url_pattern.sub('', text)
    without_punct = without_urls.translate(_punctuation_table)
    return _digit_run.sub('', without_punct)


# Step 1: Remove URLs first (while they are still intact)
df['text_no_urls'] = df['text_no_contractions'].apply(strip_urls)

# Step 2: Then remove emojis
df['text_no_emojis'] = df['text_no_urls'].apply(lambda text: emoji_pattern.sub('', text))

In [51]:
# Step 7: Tokenization (word level)
# The stopword list is fetched again here; it is consumed by the stopword-removal step.
nltk.download('stopwords')

# Only word tokens are produced in this cell. A 'punkt' sentence tokenizer would
# only be needed for sentence splitting, so none is loaded.

# Treebank tokenizer instance used for every row
word_tokenizer = TreebankWordTokenizer()

# Tokenize the fully cleaned text column
df['word_tokens'] = df['text_no_emojis'].apply(word_tokenizer.tokenize)

print("\nAfter Word Tokenization:")
print(df['word_tokens'].head())


After Word Tokenization:
0    [hey, there, how, is, everything, going, excited]
1    [i, am, loving, this, new, phone, it, is, amaz...
2    [she, said, i, will, be, there, at, pm, but, s...
3    [let, us, test, nlps, ability, to, clean, text...
4    [best, day, everrrrr, going, to, remember, it,...
Name: word_tokens, dtype: object


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
# Step 8: Stopword Removal
# A set gives constant-time membership checks for the per-token filter below.
stop_words = set(stopwords.words('english'))


def drop_stopwords(tokens):
    """Return the tokens that are not in `stop_words`, keeping their original order."""
    return [token for token in tokens if token not in stop_words]


df['tokens_no_stopwords'] = df['word_tokens'].apply(drop_stopwords)
print("\nAfter Stopword Removal:")
print(df['tokens_no_stopwords'].head())


After Stopword Removal:
0                    [hey, everything, going, excited]
1                 [loving, new, phone, amazing, price]
2                                  [said, pm, arrived]
3    [let, us, test, nlps, ability, clean, texts, e...
4    [best, day, everrrrr, going, remember, forever...
Name: tokens_no_stopwords, dtype: object


In [53]:
# Step 9: N-grams (bigram example)
from nltk import ngrams


def generate_bigrams(tokens):
    """Return the consecutive token pairs of `tokens` as a list of 2-tuples.

    Args:
        tokens: a list of token strings.

    Returns:
        A list of (left, right) tuples produced by nltk.ngrams with n=2.
    """
    return [pair for pair in ngrams(tokens, 2)]


# The input is the stopword-filtered token list, so a pair can join words that were
# not adjacent in the original sentence (e.g. ('said', 'pm')).
df['bigrams'] = df['tokens_no_stopwords'].apply(generate_bigrams)
print("\nAfter Generating Bigrams:")
print(df['bigrams'].head())


After Generating Bigrams:
0    [(hey, everything), (everything, going), (goin...
1    [(loving, new), (new, phone), (phone, amazing)...
2                          [(said, pm), (pm, arrived)]
3    [(let, us), (us, test), (test, nlps), (nlps, a...
4    [(best, day), (day, everrrrr), (everrrrr, goin...
Name: bigrams, dtype: object


In [54]:
# Step 10: Lemmatization
from nltk.stem import WordNetLemmatizer

# WordNet data (and the multilingual add-on) required by the lemmatizer
for corpus in ('wordnet', 'omw-1.4'):
    nltk.download(corpus)

lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token, in the same order.

    Args:
        tokens: a list of token strings.

    Returns:
        A list with one lemma per input token.
    """
    # NOTE(review): lemmatize() is called without a part-of-speech argument. The printed
    # sample shows "us" -> "u" while "going"/"loving" stay unchanged, which looks like
    # noun-only lemmatization -- confirm whether POS tags should be supplied.
    return list(map(lemmatizer.lemmatize, tokens))


df['lemmatized_tokens'] = df['tokens_no_stopwords'].apply(lemmatize_tokens)
print("\nAfter Lemmatization:")
print(df['lemmatized_tokens'].head())


After Lemmatization:
0                    [hey, everything, going, excited]
1                 [loving, new, phone, amazing, price]
2                                  [said, pm, arrived]
3    [let, u, test, nlp, ability, clean, text, emai...
4    [best, day, everrrrr, going, remember, forever...
Name: lemmatized_tokens, dtype: object


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [55]:
# Step 11: Stemming
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def stem_tokens(tokens):
    """Return the Porter stem of every token, in the same order.

    Args:
        tokens: a list of token strings.

    Returns:
        A list with one stem per input token; stems need not be dictionary words
        (the printed sample shows "everything" -> "everyth").
    """
    return list(map(stemmer.stem, tokens))


df['stemmed_tokens'] = df['tokens_no_stopwords'].apply(stem_tokens)
print("\nAfter Stemming:")
print(df['stemmed_tokens'].head())


After Stemming:
0                            [hey, everyth, go, excit]
1                      [love, new, phone, amaz, price]
2                                    [said, pm, arriv]
3    [let, us, test, nlp, abil, clean, text, email,...
4    [best, day, everrrrr, go, rememb, forev, bless...
Name: stemmed_tokens, dtype: object


In [56]:
# Save the frame, including every intermediate column, to the same folder as the input CSV.
# index=False keeps the auto-generated row index out of the file, so reloading it does not
# produce an extra "Unnamed: 0" column; the encoding matches the UTF-8 used when reading.
# List-valued columns (tokens, bigrams) are written as their string representation.
df.to_csv('D:/Nitte_NLP/nlp-basics/Day-1/2_Text-preprocessong/cleaned_text.csv', index=False, encoding='utf-8')

Assignment: What is the difference between stemming and lemmatization? Where should each be applied?