# **NLP Text Preprocessing**

In [26]:
import pandas as pd

In [27]:
df = pd.read_csv('Reviews.csv', usecols=['ProductId', 'Summary', 'Text'])
df.head()

Unnamed: 0,ProductId,Summary,Text
0,B001E4KFG0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,B00813GRG4,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,"""Delight"" says it all",This is a confection that has been around a fe...
3,B000UA0QIQ,Cough Medicine,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,Great taffy,Great taffy at a great price. There was a wid...


## **Lowercasing**

In [28]:
df['ProductId'] = df['ProductId'].str.lower()
df['Summary'] = df['Summary'].str.lower()
df['Text'] = df['Text'].str.lower()

In [29]:
df.head()

Unnamed: 0,ProductId,Summary,Text
0,b001e4kfg0,good quality dog food,i have bought several of the vitality canned d...
1,b00813grg4,not as advertised,product arrived labeled as jumbo salted peanut...
2,b000lqoch0,"""delight"" says it all",this is a confection that has been around a fe...
3,b000ua0qiq,cough medicine,if you are looking for the secret ingredient i...
4,b006k2zz7k,great taffy,great taffy at a great price. there was a wid...


## **Removing html tags**

In [30]:
import re

def remove_html_tags(text):
    clean_text = re.sub('<.*?>', '', text)
    return clean_text

In [31]:
df['Text'] = df['Text'].apply(remove_html_tags)

In [32]:
df.head()

Unnamed: 0,ProductId,Summary,Text
0,b001e4kfg0,good quality dog food,i have bought several of the vitality canned d...
1,b00813grg4,not as advertised,product arrived labeled as jumbo salted peanut...
2,b000lqoch0,"""delight"" says it all",this is a confection that has been around a fe...
3,b000ua0qiq,cough medicine,if you are looking for the secret ingredient i...
4,b006k2zz7k,great taffy,great taffy at a great price. there was a wid...


## **Removing URL's**

In [33]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    clean_text = re.sub(url_pattern, '', text)
    return clean_text

In [34]:
df['Text'] = df['Text'].apply(remove_urls)

In [35]:
df.head()

Unnamed: 0,ProductId,Summary,Text
0,b001e4kfg0,good quality dog food,i have bought several of the vitality canned d...
1,b00813grg4,not as advertised,product arrived labeled as jumbo salted peanut...
2,b000lqoch0,"""delight"" says it all",this is a confection that has been around a fe...
3,b000ua0qiq,cough medicine,if you are looking for the secret ingredient i...
4,b006k2zz7k,great taffy,great taffy at a great price. there was a wid...


## **Removing Punctuations**

In [36]:
import string

def remove_punctuation(text):
    punctuation = string.punctuation
    clean_text = text.translate(str.maketrans('', '', punctuation))
    return clean_text

In [37]:
df['Text'] = df['Text'].apply(remove_punctuation)

In [38]:
df.head()

Unnamed: 0,ProductId,Summary,Text
0,b001e4kfg0,good quality dog food,i have bought several of the vitality canned d...
1,b00813grg4,not as advertised,product arrived labeled as jumbo salted peanut...
2,b000lqoch0,"""delight"" says it all",this is a confection that has been around a fe...
3,b000ua0qiq,cough medicine,if you are looking for the secret ingredient i...
4,b006k2zz7k,great taffy,great taffy at a great price there was a wide...


## **Chat Word Treatment**

In [39]:
chat_words_mapping = {
    "lol": "laughing out loud",
    "brb": "be right back",
    "btw": "by the way",
    "afk": "away from keyboard",
    "rofl": "rolling on the floor laughing",
    "ttyl": "talk to you later",
    "np": "no problem",
    "thx": "thanks",
    "omg": "oh my god",
    "idk": "I don't know",
    "np": "no problem",
    "gg": "good game",
    "g2g": "got to go",
    "b4": "before",
    "cu": "see you",
    "yw": "you're welcome",
    "wtf": "what the f*ck",
    "imho": "in my humble opinion",
    "jk": "just kidding",
    "gf": "girlfriend",
    "bf": "boyfriend",
    "u": "you",
    "r": "are",
    "2": "to",
    "4": "for",
    "b": "be",
    "c": "see",
    "y": "why",
    "tho": "though",
    "smh": "shaking my head",
    "lolz": "laughing out loud",
    "h8": "hate",
    "luv": "love",
    "pls": "please",
    "sry": "sorry",
    "tbh": "to be honest",
    "omw": "on my way",
    "omw2syg": "on my way to see your girlfriend",
}

def expand_chat_words(text):
    words = text.split()
    expanded_words = [chat_words_mapping.get(word.lower(), word) for word in words]
    return ' '.join(expanded_words)

df['Text'] = df['Text'].apply(expand_chat_words)

In [40]:
df.head()

Unnamed: 0,ProductId,Summary,Text
0,b001e4kfg0,good quality dog food,i have bought several of the vitality canned d...
1,b00813grg4,not as advertised,product arrived labeled as jumbo salted peanut...
2,b000lqoch0,"""delight"" says it all",this is a confection that has been around a fe...
3,b000ua0qiq,cough medicine,if you are looking for the secret ingredient i...
4,b006k2zz7k,great taffy,great taffy at a great price there was a wide ...


## **Spelling Correction**

In [41]:
from textblob import TextBlob

def correct_spelling(text):
    blob = TextBlob(text)
    corrected_text = str(blob.correct())
    return corrected_text

text_with_spelling_errors = "Thsi is an exmaple of speling corrction."
corrected_text = correct_spelling(text_with_spelling_errors)
print(corrected_text)

His is an example of spelling correction.


In [42]:
from spellchecker import SpellChecker

def correct_spelling(text):
    spell = SpellChecker()
    corrected_words = []
    for word in text.split():
        corrected_word = spell.correction(word)
        corrected_words.append(corrected_word if corrected_word is not None else word)
    corrected_text = ' '.join(corrected_words)
    return corrected_text

text_with_spelling_errors = "Thsi is an exmaple of speling corrction."
corrected_text = correct_spelling(text_with_spelling_errors)
print(corrected_text)

this is an example of spelling corrections


## **Removing Stop Words**

In [43]:
import nltk
from nltk.corpus import stopwords

def remove_stop_words(text):
	tokens = nltk.word_tokenize(text)
	stop_words = set(stopwords.words('english'))
	filtered_tokens = [token for token in tokens if token not in stop_words]
	preprocessed_text = ' '.join(filtered_tokens)
	return preprocessed_text

In [44]:
input_text = "This is an example sentence."
preprocessed_text = remove_stop_words(input_text)
print(preprocessed_text)

This example sentence .


In [45]:
df['Text'] = df['Text'].apply(remove_stop_words)

In [46]:
df.head()

Unnamed: 0,ProductId,Summary,Text
0,b001e4kfg0,good quality dog food,bought several vitality canned dog food produc...
1,b00813grg4,not as advertised,product arrived labeled jumbo salted peanutsth...
2,b000lqoch0,"""delight"" says it all",confection around centuries light pillowy citr...
3,b000ua0qiq,cough medicine,looking secret ingredient robitussin believe f...
4,b006k2zz7k,great taffy,great taffy great price wide assortment yummy ...


## **Handling Emogis**

### **Removing these emogis**

In [47]:
import re

def remove_emojis(text):
    emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"
                            u"\U0001F300-\U0001F5FF"
                            u"\U0001F680-\U0001F6FF"
                            u"\U0001F1E0-\U0001F1FF"
                            u"\U00002500-\U00002BEF"
                            u"\U00002702-\U000027B0"
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u"\U00010000-\U0010ffff"
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u200d"
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"
                            u"\u3030"
                            "]+", flags=re.UNICODE)
    cleaned_text = emoji_pattern.sub(r'', text)
    return cleaned_text

text_with_emojis = "I love 🐱 and 🐶"
text_without_emojis = remove_emojis(text_with_emojis)
print(text_without_emojis)

I love  and 


### **Replacing emogis with its meaning**

In [48]:
import emoji

def replace_emojis_with_meanings(text):
    def replace(match):
        emoji_char = match.group()
        emoji_meaning = emoji.demojize(emoji_char)
        return emoji_meaning

    emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"
                            u"\U0001F300-\U0001F5FF"
                            u"\U0001F680-\U0001F6FF"
                            u"\U0001F1E0-\U0001F1FF"
                            u"\U00002500-\U00002BEF"
                            u"\U00002702-\U000027B0"
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u"\U00010000-\U0010ffff"
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u200d"
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"
                            u"\u3030"
                            "]+", flags=re.UNICODE)
    text_with_meanings = emoji_pattern.sub(replace, text)
    return text_with_meanings

text_with_emojis = "I love 🐱 and 🐶"
text_with_meanings = replace_emojis_with_meanings(text_with_emojis)
print(text_with_meanings)

I love :cat_face: and :dog_face:


## **Tokenization**

### **Word Tokenization**

In [51]:
def word_tokenization(text):
    return nltk.word_tokenize(text)

print(word_tokenization(df['Summary'][0]))

['good', 'quality', 'dog', 'food']


In [52]:
df.head()

Unnamed: 0,ProductId,Summary,Text
0,b001e4kfg0,good quality dog food,bought several vitality canned dog food produc...
1,b00813grg4,not as advertised,product arrived labeled jumbo salted peanutsth...
2,b000lqoch0,"""delight"" says it all",confection around centuries light pillowy citr...
3,b000ua0qiq,cough medicine,looking secret ingredient robitussin believe f...
4,b006k2zz7k,great taffy,great taffy great price wide assortment yummy ...


### **Sentence Tokenization**

In [53]:
def sentence_tokenization(text):
    return nltk.sent_tokenize(text)

df['Text'] = df['Text'].apply(sentence_tokenization)

In [55]:
df.head()

Unnamed: 0,ProductId,Summary,Text
0,b001e4kfg0,good quality dog food,[bought several vitality canned dog food produ...
1,b00813grg4,not as advertised,[product arrived labeled jumbo salted peanutst...
2,b000lqoch0,"""delight"" says it all",[confection around centuries light pillowy cit...
3,b000ua0qiq,cough medicine,[looking secret ingredient robitussin believe ...
4,b006k2zz7k,great taffy,[great taffy great price wide assortment yummy...


### **Custom Tokenization**

In [54]:
def custom_tokenization(text, method='word'):
    if method == 'word':
        tokens = re.findall(r'\b\w+\b', text)
    elif method == 'sentence':
        tokens = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    elif method == 'char':
        tokens = list(text)
    else:
        raise ValueError("Invalid tokenization method. Choose one of 'word', 'sentence', or 'char'.")

    return tokens

text = "Tokenization is an important step in NLP."
word_tokens = custom_tokenization(text, method='word')
sentence_tokens = custom_tokenization(text, method='sentence')
char_tokens = custom_tokenization(text, method='char')

print("Word tokens:", word_tokens)
print("Sentence tokens:", sentence_tokens)
print("Character tokens:", char_tokens)

Word tokens: ['Tokenization', 'is', 'an', 'important', 'step', 'in', 'NLP']
Sentence tokens: ['Tokenization is an important step in NLP.']
Character tokens: ['T', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', ' ', 'i', 's', ' ', 'a', 'n', ' ', 'i', 'm', 'p', 'o', 'r', 't', 'a', 'n', 't', ' ', 's', 't', 'e', 'p', ' ', 'i', 'n', ' ', 'N', 'L', 'P', '.']


### **Tokenization using Spacy Libarary**

In [58]:
import spacy

def custom_tokenization(text, method='word'):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    
    if method == 'word':
        # Word tokenization
        tokens = [token.text for token in doc]
    elif method == 'sentence':
        # Sentence tokenization
        tokens = [sent.text for sent in doc.sents]
    elif method == 'char':
        # Character tokenization
        tokens = list(text)
    else:
        raise ValueError("Invalid tokenization method. Choose one of 'word', 'sentence', or 'char'.")

    return tokens

text = "Tokenization is an important step in NLP."
word_tokens = custom_tokenization(text, method='word')
sentence_tokens = custom_tokenization(text, method='sentence')
char_tokens = custom_tokenization(text, method='char')

print("Word tokens:", word_tokens)
print("Sentence tokens:", sentence_tokens)
print("Character tokens:", char_tokens)

Word tokens: ['Tokenization', 'is', 'an', 'important', 'step', 'in', 'NLP', '.']
Sentence tokens: ['Tokenization is an important step in NLP.']
Character tokens: ['T', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', ' ', 'i', 's', ' ', 'a', 'n', ' ', 'i', 'm', 'p', 'o', 'r', 't', 'a', 'n', 't', ' ', 's', 't', 'e', 'p', ' ', 'i', 'n', ' ', 'N', 'L', 'P', '.']
