### 1. Gathering Data

In [1]:
import pandas as pd 
data = {
    "text": [
        "I LOOOVE this product 😍😍!!! Highly recommended... #awesome",
        "Worst. Experience. Ever. Will NEVER buy again!!! 🤮🤬",
        "meh... it was okay, I guess. kinda boring tho 🙄",
        "ABSOLUTELY AMAZING SERVICE!!! 😍💯 www.company.com",
        "@brand You guys rock! Keep it up 👏🔥🔥🔥",
        "Totally disappointed. Delivery late, product broken. 😡",
        "just okay. nothing special. 3/10 maybe 🤷‍♂️",
        "Loved the color, but the fit was terrible :(",
        "Refunded. Not worth the price!!! http://badshop.com",
        "Thanks @brand for the quick support!!"
    ],
    "label": [
        "positive", "negative", "neutral", "positive", "positive",
        "negative", "neutral", "negative", "negative", "positive"
    ]
}

df = pd.DataFrame(data) 
df



Unnamed: 0,text,label
0,I LOOOVE this product 😍😍!!! Highly recommended...,positive
1,Worst. Experience. Ever. Will NEVER buy again!...,negative
2,"meh... it was okay, I guess. kinda boring tho 🙄",neutral
3,ABSOLUTELY AMAZING SERVICE!!! 😍💯 www.company.com,positive
4,@brand You guys rock! Keep it up 👏🔥🔥🔥,positive
5,"Totally disappointed. Delivery late, product b...",negative
6,just okay. nothing special. 3/10 maybe 🤷‍♂️,neutral
7,"Loved the color, but the fit was terrible :(",negative
8,Refunded. Not worth the price!!! http://badsho...,negative
9,Thanks @brand for the quick support!!,positive


### 2. Change to a csv File

In [2]:
df.to_csv('../data/test_sample.csv')
df.head()

Unnamed: 0,text,label
0,I LOOOVE this product 😍😍!!! Highly recommended...,positive
1,Worst. Experience. Ever. Will NEVER buy again!...,negative
2,"meh... it was okay, I guess. kinda boring tho 🙄",neutral
3,ABSOLUTELY AMAZING SERVICE!!! 😍💯 www.company.com,positive
4,@brand You guys rock! Keep it up 👏🔥🔥🔥,positive


### 3. Lowercasing

In [3]:
df['text'] 
print(type(df['text']))  
df['clean_text'] = df['text'].apply(lambda word:word.lower())
df[['text', 'clean_text']]


<class 'pandas.core.series.Series'>


Unnamed: 0,text,clean_text
0,I LOOOVE this product 😍😍!!! Highly recommended...,i looove this product 😍😍!!! highly recommended...
1,Worst. Experience. Ever. Will NEVER buy again!...,worst. experience. ever. will never buy again!...
2,"meh... it was okay, I guess. kinda boring tho 🙄","meh... it was okay, i guess. kinda boring tho 🙄"
3,ABSOLUTELY AMAZING SERVICE!!! 😍💯 www.company.com,absolutely amazing service!!! 😍💯 www.company.com
4,@brand You guys rock! Keep it up 👏🔥🔥🔥,@brand you guys rock! keep it up 👏🔥🔥🔥
5,"Totally disappointed. Delivery late, product b...","totally disappointed. delivery late, product b..."
6,just okay. nothing special. 3/10 maybe 🤷‍♂️,just okay. nothing special. 3/10 maybe 🤷‍♂️
7,"Loved the color, but the fit was terrible :(","loved the color, but the fit was terrible :("
8,Refunded. Not worth the price!!! http://badsho...,refunded. not worth the price!!! http://badsho...
9,Thanks @brand for the quick support!!,thanks @brand for the quick support!!


###  4. Remove Punctuation and Symbols

In [4]:
import string
print(string.punctuation)
mytable = str.maketrans('','',string.punctuation)
df['clean_text'] = df['clean_text'].apply(lambda word : word.translate(mytable)) 
df['clean_text']


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


0    i looove this product 😍😍 highly recommended aw...
1        worst experience ever will never buy again 🤮🤬
2           meh it was okay i guess kinda boring tho 🙄
3          absolutely amazing service 😍💯 wwwcompanycom
4                  brand you guys rock keep it up 👏🔥🔥🔥
5    totally disappointed delivery late product bro...
6             just okay nothing special 310 maybe 🤷‍♂️
7            loved the color but the fit was terrible 
8          refunded not worth the price httpbadshopcom
9                   thanks brand for the quick support
Name: clean_text, dtype: object

### 5. Remove Stopwords

In [5]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
print(stopwords.words('english'))
stop_words = set(stopwords.words('english'))
df['no_stopwords'] = df['clean_text'].apply(
  lambda text :  ''.join(word for word in text.split() if word not in stop_words)
)
df

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /home/ahmed-
[nltk_data]     sameh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,label,clean_text,no_stopwords
0,I LOOOVE this product 😍😍!!! Highly recommended...,positive,i looove this product 😍😍 highly recommended aw...,loooveproduct😍😍highlyrecommendedawesome
1,Worst. Experience. Ever. Will NEVER buy again!...,negative,worst experience ever will never buy again 🤮🤬,worstexperienceeverneverbuy🤮🤬
2,"meh... it was okay, I guess. kinda boring tho 🙄",neutral,meh it was okay i guess kinda boring tho 🙄,mehokayguesskindaboringtho🙄
3,ABSOLUTELY AMAZING SERVICE!!! 😍💯 www.company.com,positive,absolutely amazing service 😍💯 wwwcompanycom,absolutelyamazingservice😍💯wwwcompanycom
4,@brand You guys rock! Keep it up 👏🔥🔥🔥,positive,brand you guys rock keep it up 👏🔥🔥🔥,brandguysrockkeep👏🔥🔥🔥
5,"Totally disappointed. Delivery late, product b...",negative,totally disappointed delivery late product bro...,totallydisappointeddeliverylateproductbroken😡
6,just okay. nothing special. 3/10 maybe 🤷‍♂️,neutral,just okay nothing special 310 maybe 🤷‍♂️,okaynothingspecial310maybe🤷‍♂️
7,"Loved the color, but the fit was terrible :(",negative,loved the color but the fit was terrible,lovedcolorfitterrible
8,Refunded. Not worth the price!!! http://badsho...,negative,refunded not worth the price httpbadshopcom,refundedworthpricehttpbadshopcom
9,Thanks @brand for the quick support!!,positive,thanks brand for the quick support,thanksbrandquicksupport


### 6. Remove Emojis

In [7]:
import emoji

df['no_emoji'] = df['no_stopwords'].apply(lambda row: emoji.replace_emoji(row, replace=''))
df



Unnamed: 0,text,label,clean_text,no_stopwords,no_emoji
0,I LOOOVE this product 😍😍!!! Highly recommended...,positive,i looove this product 😍😍 highly recommended aw...,loooveproduct😍😍highlyrecommendedawesome,loooveproducthighlyrecommendedawesome
1,Worst. Experience. Ever. Will NEVER buy again!...,negative,worst experience ever will never buy again 🤮🤬,worstexperienceeverneverbuy🤮🤬,worstexperienceeverneverbuy
2,"meh... it was okay, I guess. kinda boring tho 🙄",neutral,meh it was okay i guess kinda boring tho 🙄,mehokayguesskindaboringtho🙄,mehokayguesskindaboringtho
3,ABSOLUTELY AMAZING SERVICE!!! 😍💯 www.company.com,positive,absolutely amazing service 😍💯 wwwcompanycom,absolutelyamazingservice😍💯wwwcompanycom,absolutelyamazingservicewwwcompanycom
4,@brand You guys rock! Keep it up 👏🔥🔥🔥,positive,brand you guys rock keep it up 👏🔥🔥🔥,brandguysrockkeep👏🔥🔥🔥,brandguysrockkeep
5,"Totally disappointed. Delivery late, product b...",negative,totally disappointed delivery late product bro...,totallydisappointeddeliverylateproductbroken😡,totallydisappointeddeliverylateproductbroken
6,just okay. nothing special. 3/10 maybe 🤷‍♂️,neutral,just okay nothing special 310 maybe 🤷‍♂️,okaynothingspecial310maybe🤷‍♂️,okaynothingspecial310maybe
7,"Loved the color, but the fit was terrible :(",negative,loved the color but the fit was terrible,lovedcolorfitterrible,lovedcolorfitterrible
8,Refunded. Not worth the price!!! http://badsho...,negative,refunded not worth the price httpbadshopcom,refundedworthpricehttpbadshopcom,refundedworthpricehttpbadshopcom
9,Thanks @brand for the quick support!!,positive,thanks brand for the quick support,thanksbrandquicksupport,thanksbrandquicksupport


### 7. Tokenization

In [13]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from wordsegment import load, segment

# Download 'punkt' to a custom directory and append it to nltk path
nltk.download('punkt')

# Initialize wordsegment
load()
# Check for NaN or empty strings
df['no_emoji'] = df['no_emoji'].fillna('').astype(str)
print(df['no_emoji'].head())  # Confirm it prints strings like 'absolutelyamazingservice...'

df['segmented'] = df['no_emoji'].apply(lambda text: ' '.join(segment(text)))
tokenizer = lambda text: re.findall(r'\b\w+\b', text.lower())
df['tokenized'] = df['segmented'].apply(tokenizer)
df['tokenized']


[nltk_data] Downloading package punkt to /home/ahmed-
[nltk_data]     sameh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0    loooveproducthighlyrecommendedawesome
1              worstexperienceeverneverbuy
2               mehokayguesskindaboringtho
3    absolutelyamazingservicewwwcompanycom
4                        brandguysrockkeep
Name: no_emoji, dtype: object


0      [looove, product, highly, recommended, awesome]
1                [worst, experience, ever, never, buy]
2               [meh, okay, guess, kinda, boring, tho]
3    [absolutely, amazing, service, www, company, com]
4                            [brand, guys, rock, keep]
5    [totally, disappointed, delivery, late, produc...
6               [okay, nothing, special, 310, may, be]
7                        [loved, color, fit, terrible]
8       [refunded, worth, price, http, bad, shop, com]
9                      [thanks, brand, quick, support]
Name: tokenized, dtype: object

### 8. Lemmatization Test

In [24]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import nltk
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('punkt_tab')
def get_wordnet_pos(treebank_tag):
    # Map POS tag to WordNet POS tag for lemmatizer
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default

sentence = "The striped bats are hanging on their feet for best"
tokens = word_tokenize(sentence)
print("Tokens : ",tokens)
#Takes a list of words (tokens) from your sentence.
#Assigns a Part-of-Speech (POS) tag to each word, like noun, verb, adjective, etc.
#Returns a list of tuples: (word, POS_tag) for every token.
pos_tags = pos_tag(tokens)
print("POS tag : ",pos_tags)
lemmatizer = WordNetLemmatizer()
print(lemmatizer)
lemmatized_sentence = []
for token, tag in pos_tags:
    wn_tag = get_wordnet_pos(tag)
    lemma = lemmatizer.lemmatize(token, wn_tag)
    lemmatized_sentence.append(lemma)

print("Original:", sentence)
print("Lemmatized:", " ".join(lemmatized_sentence))
# "bats" → "bat"

# "are" → "be"

# "hanging" → "hang"

# "feet" → "foot"

Tokens :  ['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']
POS tag :  [('The', 'DT'), ('striped', 'JJ'), ('bats', 'NNS'), ('are', 'VBP'), ('hanging', 'VBG'), ('on', 'IN'), ('their', 'PRP$'), ('feet', 'NNS'), ('for', 'IN'), ('best', 'JJS')]
<WordNetLemmatizer>
Original: The striped bats are hanging on their feet for best
Lemmatized: The striped bat be hang on their foot for best


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/ahmed-sameh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ahmed-
[nltk_data]     sameh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ahmed-
[nltk_data]     sameh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
