## Text preprocessing (clean data)

In [36]:
import pandas as pd
import string

In [37]:
df = pd.read_csv("twitter_training.csv")

In [38]:
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [39]:
df.shape

(74681, 4)

In [40]:
df.sample(10)

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
30279,7204,LeagueOfLegends,Neutral,3 RhandlerR RhandlerR RhandlerR RhandlerR . Fi...
60759,4810,GrandTheftAuto(GTA),Negative,@RockstarSupport I remember ’ i m having issue...
68730,3769,Cyberpunk2077,Positive,"A shame, but if you know CD Projects Red and k..."
37465,5224,Hearthstone,Neutral,I played Face Hunter at Heartstone and I regre...
9599,12853,Xbox(Xseries),Neutral,Xbox Series X Pre-Orders Have Been A Debacle S...
16142,3163,Dota2,Neutral,We are honored and Excited and announce that y...
22541,4259,CS-GO,Irrelevant,"that's it. I don't care what u say, I'm the be..."
51159,6381,FIFA,Positive,"Am I the only one who likes FIFA 21, is gonna ..."
44695,11676,Verizon,Negative,Verizon always tries to add bullshit when it c...
66835,7044,johnson&johnson,Irrelevant,for FUCKS SAKE!!!!!!! A MONTH JUST ONE MONTH W...


### step 1: convert the text into lower case

In [41]:
df.drop(columns= ["2401", "Borderlands"], inplace=True)

In [42]:
df.head()

Unnamed: 0,Positive,"im getting on borderlands and i will murder you all ,"
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [43]:
df["cleaned_twitter"] = df["im getting on borderlands and i will murder you all ,"].str.lower()

In [44]:
df.sample(10)

Unnamed: 0,Positive,"im getting on borderlands and i will murder you all ,",cleaned_twitter
50569,Irrelevant,A new bomb has fallen... the club is already s...,a new bomb has fallen... the club is already s...
989,Positive,Sweet,sweet
64268,Positive,Salute!!,salute!!
48260,Positive,I like it when they distribute free packs of w...,i like it when they distribute free packs of w...
42873,Irrelevant,Trickle in ya fuckers,trickle in ya fuckers
8064,Positive,I thought I'd posted them LMAO here's me and m...,i thought i'd posted them lmao here's me and m...
41653,Irrelevant,Ban for Battlefield 4 player SlothyNips has oc...,ban for battlefield 4 player slothynips has oc...
59432,Negative,Wanna Know How You Keep Inform When The News I...,wanna know how you keep inform when the news i...
54288,Negative,@ InfinityWard @ CallofDuty can't complete mis...,@ infinityward @ callofduty can't complete mis...
74552,Neutral,Check out this beautiful AORUS GeForce RTX SPE...,check out this beautiful aorus geforce rtx spe...


## Removal of punctuation ( period (called “full stop” in the UK), question mark, exclamation point, comma, colon, semicolon, dash, hyphen, brackets, braces, parentheses, apostrophe, quotation mark, and ellipsis.)

In [45]:
df.isna().sum()

Positive                                                   0
im getting on borderlands and i will murder you all ,    686
cleaned_twitter                                          686
dtype: int64

In [46]:
df.dropna(inplace=True)

In [47]:
df["cleaned_twitter"] = df["cleaned_twitter"].astype(str)

In [48]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [49]:
def RemovePunctuation(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('','',punctuations))

In [50]:
df["cleaned_twitter"] = df["cleaned_twitter"].apply(lambda x: RemovePunctuation(x))

In [51]:
df.sample(10)

Unnamed: 0,Positive,"im getting on borderlands and i will murder you all ,",cleaned_twitter
52015,Neutral,I particularly liked the part about how a team...,i particularly liked the part about how a team...
27327,Negative,A Ghost of the tsushima is a better assassins ...,a ghost of the tsushima is a better assassins ...
29838,Irrelevant,HAVE YOVER TRIED YUMMY FILIPINOS!? | SPANISH T...,have yover tried yummy filipinos spanish typi...
54129,Negative,V @CallofDuty @Blizzard_Ent seriously. I want ...,v callofduty blizzardent seriously i want this...
57330,Neutral,Don't come into the kitchen.,dont come into the kitchen
50569,Irrelevant,A new bomb has fallen... the club is already s...,a new bomb has fallen the club is already shak...
6629,Neutral,Urban CoCo Women's Basic Versatile Stretchy Fl...,urban coco womens basic versatile stretchy fla...
8060,Negative,overwatch just really listens to super famous ...,overwatch just really listens to super famous ...
54820,Irrelevant,..... just to call for panic.. he was right at...,just to call for panic he was right at mile t...
59934,Negative,Stupidity!,stupidity


## Removal of stopwords ( Stop words are common words, like "a," "the," and "is," that are often filtered out or removed from text during processing, as they typically don't carry much meaning and can hinder analysis. )

In [54]:
# stopwords are available in nltk library
from nltk.corpus import stopwords

In [56]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [59]:
stopwords.words("english")

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [60]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [61]:
df["cleaned_twitter"] = df["cleaned_twitter"].apply(lambda x: remove_stopwords(x))

In [62]:
df.sample(10)

Unnamed: 0,Positive,"im getting on borderlands and i will murder you all ,",cleaned_twitter
48529,Neutral,I'm going to do a cosplay rem (Death Note the ...,im going cosplay rem death note musical need h...
16279,Irrelevant,I am dota 2 and i like demetrius - Casey,dota 2 like demetrius casey
31979,Positive,Good times. 🤣,good times 🤣
58076,Negative,@Rainbow6Game Server are Available in Xbox 🥺,rainbow6game server available xbox 🥺
13930,Negative,I DONT WHO PARK R N N 1 @Beluba 2 @Beluba D @B...,dont park r n n 1 beluba 2 beluba beluba belub...
70202,Negative,@GhostRecon @UbisoftSupport Your console is s...,ghostrecon ubisoftsupport console still broke ...
16261,Negative,A Dangerous Toxic Marine Server,dangerous toxic marine server
64170,Negative,@ EA has not only the worst game in @ EAMadden...,ea worst game eamaddennfl also worst servers
15869,Irrelevant,holy shit they gave Diretide,holy shit gave diretide
26223,Neutral,It is very or very large,large


## Removing Frequent words

In [64]:
from collections import Counter
word_count = Counter()
for text in df["cleaned_twitter"]:
    for word in text.split():
        word_count[word] += 1
word_count.most_common(20)

[('game', 8056),
 ('like', 4878),
 ('2', 4057),
 ('get', 3941),
 ('im', 3827),
 ('one', 3581),
 ('good', 3548),
 ('play', 3413),
 ('really', 3260),
 ('love', 3239),
 ('new', 3208),
 ('time', 2944),
 ('people', 2878),
 ('johnson', 2831),
 ('see', 2769),
 ('shit', 2753),
 ('3', 2583),
 ('best', 2496),
 ('still', 2450),
 ('’', 2412)]

In [65]:
# Removing the top 3 frequent words
FREQUENT_WORDS = set(word for (word, wc) in word_count.most_common(3))
def remove_freq_words(text):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

In [66]:
df["cleaned_twitter"] = df["cleaned_twitter"].apply(lambda x: remove_freq_words(x))

In [67]:
df.sample(10)

Unnamed: 0,Positive,"im getting on borderlands and i will murder you all ,",cleaned_twitter
15752,Negative,and i cant even get a season 1 battle pass. li...,cant even get season 1 battle pass life unfair
29013,Neutral,2008 Apex Legends GOLD III RANKED MATCHES - Fr...,2008 apex legends gold iii ranked matches tras...
66531,Positive,By combining product.. Learn something more : ...,combining product learn something 4 buff 0 ly ...
62404,Irrelevant,I,
45169,Neutral,@ verizon,verizon
30950,Positive,Day ten of playing League of Brothers. My wife...,day ten playing league brothers wife left kids...
36974,Neutral,Microsoft staff struggle beneath the strains o...,microsoft staff struggle beneath strains 52000...
69440,Positive,That Cyberpunk Xbox One X sure felt impressive...,cyberpunk xbox one x sure felt impressive impr...
48276,Positive,Today's market recap - made big profits from H...,todays market recap made big profits home depo...
12001,Negative,MyTeam Man is stupid. Why are the packs so bad...,myteam man stupid packs bad deduction packs ex...


### Removal of Rare Words

In [68]:
RARE_WORDS = set(word for (word, wc) in word_count.most_common()[:-10:-1])
RARE_WORDS

{'challen',
 'doxmaxbitcc',
 'intend',
 'maxbit',
 'maxbitcc',
 'maxbitccnvidiadoesnt',
 'netcomgooglechallen',
 'nvidiadoes',
 'nvidiadoesnt'}

In [69]:
def remove_rare_words(text):
    return " ".join([word for word in text.split() if word not in RARE_WORDS])

In [70]:
df["cleaned_twitter"] = df["cleaned_twitter"].apply(lambda x: remove_rare_words(x))

In [71]:
df.sample(10)

Unnamed: 0,Positive,"im getting on borderlands and i will murder you all ,",cleaned_twitter
54440,Positive,This isn't amateur hour! I'm still crushing so...,isnt amateur hour im still crushing call duty ...
22072,Positive,then hear the people who want to play VALORANT...,hear people want play valorant saying pursue p...
44833,Positive,2.0 @ Charitymiles for @ alzassociation. Thank...,20 charitymiles alzassociation thanks verizon ...
65326,Negative,At last let players grind down your game you g...,last let players grind get extra gear want pay...
3581,Positive,This looks amazing! Have high hopes for this.....,looks amazing high hopes considering shite rec...
42787,Irrelevant,Got a whole load of work including some PUBG F...,got whole load work including pubg funny momen...
64179,Negative,@EAMaddenNFL... wtf think is this some garbage...,eamaddennfl wtf think garbage ass sound track ...
18094,Positive,still can’t wait for my brother to purchase th...,still can’t wait brother purchase ps5 simply w...
55931,Negative,"Here's the clip, before I exited the game you ...",heres clip exited see would let fully load thi...
57133,Positive,Nice lil fuze play here.,nice lil fuze play


## Removal of Special Characters

In [72]:
import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]',' ', text)
    text = re.sub('\s+', ' ', text)
    return text

In [74]:
df["cleaned_twitter"] = df["cleaned_twitter"].apply(lambda x: remove_spl_chars(x))

In [76]:
df.sample(10)

Unnamed: 0,Positive,"im getting on borderlands and i will murder you all ,",cleaned_twitter
54346,Negative,Garbage ass @CallofDuty a lagged me out 60 kil...,garbage ass callofduty lagged 60 kills solo ma...
65952,Neutral,Johnson & Johnson suspends COVID-19 vaccine tr...,johnson johnson suspends covid19 vaccine trial...
54985,Positive,"bitch, you don't need a call of duty anyway.",bitch dont need call duty anyway
14485,Positive,The. Best. Feeling. redd.it / gc4gm3,best feeling reddit gc4gm3
59016,Neutral,Panic at @ YouTube Panic at @ Facebook Panic a...,panic youtube panic facebook panic twitter rea...
32158,Positive,a,
22893,Neutral,bruh i d haven't played me csgo for like a fuc...,bruh havent played csgo fucking week wtf
72256,Neutral,Me How haha <unk>,haha unk
70855,Positive,Now that my AI teammates are back in @ GhostRe...,ai teammates back ghostrecon breakpoint would ...
69588,Positive,"Yes, Boy!.",yes boy


## Stemming ( Stemming is a technique used in Natural Language Processing (NLP) to reduce words to their root or base form by removing suffixes. The goal is to normalize words and improve text processing efficiency.)

In [77]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [79]:
df["stemmed_twitter"] = df["cleaned_twitter"].apply(lambda x: stem_words(x))

In [80]:
df.sample(10)

Unnamed: 0,Positive,"im getting on borderlands and i will murder you all ,",cleaned_twitter,stemmed_twitter
4318,Positive,The menu looks so nice. I just dont understand...,menu looks nice dont understand everyone thoug...,menu look nice dont understand everyon thought...
26947,Positive,"""Nothing is true, everything is permitted. We ...",nothing true everything permitted work dark se...,noth true everyth permit work dark serv light ...
14186,Positive,Its it's ain't much. but Im still happy I got ...,aint much im still happy got windowithcte5f,aint much im still happi got windowithcte5f
61593,Negative,@RockstarGames is the user prompt for deleting...,rockstargames user prompt deleting vehicles fu...,rockstargam user prompt delet vehicl full fami...
69627,Positive,This looks... kinda clean!,looks kinda clean,look kinda clean
53017,Irrelevant,Fact. Max Payne 3 is a low-key Rockstars best ...,fact max payne 3 lowkey rockstars best,fact max payn 3 lowkey rockstar best
20358,Neutral,I just earned the [Ahead of the Curve: N'Zoth ...,earned ahead curve nzoth corruptor performance,earn ahead curv nzoth corruptor perform
10966,Irrelevant,Colt is at it again. You guys ready for my won...,colt guys ready wonderful sweet sounds shhhhhh,colt guy readi wonder sweet sound shhhhhh
43127,Negative,youtu.be/c-aixISTD2U. Again 47 apps banned. PU...,youtubecaixistd2u 47 apps banned pubg ban pann...,youtubecaixistd2u 47 app ban pubg ban pannuvan...
71859,Neutral,* 5 Stewart voice * 2 Nooooooo stop iiiiiiiit,5 stewart voice nooooooo stop iiiiiiiit,5 stewart voic nooooooo stop iiiiiiiit


## Lemmatization is the process of reducing words to their base or dictionary form (lemma) while considering the word's meaning and part of speech (POS). Unlike stemming, it produces real words.

## POS tagging is the process of labeling words in a sentence based on their grammatical category (noun, verb, adjective, etc.). It helps in understanding sentence structure and is useful for lemmatization, named entity recognition (NER), and text parsing.

In [82]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [84]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...


True

In [85]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

In [86]:
def lemmatize_words(text):
    #find pos tags
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

In [89]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [90]:
df["lemmatized_twitter"] = df["cleaned_twitter"].apply(lambda x: lemmatize_words(x))

In [91]:
df.sample(10)

Unnamed: 0,Positive,"im getting on borderlands and i will murder you all ,",cleaned_twitter,stemmed_twitter,lemmatized_twitter
13411,Negative,"@ N2K Can you guys explain why this ""special o...",n2k guys explain special offer keeps growing b...,n2k guy explain special offer keep grow buy no...,n2k guy explain special offer keep grow buy no...
5348,Neutral,I didn't have massive data in . . Here's one d...,didnt massive data heres one detailed review,didnt massiv data here one detail review,didnt massive data here one detail review
19788,Neutral,Unluckiest roller I've seen so far....,unluckiest roller ive seen far,unluckiest roller ive seen far,unluckiest roller ive see far
71197,Negative,I hate playing against computer players not on...,hate playing computer players play crap also c...,hate play comput player play crap also chanc mous,hate playing computer player play crap also ch...
65043,Negative,More @EAMaddenNFL madden The 21 ’ s career mod...,eamaddennfl madden 21 career mode sucks please...,eamaddennfl madden 21 career mode suck pleas b...,eamaddennfl madden 21 career mode suck please ...
59515,Irrelevant,Another reason for QT's superiority.,another reason qts superiority,anoth reason qt superior,another reason qts superiority
42220,Irrelevant,Holy shit the kids are out my numbers again on...,holy shit kids numbers oce pubg bgk 0 0 one do...,holi shit kid number oce pubg bgk 0 0 one douy...,holy shit kid number oce pubg bgk 0 0 one douy...
60761,Positive,Grand Theft Auto 5 (EU) for a good Price g2a.c...,grand theft auto 5 eu good price g2acomrquicks...,grand theft auto 5 eu good price g2acomrquicks...,grand theft auto 5 eu good price g2acomrquicks...
11622,Neutral,It could go very well... or go horribly wrong.,could go well go horribly wrong,could go well go horribl wrong,could go well go horribly wrong
36322,Negative,Though this is great news personally. al.ly/30...,though great news personally ally30eeddn,though great news person ally30eeddn,though great news personally ally30eeddn


## Removing URLS

In [92]:
text = "https://www.yogesh.com is the url"

In [93]:
def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [94]:
remove_url(text)

' is the url'

## Removal of HTML tags

In [95]:
text = "<html> <body><h1> hehhehe </h1><p> This is NLP text preprocessing </p> </body></html>"

In [96]:
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

In [97]:
remove_html_tags(text)

'  hehhehe  This is NLP text preprocessing  '

## spelling correction

In [113]:
text = 'natur is beuty'

In [114]:
from spellchecker import SpellChecker
spell = SpellChecker()

In [115]:
def correct_spellings(text):
    corrected_text = []
    misspelled_text = spell.unknown(text.split())
    print(misspelled_text)
    for word in text.split():
        if word in misspelled_text:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [116]:
correct_spellings(text)

{'natur', 'beuty'}


'nature is beauty'

## word tokenize

In [117]:
from nltk.tokenize import word_tokenize

In [118]:
data = " Hello my name is yogesh pant"

In [121]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [122]:
tokenized_word = word_tokenize(data)

In [123]:
print(tokenized_word)

['Hello', 'my', 'name', 'is', 'yogesh', 'pant']


## Sentence tokenizer

In [124]:
from nltk.tokenize import sent_tokenize

In [125]:
sentences = """Lorem Ipsum Generator is a simple,
free online tool that can help you create dummy text strings for any use.
Whether you're a designer or a software developer, this tool can generate text easily. 
Just choose how many words/sentences/paragraphs you need and press 'Generate'."""

In [126]:
tokenized_sent = sent_tokenize(sentences)

In [127]:
print(tokenized_sent)

['Lorem Ipsum Generator is a simple,\nfree online tool that can help you create dummy text strings for any use.', "Whether you're a designer or a software developer, this tool can generate text easily.", "Just choose how many words/sentences/paragraphs you need and press 'Generate'."]
