In [1]:
# Importing the libraries
import html
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Load the tweet emotion data set from the CSV file in the working directory
train_df = pd.read_csv("text_emotion.csv")

# The "content" column holds the tweet text written by each author

In [3]:
# Preview the first ten rows of the training data set
train_df.head(10)

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ..."
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?


In [4]:
# Training data set summary: row count, columns, non-null counts and dtypes
print("Training Data Set Info - Total Rows | Total Columns | Total Null Values")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
train_df.info()

Training Data Set Info - Total Rows | Total Columns | Total Null Values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   author     40000 non-null  object
 3   content    40000 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.2+ MB
None


In the dataset above we can see that the tweet text is not properly structured, so we have to structure it for better analysis. We first need to
clean the content (tweets): remove unwanted words, replace misspelled words with the correct ones, and replace abbreviations
with their full words.


# Data processing & cleaning

* Step A : Converting html entities
* Step B : Removing "@user" from all the content(tweets)
* Step C : Changing all the content(tweets) into lowercase
* Step D : Apostrophe Lookup
* Step E : Short Word Lookup
* Step F : Replacing Punctuations with space
* Step G : Replacing Special Characters with space
* Step H : Replacing Numbers (integers) with space
* Step I : Removing words whose length is 1
* Step J : Spelling Correction - With TextBlob Library

### Step A : Converting html entities

In [5]:
# Print a short description of Step A (HTML entity conversion)
print("""Step A : Converting html entities i.e. (&lt; &gt; &amp;)
( "&lt;" is converted to “<” and "&amp;" is converted to “&”)""")

Step A : Converting html entities i.e. (&lt; &gt; &amp;)
( "&lt;" is converted to “<” and "&amp;" is converted to “&”)


In [6]:
# Import HTMLParser and create a parser instance for decoding HTML entities.
# NOTE(review): HTMLParser.unescape() was deprecated in Python 3.4 and removed
# in Python 3.9 (html.unescape() is the replacement), so code that calls
# html_parser.unescape() only runs on older interpreters.
from html.parser import HTMLParser
html_parser = HTMLParser()

In [7]:
# Create a new column, clean_content, holding the tweet text with HTML
# entities decoded (e.g. "&amp;" -> "&"). html.unescape is used because
# HTMLParser.unescape() no longer exists on Python 3.9 and later.
train_df['clean_content'] = train_df['content'].apply(html.unescape)
train_df.head(10)

Unnamed: 0,tweet_id,sentiment,author,content,clean_content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,@dannycastillo We want to trade with someone w...
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,Re-pinging @ghostridah14: why didn't you go to...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...","I should be sleep, but im not! thinking about ..."
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down,Hmmm. http://www.djhero.com/ is down
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you,@charviray Charlene my love. I miss you
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?,@kelcouch I'm sorry at least it's Friday?


### Step B : Removing "@user" from all the content

In [8]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression whose matches are deleted.

    Returns
    -------
    str
        The text with all matches replaced by the empty string.
    """
    # Substitute with the pattern itself. Re-using each matched string as a new
    # regex raised on matches containing regex metacharacters and let a short
    # match such as "@user" strip the start of a longer one such as "@user2".
    return re.sub(pattern, '', input_txt)

In [9]:
# Remove Twitter handles (@user) from every tweet.
# The pattern is a raw string so that "\w" reaches the regex engine without
# triggering Python's invalid-escape-sequence warning, and Series.apply is used
# instead of np.vectorize, which raises on empty input unless otypes is set.
train_df['clean_content'] = train_df['clean_content'].apply(remove_pattern, args=(r"@[\w]*",))
train_df.head(10)

Unnamed: 0,tweet_id,sentiment,author,content,clean_content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,i know i was listenin to bad habit earlier a...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,We want to trade with someone who has Houston...
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,Re-pinging : why didn't you go to prom? BC my ...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...","I should be sleep, but im not! thinking about ..."
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down,Hmmm. http://www.djhero.com/ is down
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you,Charlene my love. I miss you
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?,I'm sorry at least it's Friday?


### Step C : Changing all the content into lowercase

In [10]:
# Lower-case every tweet with the vectorised string accessor; unlike the
# per-element lambda it also passes missing values through instead of raising.
train_df['clean_content'] = train_df['clean_content'].str.lower()
train_df.head(10)

Unnamed: 0,tweet_id,sentiment,author,content,clean_content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,i know i was listenin to bad habit earlier a...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants to hang out with friends soon!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,we want to trade with someone who has houston...
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,re-pinging : why didn't you go to prom? bc my ...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...","i should be sleep, but im not! thinking about ..."
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down,hmmm. http://www.djhero.com/ is down
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you,charlene my love. i miss you
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?,i'm sorry at least it's friday?


### Step D : Apostrophe Lookup

In [11]:
# Contraction -> expansion lookup table (keys are lower-case contractions).
# Values containing " / " list alternative expansions in a single string.
apostrophe_lookup_dict = {
    "ain't": "am not / are not",
    "aren't": "are not / am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he shall / he will",
    "he'll've": "he shall have / he will have",
    "he's": "he has / he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has / how is",
    "i'd": "I had / I would",
    "i'd've": "I would have",
    "i'll": "I shall / I will",
    "i'll've": "I shall have / I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it shall have / it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she shall / she will",
    "she'll've": "she shall have / she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they shall / they will",
    "they'll've": "they shall have / they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what'll've": "what shall have / what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who shall / who will",
    "who'll've": "who shall have / who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you shall / you will",
    "you'll've": "you shall have / you will have",
    "you're": "you are",
    "you've": "you have",
}
apostrophe_lookup_dict

{"ain't": 'am not / are not',
 "aren't": 'are not / am not',
 "can't": 'cannot',
 "can't've": 'cannot have',
 "'cause": 'because',
 "could've": 'could have',
 "couldn't": 'could not',
 "couldn't've": 'could not have',
 "didn't": 'did not',
 "doesn't": 'does not',
 "don't": 'do not',
 "hadn't": 'had not',
 "hadn't've": 'had not have',
 "hasn't": 'has not',
 "haven't": 'have not',
 "he'd": 'he had / he would',
 "he'd've": 'he would have',
 "he'll": 'he shall / he will',
 "he'll've": 'he shall have / he will have',
 "he's": 'he has / he is',
 "how'd": 'how did',
 "how'd'y": 'how do you',
 "how'll": 'how will',
 "how's": 'how has / how is',
 "i'd": 'I had / I would',
 "i'd've": 'I would have',
 "i'll": 'I shall / I will',
 "i'll've": 'I shall have / I will have',
 "i'm": 'I am',
 "i've": 'I have',
 "isn't": 'is not',
 "it'd": 'it had / it would',
 "it'd've": 'it would have',
 "it'll": 'it shall / it will',
 "it'll've": 'it shall have / it will have',
 "it's": 'it has / it is',
 "let's": 'l

In [12]:
def lookup_dict(text, dictionary):
    """Replace whole words of ``text`` that appear as keys in ``dictionary``.

    A "word" is a maximal run of non-whitespace characters, so a token with
    punctuation attached (e.g. ``"it's?"``) is only replaced when that exact
    token is a key. Keys are looked up with the lower-cased token.

    Parameters
    ----------
    text : str
        Text to process.
    dictionary : dict
        Maps lower-case words to their replacement strings.

    Returns
    -------
    str
        The text with matching words replaced; the original whitespace is
        kept as-is.
    """
    def _expand(match):
        token = match.group(0)
        return dictionary.get(token.lower(), token)

    # Substitute token by token. str.replace() on the whole text also rewrote
    # the word wherever it occurred inside other words, e.g. expanding "u"
    # turned "funny" into "fyounny".
    return re.sub(r'\S+', _expand, text)

In [13]:
# Expand contractions in every cleaned tweet using the apostrophe dictionary
train_df['clean_content'] = train_df['clean_content'].apply(
    lookup_dict, args=(apostrophe_lookup_dict,)
)
train_df.head(10)

Unnamed: 0,tweet_id,sentiment,author,content,clean_content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,i know i was listenin to bad habit earlier a...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants to hang out with friends soon!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,we want to trade with someone who has houston...
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,re-pinging : why did not you go to prom? bc my...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...","i should be sleep, but im not! thinking about ..."
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down,hmmm. http://www.djhero.com/ is down
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you,charlene my love. i miss you
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?,I am sorry at least it has / it is friday?


### Step E : Short Word Lookup

In [14]:
# Chat abbreviation -> full phrase lookup table (keys are lower-case)
short_word_lookup_dict = {
    "121": "one to one",
    "a/s/l": "age, sex, location",
    "adn": "any day now",
    "afaik": "as far as I know",
    "afk": "away from keyboard",
    "aight": "alright",
    "alol": "actually laughing out loud",
    "b4": "before",
    "b4n": "bye for now",
    "bak": "back at the keyboard",
    "bf": "boyfriend",
    "bff": "best friends forever",
    "bfn": "bye for now",
    "bg": "big grin",
    "bta": "but then again",
    "btw": "by the way",
    "cid": "crying in disgrace",
    "cnp": "continued in my next post",
    "cp": "chat post",
    "cu": "see you",
    "cul": "see you later",
    "cul8r": "see you later",
    "cya": "bye",
    "cyo": "see you online",
    "dbau": "doing business as usual",
    "fud": "fear, uncertainty, and doubt",
    "fwiw": "for what it's worth",
    "fyi": "for your information",
    "g": "grin",
    "g2g": "got to go",
    "ga": "go ahead",
    "gal": "get a life",
    "gf": "girlfriend",
    "gfn": "gone for now",
    "gmbo": "giggling my butt off",
    "gmta": "great minds think alike",
    "h8": "hate",
    "hagn": "have a good night",
    "hdop": "help delete online predators",
    "hhis": "hanging head in shame",
    "iac": "in any case",
    "ianal": "I am not a lawyer",
    "ic": "I see",
    "idk": "I don't know",
    "imao": "in my arrogant opinion",
    "imnsho": "in my not so humble opinion",
    "imo": "in my opinion",
    "iow": "in other words",
    "ipn": "I’m posting naked",
    "irl": "in real life",
    "jk": "just kidding",
    "l8r": "later",
    "ld": "later, dude",
    "ldr": "long distance relationship",
    "llta": "lots and lots of thunderous applause",
    "lmao": "laugh my ass off",
    "lmirl": "let's meet in real life",
    "lol": "laugh out loud",
    "ltr": "longterm relationship",
    "lulab": "love you like a brother",
    "lulas": "love you like a sister",
    "luv": "love",
    "m/f": "male or female",
    "m8": "mate",
    "milf": "mother I would like to fuck",
    "oll": "online love",
    "omg": "oh my god",
    "otoh": "on the other hand",
    "pir": "parent in room",
    "ppl": "people",
    "r": "are",
    "rofl": "roll on the floor laughing",
    "rpg": "role playing games",
    "ru": "are you",
    "shid": "slaps head in disgust",
    "somy": "sick of me yet",
    "sot": "short of time",
    "thanx": "thanks",
    "thx": "thanks",
    "ttyl": "talk to you later",
    "u": "you",
    "ur": "you are",
    "uw": "you’re welcome",
    "wb": "welcome back",
    "wfm": "works for me",
    "wibni": "wouldn't it be nice if",
    "wtf": "what the fuck",
    "wtg": "way to go",
    "wtgp": "want to go private",
    "ym": "young man",
    "gr8": "great",
}
short_word_lookup_dict

{'121': 'one to one',
 'a/s/l': 'age, sex, location',
 'adn': 'any day now',
 'afaik': 'as far as I know',
 'afk': 'away from keyboard',
 'aight': 'alright',
 'alol': 'actually laughing out loud',
 'b4': 'before',
 'b4n': 'bye for now',
 'bak': 'back at the keyboard',
 'bf': 'boyfriend',
 'bff': 'best friends forever',
 'bfn': 'bye for now',
 'bg': 'big grin',
 'bta': 'but then again',
 'btw': 'by the way',
 'cid': 'crying in disgrace',
 'cnp': 'continued in my next post',
 'cp': 'chat post',
 'cu': 'see you',
 'cul': 'see you later',
 'cul8r': 'see you later',
 'cya': 'bye',
 'cyo': 'see you online',
 'dbau': 'doing business as usual',
 'fud': 'fear, uncertainty, and doubt',
 'fwiw': "for what it's worth",
 'fyi': 'for your information',
 'g': 'grin',
 'g2g': 'got to go',
 'ga': 'go ahead',
 'gal': 'get a life',
 'gf': 'girlfriend',
 'gfn': 'gone for now',
 'gmbo': 'giggling my butt off',
 'gmta': 'great minds think alike',
 'h8': 'hate',
 'hagn': 'have a good night',
 'hdop': 'help d

In [15]:
# Expand chat abbreviations in every cleaned tweet using the short-word dictionary
train_df['clean_content'] = train_df['clean_content'].apply(
    lookup_dict, args=(short_word_lookup_dict,)
)
train_df.head(10)

Unnamed: 0,tweet_id,sentiment,author,content,clean_content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,i know i was listenin to bad habit earlier a...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants to hang out with friends soon!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,we want to trade with someone who has houston...
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,re-pinging : why did not you go to prom? bc my...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...","i should be sleep, but im not! thinking about ..."
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down,hmmm. http://www.djhero.com/ is down
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you,charlene my love. i miss you
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?,I am sorry at least it has / it is friday?


### Step F : Replacing Punctuations with space

In [16]:
# Turn every character that is neither a word character nor whitespace into a space
train_df['clean_content'] = train_df['clean_content'].map(
    lambda tweet: re.sub(r'[^\w\s]', ' ', tweet)
)
train_df.head(10)

Unnamed: 0,tweet_id,sentiment,author,content,clean_content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,i know i was listenin to bad habit earlier a...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,layin n bed with a headache ughhhh waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants to hang out with friends soon
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,we want to trade with someone who has houston...
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,re pinging why did not you go to prom bc my...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...",i should be sleep but im not thinking about ...
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down,hmmm http www djhero com is down
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you,charlene my love i miss you
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?,I am sorry at least it has it is friday


### Step G : Replacing special characters with space

In [17]:
# Turn every character other than ASCII letters and digits into a space
train_df['clean_content'] = train_df['clean_content'].map(
    lambda tweet: re.sub(r'[^a-zA-Z0-9]', ' ', tweet)
)
train_df.head(10)

Unnamed: 0,tweet_id,sentiment,author,content,clean_content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,i know i was listenin to bad habit earlier a...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,layin n bed with a headache ughhhh waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants to hang out with friends soon
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,we want to trade with someone who has houston...
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,re pinging why did not you go to prom bc my...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...",i should be sleep but im not thinking about ...
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down,hmmm http www djhero com is down
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you,charlene my love i miss you
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?,I am sorry at least it has it is friday


### Step H : Replacing numbers(integers) with space

In [18]:
# Turn every character that is not an ASCII letter (digits included) into a space
train_df['clean_content'] = train_df['clean_content'].map(
    lambda tweet: re.sub(r'[^a-zA-Z]', ' ', tweet)
)
train_df.head(10)

Unnamed: 0,tweet_id,sentiment,author,content,clean_content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,i know i was listenin to bad habit earlier a...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,layin n bed with a headache ughhhh waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants to hang out with friends soon
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,we want to trade with someone who has houston...
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,re pinging why did not you go to prom bc my...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...",i should be sleep but im not thinking about ...
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down,hmmm http www djhero com is down
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you,charlene my love i miss you
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?,I am sorry at least it has it is friday


### Step I : Removing words whose length is 1

In [24]:
# Drop one-character words and re-join the remaining words with single spaces
train_df['clean_content'] = train_df['clean_content'].map(
    lambda tweet: ' '.join(token for token in tweet.split() if len(token) > 1)
)
train_df['clean_content'].iloc[0:5]

0    know was listenin to bad habit earlier and sta...
1    layin bed with headache ughhhh waitin on your ...
2                       funeral ceremony gloomy friday
3                  wants to hang out with friends soon
4    we want to trade with someone who has houston ...
Name: clean_content, dtype: object

### Step J : Spelling Correction - With TextBlob Library

In [25]:
# TextBlob offers a built-in spelling-correction feature
# It is demonstrated below on a small sample of the cleaned tweets
from textblob import TextBlob

In [26]:
# Spell-correct only the first five cleaned tweets; the result is kept in
# `text` and train_df itself is left unchanged.
text = train_df['clean_content'].iloc[0:5].map(
    lambda tweet: str(TextBlob(tweet).correct())
)
text

0    know was listening to bad habit earlier and st...
1    laying bed with headache ughhhh waiting on you...
2                       funeral ceremony gloomy friday
3                  wants to hang out with friends soon
4    we want to trade with someone who has houston ...
Name: clean_content, dtype: object

In [29]:
# Importing stop words from nltk.corpus
from nltk.corpus import stopwords
# Importing word_tokenize from nltk.tokenize
from nltk.tokenize import word_tokenize

In [31]:
# nltk is already imported in the first cell; repeating the import is harmless.
import nltk
# Download the 'punkt' tokenizer data into the local nltk_data directory
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [32]:
# Split each cleaned tweet into a list of word tokens with NLTK
train_df['content_token'] = train_df['clean_content'].map(word_tokenize)

In [33]:
# Fully formatted content and its tokens
train_df.head(10)

Unnamed: 0,tweet_id,sentiment,author,content,clean_content,content_token
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,know was listenin to bad habit earlier and sta...,"[know, was, listenin, to, bad, habit, earlier,..."
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,layin bed with headache ughhhh waitin on your ...,"[layin, bed, with, headache, ughhhh, waitin, o..."
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday,"[funeral, ceremony, gloomy, friday]"
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants to hang out with friends soon,"[wants, to, hang, out, with, friends, soon]"
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,we want to trade with someone who has houston ...,"[we, want, to, trade, with, someone, who, has,..."
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...,re pinging why did not you go to prom bc my bo...,"[re, pinging, why, did, not, you, go, to, prom..."
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ...",should be sleep but im not thinking about an o...,"[should, be, sleep, but, im, not, thinking, ab..."
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down,hmmm http www djhero com is down,"[hmmm, http, www, djhero, com, is, down]"
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you,charlene my love miss you,"[charlene, my, love, miss, you]"
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?,am sorry at least it has it is friday,"[am, sorry, at, least, it, has, it, is, friday]"


In [35]:
# nltk is already imported in the first cell; repeating the import is harmless.
import nltk
# Download the 'stopwords' corpus into the local nltk_data directory
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [36]:
# Load the English stop words from the NLTK corpus into a set
# (the filtering cell below tests each token for membership in it)
stop_words = set(stopwords.words('english'))

In [37]:
# Add a token column from which the English stop words have been filtered out
train_df['content_token_filtered'] = train_df['content_token'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

# Token lists before and after stop-word removal, side by side
train_df[['content_token', 'content_token_filtered']].head(5)

Unnamed: 0,content_token,content_token_filtered
0,"[know, was, listenin, to, bad, habit, earlier,...","[know, listenin, bad, habit, earlier, started,..."
1,"[layin, bed, with, headache, ughhhh, waitin, o...","[layin, bed, headache, ughhhh, waitin, call]"
2,"[funeral, ceremony, gloomy, friday]","[funeral, ceremony, gloomy, friday]"
3,"[wants, to, hang, out, with, friends, soon]","[wants, hang, friends, soon]"
4,"[we, want, to, trade, with, someone, who, has,...","[want, trade, someone, houston, tickets, one]"
