# DATA PREPROCESSING 

In [1]:
import pandas as pd
import numpy as np

## Data Exploration 

### Here I just visualize the data to understand it better. 

In [2]:
data_identification_df = pd.read_csv("data_identification.csv")
data_identification_df.head()

Unnamed: 0,tweet_id,identification
0,0x28cc61,test
1,0x29e452,train
2,0x2b3819,train
3,0x2db41f,test
4,0x2a2acc,train


In [3]:
emotions_df = pd.read_csv("emotion.csv")
emotions_df.head()

Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation


In [4]:
tweets_df = pd.read_json('tweets_DM.json',lines=True)

In [5]:
tweets_df.head()

Unnamed: 0,_score,_index,_source,_crawldate,_type
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets
2,232,hashtag_tweets,"{'tweet': {'hashtags': ['bibleverse'], 'tweet_...",2017-12-25 04:39:20,tweets
3,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets
4,989,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2de2...",2016-01-08 17:18:59,tweets


## Tweets DF Exploration/Feature Creation

In [10]:
tweets_df.head()

Unnamed: 0,_score,_index,_source,_crawldate,_type
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets
2,232,hashtag_tweets,"{'tweet': {'hashtags': ['bibleverse'], 'tweet_...",2017-12-25 04:39:20,tweets
3,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets
4,989,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2de2...",2016-01-08 17:18:59,tweets


In [11]:
unique_tweets = tweets_df["_score"].nunique()
(unique_tweets)

1024

#### Create a column in "tweets_df" with the extracted tweet_id from _source column. We can treat this as a dictionary within a dictionary:

In [12]:
tweets_df.iloc[5000]['_source']

{'tweet': {'hashtags': ['beer', 'benjaminfranklin'],
  'tweet_id': '0x2ff708',
  'text': '"Beer is proof that God loves us and wants us to be happy" - Benjamin Franklin  #beer <LH> #benjaminfranklin'}}

#### The only key of the large dictionary of every row in _source is 'tweet'. From here we get a subdictionary, with keys hashtags, tweet_id, and text. After some testing to see if the idea works, then I created 2 definitions to get the text and the tweet ID.

In [13]:
one_row = tweets_df.iloc[5000]['_source']  #grab only 1 row to test
new_dict = one_row['tweet']  

#Use the subkey found, tweet_id, to get the feature needed:
print(new_dict['tweet_id'])

#---------------------------------------------------------------

#def to get tweet_id from dictionary within a dictionary:
def get_tweet_id(row):
    return row['tweet']['tweet_id']

#def to get text from dictionary within a dictionary:
def get_text(row):
    return row['tweet']['text']

0x2ff708


#### Here I append the new extracted features from the [_source] column.

In [14]:
tweets_df['tweet_id'] = tweets_df._source.apply(lambda x: get_tweet_id(x))
tweets_df['text'] = tweets_df._source.apply(lambda x: get_text(x))
tweets_df

Unnamed: 0,_score,_index,_source,_crawldate,_type,tweet_id,text
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets,0x376b20,"People who post ""add me on #Snapchat"" must be ..."
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets,0x2d5350,"@brianklaas As we see, Trump is dangerous to #..."
2,232,hashtag_tweets,"{'tweet': {'hashtags': ['bibleverse'], 'tweet_...",2017-12-25 04:39:20,tweets,0x28b412,"Confident of your obedience, I write to you, k..."
3,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>
4,989,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2de2...",2016-01-08 17:18:59,tweets,0x2de201,"""Trust is not the same as faith. A friend is s..."
...,...,...,...,...,...,...,...
1867530,827,hashtag_tweets,"{'tweet': {'hashtags': ['mixedfeeling', 'butim...",2015-05-12 12:51:52,tweets,0x316b80,When you buy the last 2 tickets remaining for ...
1867531,368,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x29d0...",2017-10-02 17:54:04,tweets,0x29d0cb,I swear all this hard work gone pay off one da...
1867532,498,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2a6a...",2016-10-10 11:04:32,tweets,0x2a6a4f,@Parcel2Go no card left when I wasn't in so I ...
1867533,840,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x24fa...",2016-09-02 14:25:06,tweets,0x24faed,"Ah, corporate life, where you can date <LH> us..."


#### Since not all the data is relevant (_index, _source, _crawldate, _type are NOT), I created a new Dataframe with only the relevant columns.

In [15]:
twitter_data = tweets_df[["_score","tweet_id","text"]]

In [16]:
twitter_data

Unnamed: 0,_score,tweet_id,text
0,391,0x376b20,"People who post ""add me on #Snapchat"" must be ..."
1,433,0x2d5350,"@brianklaas As we see, Trump is dangerous to #..."
2,232,0x28b412,"Confident of your obedience, I write to you, k..."
3,376,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>
4,989,0x2de201,"""Trust is not the same as faith. A friend is s..."
...,...,...,...
1867530,827,0x316b80,When you buy the last 2 tickets remaining for ...
1867531,368,0x29d0cb,I swear all this hard work gone pay off one da...
1867532,498,0x2a6a4f,@Parcel2Go no card left when I wasn't in so I ...
1867533,840,0x24faed,"Ah, corporate life, where you can date <LH> us..."


# Joining tables 

#### Joining with data_identification_df. With the new dataset that we obtained, now we join first with the data_identification_df:

In [17]:
twitter_data = twitter_data.merge(data_identification_df, how='inner')

#### Unique twitter IDs are equal to length of dataset so, no 2 IDs repeat:

In [18]:
print(len(twitter_data), twitter_data.tweet_id.nunique())

1867535 1867535


In [19]:
twitter_data.identification.unique()

array(['train', 'test'], dtype=object)

In [20]:
twitter_data.head()

Unnamed: 0,_score,tweet_id,text,identification
0,391,0x376b20,"People who post ""add me on #Snapchat"" must be ...",train
1,433,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train
2,232,0x28b412,"Confident of your obedience, I write to you, k...",test
3,376,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train
4,989,0x2de201,"""Trust is not the same as faith. A friend is s...",test


## Dividing into train and test:

#### This dataset will be use to further divide it intro another training, validation, and test data.

In [21]:
twitter_train = twitter_data[twitter_data['identification']== "train"].reset_index()
twitter_train

Unnamed: 0,index,_score,tweet_id,text,identification
0,0,391,0x376b20,"People who post ""add me on #Snapchat"" must be ...",train
1,1,433,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train
2,3,376,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train
3,5,120,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,train
4,6,1021,0x2c91a8,Still waiting on those supplies Liscus. <LH>,train
...,...,...,...,...,...
1455558,1867526,94,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,train
1455559,1867527,627,0x38959e,In every circumtance I'd like to be thankful t...,train
1455560,1867528,274,0x2cbca6,there's currently two girls walking around the...,train
1455561,1867533,840,0x24faed,"Ah, corporate life, where you can date <LH> us...",train


#### This one (Testing Data) will only be used to predict:

In [22]:
twitter_test = twitter_data[twitter_data['identification']== "test"].reset_index()
twitter_test = twitter_test.drop("index", axis = 1)
twitter_test

Unnamed: 0,_score,tweet_id,text,identification
0,232,0x28b412,"Confident of your obedience, I write to you, k...",test
1,989,0x2de201,"""Trust is not the same as faith. A friend is s...",test
2,66,0x218443,When do you have enough ? When are you satisfi...,test
3,104,0x2939d5,"God woke you up, now chase the day #GodsPlan #...",test
4,310,0x26289a,"In these tough times, who do YOU turn to as yo...",test
...,...,...,...,...
411967,602,0x2913b4,"""For this is the message that ye heard from th...",test
411968,598,0x2a980e,"""There is a lad here, which hath five barley l...",test
411969,827,0x316b80,When you buy the last 2 tickets remaining for ...,test
411970,368,0x29d0cb,I swear all this hard work gone pay off one da...,test


#### Since we are still missing the "target", here I join the resulting "twitter_train" with the emotions_df, and drop the extra index.

In [23]:
final_twitter_df = twitter_train.merge(emotions_df, how="inner")
final_twitter_df = final_twitter_df.drop("index",axis = 1) 

In [24]:
final_twitter_df
# 1,455,563 rows

Unnamed: 0,_score,tweet_id,text,identification,emotion
0,391,0x376b20,"People who post ""add me on #Snapchat"" must be ...",train,anticipation
1,433,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train,sadness
2,376,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train,fear
3,120,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,train,joy
4,1021,0x2c91a8,Still waiting on those supplies Liscus. <LH>,train,anticipation
...,...,...,...,...,...
1455558,94,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,train,joy
1455559,627,0x38959e,In every circumtance I'd like to be thankful t...,train,joy
1455560,274,0x2cbca6,there's currently two girls walking around the...,train,joy
1455561,840,0x24faed,"Ah, corporate life, where you can date <LH> us...",train,joy


#### 8 final unique emotions:

In [25]:
final_twitter_df.emotion.unique()

array(['anticipation', 'sadness', 'fear', 'joy', 'anger', 'trust',
       'disgust', 'surprise'], dtype=object)

## Deleting duplicated values

#### Another important preprocessing task is to delete duplicated values. Do we have duplicated values?

In [26]:
#Duplicated values:
print(sum(final_twitter_df.text.duplicated()))
print(sum(final_twitter_df.tweet_id.duplicated()))

3785
0


#### Count the table before deleting duplicates:

In [27]:
before_deleting_duplicates = len(final_twitter_df)
before_deleting_duplicates

1455563

#### These are the tweets that most likely were sent twice by the user by accident (Duplicated):

In [28]:
final_twitter_df[final_twitter_df.duplicated('text')].sort_values('text',axis=0)

Unnamed: 0,_score,tweet_id,text,identification,emotion
1238903,1018,0x2dde6b,"""A coward is incapable of exhibiting love; it ...",train,joy
1028251,899,0x2909e1,"""A family can develop only with a loving woman...",train,joy
397639,954,0x272113,"""A poem begins as a lump in the throat, a sens...",train,joy
590230,895,0x236961,"""A woman can be anything the man who loves her...",train,joy
1346054,682,0x22a8b0,"""Absence - that common cure of love."" <LH> <LH...",train,joy
...,...,...,...,...,...
1338823,577,0x338bed,“And he awoke and rebuked the wind and said to...,train,joy
747335,524,0x2e030b,“Everyone prays in the end” <LH> @samsmithworld,train,anticipation
1267175,565,0x24ca3c,"“Give thanks to the Lord, for he is good; his ...",train,trust
672023,131,0x22975e,"☀️Good Morning World ☀️ Rise-N-Grind, it's⌚to ...",train,trust


#### For example:

In [29]:
final_twitter_df.iloc[1238903]['text'], final_twitter_df.iloc[597627]['text']

('"A coward is incapable of exhibiting love; it is the prerogative of the brave." <LH> <LH> <LH> <LH>',
 '"A coward is incapable of exhibiting love; it is the prerogative of the brave." <LH> <LH> <LH> <LH>')

In [30]:
final_twitter_df.drop_duplicates(subset ="text",keep= "first", inplace = True)
print("Duplicates deleted: ", before_deleting_duplicates - len(final_twitter_df))

Duplicates deleted:  3785


In [31]:
final_twitter_df.reset_index(drop=True, inplace=True)

In [32]:
final_twitter_df.columns

Index(['_score', 'tweet_id', 'text', 'identification', 'emotion'], dtype='object')

#### Final Twitter DF (obtained until now)

In [33]:
final_twitter_df

Unnamed: 0,_score,tweet_id,text,identification,emotion
0,391,0x376b20,"People who post ""add me on #Snapchat"" must be ...",train,anticipation
1,433,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train,sadness
2,376,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train,fear
3,120,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,train,joy
4,1021,0x2c91a8,Still waiting on those supplies Liscus. <LH>,train,anticipation
...,...,...,...,...,...
1451773,94,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,train,joy
1451774,627,0x38959e,In every circumtance I'd like to be thankful t...,train,joy
1451775,274,0x2cbca6,there's currently two girls walking around the...,train,joy
1451776,840,0x24faed,"Ah, corporate life, where you can date <LH> us...",train,joy


## Cleaning Text

#### Some futher preprocessing to be done is to take out unnecesary characters in the text, such as special characters.

In [34]:
import re
import string

In [36]:
def clean(text):
    text = text.lower()
    text = text.replace('<lh>',"")
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\n', '', text)
    return text

#### Here i applied the clean() function:

In [37]:
final_twitter_df.text = final_twitter_df.text.apply(clean)
final_twitter_df.iloc[0].text

'people who post add me on snapchat must be dehydrated cuz man thats '

#### Some rows only had special characters so now they are left empty.

In [38]:
final_twitter_df[final_twitter_df.text == ""]

Unnamed: 0,_score,tweet_id,text,identification,emotion
2173,234,0x260518,,train,joy
4157,215,0x38cea2,,train,trust
6598,970,0x255680,,train,joy
13234,332,0x343e48,,train,joy
15604,38,0x1ea04a,,train,joy
...,...,...,...,...,...
1423576,901,0x2f5326,,train,joy
1426238,309,0x332e48,,train,trust
1427308,804,0x2bff95,,train,joy
1434859,954,0x2cfe7f,,train,joy


#### Now i delete this empty new rows:

In [39]:
i = final_twitter_df[final_twitter_df.text == ""].index
final_twitter_df.drop(i, axis = 0, inplace = True)
final_twitter_df = final_twitter_df.reset_index()
final_twitter_df.drop(['index'],axis = 1, inplace = True)
final_twitter_df

Unnamed: 0,_score,tweet_id,text,identification,emotion
0,391,0x376b20,people who post add me on snapchat must be deh...,train,anticipation
1,433,0x2d5350,brianklaas as we see trump is dangerous to fre...,train,sadness
2,376,0x1cd5b0,now issa is stalking tasha 😂😂😂,train,fear
3,120,0x1d755c,riskshow thekevinallison thx for the best time...,train,joy
4,1021,0x2c91a8,still waiting on those supplies liscus,train,anticipation
...,...,...,...,...,...
1451432,94,0x321566,im so happy nowonder the name of this show hap...,train,joy
1451433,627,0x38959e,in every circumtance id like to be thankful to...,train,joy
1451434,274,0x2cbca6,theres currently two girls walking around the ...,train,joy
1451435,840,0x24faed,ah corporate life where you can date using ju...,train,joy


#### Stemmer

#### Besides cleaning the Text, i also applied a stemmer to reduce the amount of words. I created a new feature to preserve the original text with every word (run, running, ran, e.g.) if in case it is still needed.

In [40]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def stemming(sentence):
    ps = PorterStemmer()
    words = word_tokenize(sentence)
    res = []
    for w in words:
        res.append(ps.stem(w))
    return ' '.join(res)

In [41]:
%%time
final_twitter_df['text_stemmed'] = final_twitter_df.text.apply(stemming)
final_twitter_df

Wall time: 18min 56s


Unnamed: 0,_score,tweet_id,text,identification,emotion,text_stemmed
0,391,0x376b20,people who post add me on snapchat must be deh...,train,anticipation,peopl who post add me on snapchat must be dehy...
1,433,0x2d5350,brianklaas as we see trump is dangerous to fre...,train,sadness,brianklaa as we see trump is danger to freepre...
2,376,0x1cd5b0,now issa is stalking tasha 😂😂😂,train,fear,now issa is stalk tasha 😂😂😂
3,120,0x1d755c,riskshow thekevinallison thx for the best time...,train,joy,riskshow thekevinallison thx for the best time...
4,1021,0x2c91a8,still waiting on those supplies liscus,train,anticipation,still wait on those suppli liscu
...,...,...,...,...,...,...
1451432,94,0x321566,im so happy nowonder the name of this show hap...,train,joy,im so happi nowond the name of thi show happi ...
1451433,627,0x38959e,in every circumtance id like to be thankful to...,train,joy,in everi circumt id like to be thank to the al...
1451434,274,0x2cbca6,theres currently two girls walking around the ...,train,joy,there current two girl walk around the librari...
1451435,840,0x24faed,ah corporate life where you can date using ju...,train,joy,ah corpor life where you can date use just the...


#### Another Feature Creation I did, was to create categories (numerical based) on the 8 emotions available. This was because some models I tried needed the numbers as categories and not the text itself.

In [42]:
def categorize_emotions(emotion):
    if(emotion == "fear"):
        return 1
    if(emotion == "trust"):
        return 2
    if(emotion == "anger"):
        return 3
    if(emotion == "anticipation"):
        return 4
    if(emotion == "surprise"):
        return 5
    if(emotion == "disgust"):
        return 6
    if(emotion == "joy"):
        return 7
    if(emotion == "sadness"):
        return 8
    
final_twitter_df['Category'] = final_twitter_df.emotion.apply(lambda x: categorize_emotions(x))

In [43]:
final_twitter_df

Unnamed: 0,_score,tweet_id,text,identification,emotion,text_stemmed,Category
0,391,0x376b20,people who post add me on snapchat must be deh...,train,anticipation,peopl who post add me on snapchat must be dehy...,4
1,433,0x2d5350,brianklaas as we see trump is dangerous to fre...,train,sadness,brianklaa as we see trump is danger to freepre...,8
2,376,0x1cd5b0,now issa is stalking tasha 😂😂😂,train,fear,now issa is stalk tasha 😂😂😂,1
3,120,0x1d755c,riskshow thekevinallison thx for the best time...,train,joy,riskshow thekevinallison thx for the best time...,7
4,1021,0x2c91a8,still waiting on those supplies liscus,train,anticipation,still wait on those suppli liscu,4
...,...,...,...,...,...,...,...
1451432,94,0x321566,im so happy nowonder the name of this show hap...,train,joy,im so happi nowond the name of thi show happi ...,7
1451433,627,0x38959e,in every circumtance id like to be thankful to...,train,joy,in everi circumt id like to be thank to the al...,7
1451434,274,0x2cbca6,theres currently two girls walking around the ...,train,joy,there current two girl walk around the librari...,7
1451435,840,0x24faed,ah corporate life where you can date using ju...,train,joy,ah corpor life where you can date use just the...,7


#### Shuffle:
#### One of the last preprocessing tasks I performed was to shuffle the data, to avoid having 100 rows in a row which are "joys" (for example). This can mislead the models.

#### Transform the dataframe into arrays for shuffling:

In [44]:
unshuffle_data = final_twitter_df.iloc[:,:].to_numpy()

In [45]:
unshuffle_data

array([[391, '0x376b20',
        'people who post add me on snapchat must be dehydrated cuz man thats ',
        ..., 'anticipation',
        'peopl who post add me on snapchat must be dehydr cuz man that',
        4],
       [433, '0x2d5350',
        'brianklaas as we see trump is dangerous to freepress around the world what a   trumplegacy  cnn',
        ..., 'sadness',
        'brianklaa as we see trump is danger to freepress around the world what a trumplegaci cnn',
        8],
       [376, '0x1cd5b0', 'now issa is stalking tasha 😂😂😂 ', ..., 'fear',
        'now issa is stalk tasha 😂😂😂', 1],
       ...,
       [274, '0x2cbca6',
        'theres currently two girls walking around the library just handing out red bulls  blessyou',
        ..., 'joy',
        'there current two girl walk around the librari just hand out red bull blessyou',
        7],
       [840, '0x24faed',
        'ah corporate life where you can date  using just the relative anachronism of the last job title that u

In [46]:
final_twitter_df.text.shape[0] , unshuffle_data.shape[0]

(1451437, 1451437)

#### Shuffling:

In [47]:
first_shuffle_indices = np.arange(unshuffle_data.shape[0])
np.random.shuffle(first_shuffle_indices)

shuffle_data = unshuffle_data[first_shuffle_indices]
shuffle_data

array([[883, '0x292d69',
        'forever daddys little girl  when daddy goes looking for a loo in the mall for you even when youre  😅 ',
        ..., 'sadness',
        'forev daddi littl girl when daddi goe look for a loo in the mall for you even when your 😅',
        8],
       [391, '0x271dff',
        'the day before a big event and my team has everything complete before   goodjobteam ',
        ..., 'trust',
        'the day befor a big event and my team ha everyth complet befor goodjobteam',
        2],
       [507, '0x305ac3',
        'nikkinicolex i was going to do that this year 😂😩 ', ...,
        'sadness', 'nikkinicolex i wa go to do that thi year 😂😩', 8],
       ...,
       [564, '0x26cf91',
        ' if you are female  and over  dm for a great future     august   at ',
        ..., 'joy',
        'if you are femal and over dm for a great futur august at', 7],
       [353, '0x2c0ca1', 'tom priceisright gone ', ..., 'sadness',
        'tom priceisright gone', 8],
       [64

#### To return the array into a Dataframe:

In [48]:
final_twitter_df = pd.DataFrame(shuffle_data)
final_twitter_df.columns = ['_score', 'tweet_id', 'text', 'identification', 'emotion','text_stemmed','Category']
final_twitter_df

Unnamed: 0,_score,tweet_id,text,identification,emotion,text_stemmed,Category
0,883,0x292d69,forever daddys little girl when daddy goes lo...,train,sadness,forev daddi littl girl when daddi goe look for...,8
1,391,0x271dff,the day before a big event and my team has eve...,train,trust,the day befor a big event and my team ha every...,2
2,507,0x305ac3,nikkinicolex i was going to do that this year 😂😩,train,sadness,nikkinicolex i wa go to do that thi year 😂😩,8
3,560,0x2415c8,donaldtrump of obstruction of justice and i...,train,sadness,donaldtrump of obstruct of justic and interfer...,8
4,348,0x378a4c,is holy and just therefore he must hate and p...,train,anticipation,is holi and just therefor he must hate and pun...,4
...,...,...,...,...,...,...,...
1451432,237,0x326831,“what do you want to listen to babe” “i don’t...,train,sadness,“ what do you want to listen to babe ” “ i don...,8
1451433,294,0x2d0df5,shoutouts marvelspank onward and forward unloc...,train,joy,shoutout marvelspank onward and forward unlock...,7
1451434,564,0x26cf91,if you are female and over dm for a great f...,train,joy,if you are femal and over dm for a great futur...,7
1451435,353,0x2c0ca1,tom priceisright gone,train,sadness,tom priceisright gone,8


In [49]:
final_twitter_df[final_twitter_df.text_stemmed == ""]

Unnamed: 0,_score,tweet_id,text,identification,emotion,text_stemmed,Category
9558,387,0x3059cd,,train,joy,,7
16451,1009,0x298d27,,train,joy,,7
36660,146,0x2f0afd,,train,joy,,7
74907,899,0x372423,,train,joy,,7
131308,277,0x1f6397,,train,joy,,7
205430,526,0x1f1ba7,,train,anticipation,,4
218948,784,0x1e7cf5,,train,joy,,7
220297,164,0x36935d,,train,anticipation,,4
250434,857,0x2a7386,,train,sadness,,8
305456,855,0x3890a7,,train,joy,,7


In [50]:
i = final_twitter_df[final_twitter_df.text_stemmed == ""].index
final_twitter_df.drop(i, axis = 0, inplace = True)
final_twitter_df = final_twitter_df.reset_index()
final_twitter_df.drop(['index'],axis = 1, inplace = True)
final_twitter_df

Unnamed: 0,_score,tweet_id,text,identification,emotion,text_stemmed,Category
0,883,0x292d69,forever daddys little girl when daddy goes lo...,train,sadness,forev daddi littl girl when daddi goe look for...,8
1,391,0x271dff,the day before a big event and my team has eve...,train,trust,the day befor a big event and my team ha every...,2
2,507,0x305ac3,nikkinicolex i was going to do that this year 😂😩,train,sadness,nikkinicolex i wa go to do that thi year 😂😩,8
3,560,0x2415c8,donaldtrump of obstruction of justice and i...,train,sadness,donaldtrump of obstruct of justic and interfer...,8
4,348,0x378a4c,is holy and just therefore he must hate and p...,train,anticipation,is holi and just therefor he must hate and pun...,4
...,...,...,...,...,...,...,...
1451386,237,0x326831,“what do you want to listen to babe” “i don’t...,train,sadness,“ what do you want to listen to babe ” “ i don...,8
1451387,294,0x2d0df5,shoutouts marvelspank onward and forward unloc...,train,joy,shoutout marvelspank onward and forward unlock...,7
1451388,564,0x26cf91,if you are female and over dm for a great f...,train,joy,if you are femal and over dm for a great futur...,7
1451389,353,0x2c0ca1,tom priceisright gone,train,sadness,tom priceisright gone,8


#### Save the CSV before balancing:
#### Now that preprocessing has been done, I save the final dataframe to a CSV, to open it in another jupyter for applying different models.

# FINAL CSV
### This is the final csv that gave me the best result. This one is the one used in BOW+NN Test jupyter notebook

In [51]:
final_twitter_df.to_csv("final_dataset.csv",index=False)

#### Also save testing data into a CSV:

In [76]:
twitter_test.to_csv("twitter_test_data.csv",index=False)

# -------------------------------------------------------------

# -------------------------------------------------------------

# -------------------------------------------------------------

## From here on I did some other preprocessing for other tries that didn't improve the models, in fact, the models decrease accuracy!

#### Uploading another CSV but with only scores > 600. (Also tried with the lowest scores and made no difference)

In [52]:
final_twitter_df_200up = final_twitter_df[final_twitter_df["_score"]>=612]
final_twitter_df_200up

Unnamed: 0,_score,tweet_id,text,identification,emotion,text_stemmed,Category
0,883,0x292d69,forever daddys little girl when daddy goes lo...,train,sadness,forev daddi littl girl when daddi goe look for...,8
6,1016,0x2a6431,oh well another day of sitting in playing cod ...,train,sadness,oh well anoth day of sit in play cod all day,8
7,822,0x3047f3,pisstestme billybaldwin poor pete one of the l...,train,sadness,pisstestm billybaldwin poor pete one of the lo...,8
8,720,0x32d6aa,pelada is so worth watching do yourself a favo...,train,joy,pelada is so worth watch do yourself a favor a...,7
10,756,0x309d82,my mom is such a kid when it comes to keeping...,train,surprise,my mom is such a kid when it come to keep gift...,5
...,...,...,...,...,...,...,...
1451378,1011,0x2da6a2,heres my goal make you my one and only thulip,train,anticipation,here my goal make you my one and onli thulip,4
1451379,764,0x26b040,coachjeffleach wait or you on your phone drivi...,train,sadness,coachjeffleach wait or you on your phone drive...,8
1451380,822,0x2777f4,zukuofficial really am home alone n still you...,train,anger,zukuoffici realli am home alon n still youtub ...,3
1451382,1003,0x2008c8,we have to love like they’re going to be gone ...,train,joy,we have to love like they ’ re go to be gone t...,7


In [53]:
final_twitter_df_200up.to_csv("greater_than_some_score.csv",index=False)

## Balancing the dataset:

#### I also tried balancing the data so every emotion had the same amount of objects/rows, because theoretically it is better to have the data balanced so the models can equally learn to predict each emotion. In practice though, since this makes me eliminate so many rows (about 1 million), in this specific case it is not efficient to balance this dataset. The accuracy decreased when I did this.

In [54]:
emotions = final_twitter_df['emotion'].unique()
for emotion in emotions:
    number = len(final_twitter_df[final_twitter_df['emotion']==emotion])
    porcentage = (number/len(final_twitter_df))*100
    print(f"{emotion} has {number} of elements, making it {porcentage:.2f}% of the dataset")

sadness has 193177 of elements, making it 13.31% of the dataset
trust has 204861 of elements, making it 14.11% of the dataset
anticipation has 248675 of elements, making it 17.13% of the dataset
joy has 513947 of elements, making it 35.41% of the dataset
surprise has 48205 of elements, making it 3.32% of the dataset
disgust has 138994 of elements, making it 9.58% of the dataset
fear has 63820 of elements, making it 4.40% of the dataset
anger has 39712 of elements, making it 2.74% of the dataset


#### As we can see the dataset in very unbalanced. This could make our final model to be more precise for the higher emotions and to not predict correctly the other emotions.
#### Here the balancing begins, and we balance it based on the lowest emotion (anger):

In [55]:
shuffle_data[100][4]

'sadness'

In [56]:
lowest_number = 39712  #this corresponds to the lowest number of items for any given emotion (this case anger)

number_anger, number_anticipation, number_sadness, number_fear = 0, 0, 0, 0
number_joy, number_trust, number_disgust, number_surprise = 0, 0, 0, 0
indices_to_remove = []

for i in range(shuffle_data.shape[0]):
    #anticipation:
    if(shuffle_data[i][4]=='anticipation'):
        number_anticipation+= 1
        if(number_anticipation>lowest_number):
            indices_to_remove.append(i)
    #Sadness:
    if(shuffle_data[i][4]=='sadness'):
        number_sadness+= 1
        if(number_sadness>lowest_number):
            indices_to_remove.append(i)
    #Fear
    if(shuffle_data[i][4]=='fear'):
        number_fear+= 1
        if(number_fear>lowest_number):
            indices_to_remove.append(i)
    #Joy
    if(shuffle_data[i][4]=='joy'):
        number_joy+= 1
        if(number_joy>lowest_number):
            indices_to_remove.append(i)
    #Anger
    if(shuffle_data[i][4]=='anger'):
        number_anger+= 1
        if(number_anger>lowest_number):
            indices_to_remove.append(i)
    #Trust
    if(shuffle_data[i][4]=='trust'):
        number_trust+= 1
        if(number_trust>lowest_number):
            indices_to_remove.append(i)
    #Disgust:
    if(shuffle_data[i][4]=='disgust'):
        number_disgust+= 1
        if(number_disgust>lowest_number):
            indices_to_remove.append(i)
    #Surprise:
    if(shuffle_data[i][4]=='surprise'):
        number_surprise+= 1
        if(number_surprise>lowest_number):
            indices_to_remove.append(i)

#### Now delete the "extra" indices we found, to make every emotion equal 39712 elements.

In [57]:
scaled_data = np.delete(shuffle_data,indices_to_remove,axis=0)

In [58]:
len(scaled_data)

317696

#### Some reshuffling to guarantee randomness between rows and targets.

In [59]:
#Reshuffle the data:
shuffle_indices = np.arange(scaled_data.shape[0])
np.random.shuffle(shuffle_indices)

re_shuffled_data = scaled_data[shuffle_indices]

In [60]:
len(re_shuffled_data)

317696

In [61]:
re_shuffled_data

array([[375, '0x245ac3',
        'agar follwed back de h dia hai mane to dm ne ana farz nhi hai ap logo par  all new followers 🙏',
        ..., 'anticipation',
        'agar follw back de h dia hai mane to dm ne ana farz nhi hai ap logo par all new follow 🙏',
        4],
       [783, '0x225194',
        'just had this same convo about giving head as issarae  her girls i feel vindicated 😂😂  ',
        ..., 'fear',
        'just had thi same convo about give head as issara her girl i feel vindic 😂😂',
        1],
       [102, '0x3741f5',
        'you know what’s amusing getting caught up on twitter from the first couple of innings of the redsox game 🙃😂🙄 yeoflittlefaith ',
        ..., 'anticipation',
        'you know what ’ s amus get caught up on twitter from the first coupl of inning of the redsox game 🙃😂🙄 yeoflittlefaith',
        4],
       ...,
       [531, '0x1dc1b2',
        'for every minute you are  you lose  seconds of happiness', ...,
        'anger', 'for everi minut you are 

#### Return the data from an array to a Dataframe.

In [91]:
final_df = pd.DataFrame(re_shuffled_data)
final_df.columns = ['_score', 'tweet_id', 'text', 'identification', 'emotion','text_stemmed','Category']
final_df

Unnamed: 0,_score,tweet_id,text,identification,emotion,text_stemmed,Category
0,375,0x245ac3,agar follwed back de h dia hai mane to dm ne a...,train,anticipation,agar follw back de h dia hai mane to dm ne ana...,4
1,783,0x225194,just had this same convo about giving head as ...,train,fear,just had thi same convo about give head as iss...,1
2,102,0x3741f5,you know what’s amusing getting caught up on t...,train,anticipation,you know what ’ s amus get caught up on twitte...,4
3,82,0x20505c,conwaysean keep it up hopefully its not to hot...,train,joy,conwaysean keep it up hope it not to hot in 🇪🇸...,7
4,819,0x1cbb76,nigga i dont need no vitamin c i be smoking br...,train,trust,nigga i dont need no vitamin c i be smoke broc...,2
...,...,...,...,...,...,...,...
317691,856,0x2cc988,only the young bucks can superkick the spiri...,train,sadness,onli the young buck can superkick the spirit o...,8
317692,224,0x2c9bb6,jugzillavi i wish,train,anticipation,jugzillavi i wish,4
317693,531,0x1dc1b2,for every minute you are you lose seconds of...,train,anger,for everi minut you are you lose second of happi,3
317694,543,0x20d3fc,have a safe and happy thanksgiving,train,trust,have a safe and happi thanksgiv,2


#### Saving to CSV the final dataset (balanced). This was not ultimately used for the final upload.

In [92]:
final_df.to_csv("preprocessed_data.csv",index=False)