In [90]:
# Import data processing libraries
import pandas as pd
import numpy as np

#Import text processing libraries
import re
import urllib

# Import NLP libraries
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import contractions

# Import functions from local scripts
import sys
sys.path.insert(1, './scripts/development')

import scripts.development.preprocessing as pre

In [77]:
# Determine file encoding of dataset
# Only need to be run once

#get_file_encoding("../data/twitter16m.csv")

# Detector gave me utf-8 encoding, which gives an error, so I will use the value from the tutorial

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}

In [4]:
# Read the dataset from the csv file and check it
# Using names= to give meaningful names to columns
twitter_df = pd.read_csv("../data/twitter16m.csv", encoding="latin1", header=None, names=["sentiment", "id", "date", "query_type", "author", "message"])

In [5]:
twitter_df.head()

Unnamed: 0,sentiment,id,date,query_type,author,message
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
# Keep only the data we are interested in
twitter_df = twitter_df[["sentiment", "message"]]
twitter_df.head()

Unnamed: 0,sentiment,message
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [7]:
# Check what kind of sentiments we have
twitter_df['sentiment'].value_counts()

0    800000
4    800000
Name: sentiment, dtype: int64

### Word counts

In [8]:
# Get the word count for every message
twitter_df.loc[:, "word_count"] = twitter_df["message"].apply(lambda x: len(x.split()))

In [9]:
twitter_df.head()

Unnamed: 0,sentiment,message,word_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19
1,0,is upset that he can't update his Facebook by ...,21
2,0,@Kenichan I dived many times for the ball. Man...,18
3,0,my whole body feels itchy and like its on fire,10
4,0,"@nationwideclass no, it's not behaving at all....",21


### Character count

In [10]:
# Get the character count for every message
twitter_df.loc[:, "character_count"] = twitter_df["message"].apply(lambda x: len(x))

In [11]:
twitter_df.head()

Unnamed: 0,sentiment,message,word_count,character_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19,115
1,0,is upset that he can't update his Facebook by ...,21,111
2,0,@Kenichan I dived many times for the ball. Man...,18,89
3,0,my whole body feels itchy and like its on fire,10,47
4,0,"@nationwideclass no, it's not behaving at all....",21,111


### Average word length

In [13]:
# Get the average word length for every message
twitter_df.loc[:, "avg_word_len"] = twitter_df["message"].apply(lambda x: pre.get_avg_word_len(x))

In [14]:
twitter_df.head()

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19,115,5.052632
1,0,is upset that he can't update his Facebook by ...,21,111,4.285714
2,0,@Kenichan I dived many times for the ball. Man...,18,89,3.944444
3,0,my whole body feels itchy and like its on fire,10,47,3.7
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714


### Stop words count

In [15]:
# Get the number of stop words in each message
twitter_df.loc[:, "stop_words_count"] = twitter_df["message"].apply(lambda x: len([word for word in x.lower().split() if word in STOP_WORDS]))

In [16]:
twitter_df.head()

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19,115,5.052632,6
1,0,is upset that he can't update his Facebook by ...,21,111,4.285714,9
2,0,@Kenichan I dived many times for the ball. Man...,18,89,3.944444,9
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714,11


### #HashTags and @Mentions counts

In [17]:
# Get the number of hashtags in each message
twitter_df.loc[:, "hashtags_count"] = twitter_df["message"].apply(lambda x: pre.get_number_of_hashtags(x))

In [18]:
# Get the number of mentions in each message
twitter_df.loc[:, "mentions_count"] = twitter_df["message"].apply(lambda x: pre.get_number_of_mentions(x))

In [19]:
twitter_df.head()

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,mentions_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19,115,5.052632,6,0,1
1,0,is upset that he can't update his Facebook by ...,21,111,4.285714,9,0,0
2,0,@Kenichan I dived many times for the ball. Man...,18,89,3.944444,9,0,1
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5,0,0
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714,11,0,1


### Numeric counts

In [20]:
# Get the number of numerical text in each message
twitter_df.loc[:, "numeric_count"] = twitter_df["message"].apply(
    lambda x: len([word for word in x.split() if word.isnumeric()]))

In [21]:
twitter_df.head()

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,mentions_count,numeric_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19,115,5.052632,6,0,1,0
1,0,is upset that he can't update his Facebook by ...,21,111,4.285714,9,0,0,0
2,0,@Kenichan I dived many times for the ball. Man...,18,89,3.944444,9,0,1,0
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5,0,0,0
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714,11,0,1,0


### UPPER case words count

In [22]:
# Get the number of upper case words in each message
twitter_df.loc[:, "upper_case_count"] = twitter_df["message"].apply(
    lambda x: len([word for word in x.split() if word.isupper() and len(word) > 3]))

In [23]:
twitter_df.head()

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,mentions_count,numeric_count,upper_case_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19,115,5.052632,6,0,1,0,0
1,0,is upset that he can't update his Facebook by ...,21,111,4.285714,9,0,0,0,0
2,0,@Kenichan I dived many times for the ball. Man...,18,89,3.944444,9,0,1,0,0
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5,0,0,0,0
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714,11,0,1,0,0


### Randoms check to see if preprocessing is ok so far

In [24]:
print(twitter_df.loc[632]["message"], "\n\n", twitter_df.loc[632])

@raymondroman oh noes. how did you manage to send something to the trash can AND empty it without noticing! i feel for ya  

 sentiment                                                           0
message             @raymondroman oh noes. how did you manage to s...
word_count                                                         23
character_count                                                   122
avg_word_len                                                 4.304348
stop_words_count                                                   14
hashtags_count                                                      0
mentions_count                                                      1
numeric_count                                                       0
upper_case_count                                                    0
Name: 632, dtype: object


## Data cleaning and preprocessing

### Lowercase conversion

In [25]:
# Convert the whole message to lowercase
twitter_df.loc[:, "message"] = twitter_df["message"].apply(lambda x: x.lower())

In [26]:
twitter_df.head()

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,mentions_count,numeric_count,upper_case_count
0,0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",19,115,5.052632,6,0,1,0,0
1,0,is upset that he can't update his facebook by ...,21,111,4.285714,9,0,0,0,0
2,0,@kenichan i dived many times for the ball. man...,18,89,3.944444,9,0,1,0,0
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5,0,0,0,0
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714,11,0,1,0,0


### Contraction to expansion

In [27]:
text = "I'll call you mum. Let's have a meeting 'n see what's going on. I can't say I'm gonna be home soon. I shan't be there. Cheers m'lady"


In [30]:
## TODO bring this function to a python script and test it as well
new_dict = contractions.contractions_dict
new_dict[" u "] = " you "
new_dict [" n "] = " and "
new_dict [" 'n "] = " and "
new_dict[" w "] = " with "
new_dict[" w/ "] = " with "
new_dict[" ur "] = " your "
new_dict[" nah "] = " no "

new_dict = {k.lower(): v.lower() for k,v in new_dict.items()}

def contraction_to_expression(text):
    if type(text) is str:
        for key in new_dict.keys():
            value = new_dict[key]
            text = text.replace(key, value)
    return text

In [31]:
contraction_to_expression(text.lower())

"i will call you mum. let us have a meeting and see what is going on. i cannot say i am going to be home soon. i shall not be there. cheers m'lady"

In [32]:
%%time
twitter_df.loc[:, "message"] = twitter_df["message"].apply(lambda x: contraction_to_expression(x))

CPU times: total: 1min 54s
Wall time: 1min 58s


In [33]:
twitter_df.head()

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,mentions_count,numeric_count,upper_case_count
0,0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",19,115,5.052632,6,0,1,0,0
1,0,is upset that he cannot update his facebook by...,21,111,4.285714,9,0,0,0,0
2,0,@kenichan i dived many times for the ball. man...,18,89,3.944444,9,0,1,0,0
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5,0,0,0,0
4,0,"@nationwideclass no, it is not behaving at all...",21,111,4.285714,11,0,1,0,0


### Count and remove emails

In [35]:
# Get the number of emails for each message
twitter_df.loc[:, "email_count"] = twitter_df["message"].apply(lambda x: len(re.findall(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', x)))

In [37]:
twitter_df.loc[twitter_df["email_count"] > 0].head()

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,mentions_count,numeric_count,upper_case_count,email_count
4054,0,i want a new laptop. hp tx2000 is the bomb. :...,20,103,4.15,8,0,0,0,1,1
7917,0,who stole elledell@gmail.com?,3,31,9.0,1,0,0,0,0,1
8496,0,@alexistehpom really? did you send out all th...,20,130,5.5,11,0,1,0,0,1
10290,0,@laureystack awh...that is kind of sad lol ad...,8,76,8.5,0,0,1,0,0,1
16413,0,"@jilliancyork got 2 bottom of it, human error...",21,137,5.428571,7,0,1,1,0,1


In [38]:
text = "hi email is jojojo@rorororo.dashdgashd"
re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', text)

'hi email is '

In [39]:
# Remove the emails from the message
twitter_df.loc[:, "message"] = twitter_df["message"].apply(lambda x: re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '', x))

### Count and remove URLs

In [50]:
text = "@switchfoot http://twitpic.com/2y1zl - awww, t..."

In [51]:
re.findall(r'(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?', text)

[('http', 'twitpic.com', '/2y1zl')]

In [52]:
re.sub(r'(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?', '', text)

'@switchfoot  - awww, t...'

In [47]:
# Count the number of urls in the message
twitter_df["url_count"] = twitter_df["message"].apply(lambda x: len(re.findall(r'(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?', text)))

In [53]:
# Remove urls from the message
twitter_df.loc[:, "message"] = twitter_df["message"].apply(lambda x: re.sub(r'(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?', '', x))

In [54]:
twitter_df.head()

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,mentions_count,numeric_count,upper_case_count,email_count,url_count
0,0,"@switchfoot - awww, that is a bummer. you sh...",19,115,5.052632,6,0,1,0,0,0,1
1,0,is upset that he cannot update his facebook by...,21,111,4.285714,9,0,0,0,0,0,1
2,0,@kenichan i dived many times for the ball. man...,18,89,3.944444,9,0,1,0,0,0,1
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5,0,0,0,0,0,1
4,0,"@nationwideclass no, it is not behaving at all...",21,111,4.285714,11,0,1,0,0,0,1


### Remove RT = Retweet and count and remove mentions

In [66]:
# Remove RT from the messages
twitter_df.loc[:, "message"] = twitter_df["message"].apply(lambda x: re.sub(r'( )rt( @)', r'\1\2', x))

In [70]:
# Count mentions
twitter_df["mention_count"] = twitter_df["message"].apply(lambda x: len(re.findall(r'', x)))