In [40]:
# Import data processing libraries
import pandas as pd

# Import NLP libraries
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import PorterStemmer

# Import functions from local scripts
import sys
sys.path.insert(1, './scripts/development')
import scripts.development.preprocessing as pre

In [None]:
# Determine file encoding of dataset
# Only needs to be run once

#get_file_encoding("../data/twitter16m.csv")

# The detector reported utf-8, but reading the file as utf-8 raises an error, so I use the encoding from the tutorial instead

In [2]:
# Load the raw tweets into a DataFrame.
# header=None: the first line of the file is read as data, so names= supplies the column labels.
# encoding="latin1": utf-8 raised an error when reading this file.
twitter_df = pd.read_csv(
    "../data/twitter16m.csv",
    encoding="latin1",
    header=None,
    names=["sentiment", "id", "date", "query_type", "author", "message"],
)

In [3]:
# Preview the first five rows of the freshly loaded dataset
twitter_df.head(n=5)

Unnamed: 0,sentiment,id,date,query_type,author,message
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Keep only the columns needed for the analysis: the label and the tweet text.
# .copy() makes the selection an independent DataFrame. Without it, pandas treats the
# result as derived from the full frame, and the column assignments in the following
# cells can raise SettingWithCopyWarning.
twitter_df = twitter_df[["sentiment", "message"]].copy()
twitter_df.head()

Unnamed: 0,sentiment,message
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Count how many tweets carry each sentiment label
twitter_df["sentiment"].value_counts()

0    800000
4    800000
Name: sentiment, dtype: int64

### Word counts

In [6]:
# Number of whitespace-separated tokens in every message
twitter_df.loc[:, "word_count"] = twitter_df["message"].map(lambda message: len(message.split()))

In [7]:
# Preview the first five rows to check the new word_count column
twitter_df.head(n=5)

Unnamed: 0,sentiment,message,word_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19
1,0,is upset that he can't update his Facebook by ...,21
2,0,@Kenichan I dived many times for the ball. Man...,18
3,0,my whole body feels itchy and like its on fire,10
4,0,"@nationwideclass no, it's not behaving at all....",21


### Character count

In [8]:
# Length of every message in characters (spaces included, since len() is applied to the raw string)
twitter_df.loc[:, "character_count"] = twitter_df["message"].map(len)

In [9]:
# Preview the first five rows to check the new character_count column
twitter_df.head(n=5)

Unnamed: 0,sentiment,message,word_count,character_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19,115
1,0,is upset that he can't update his Facebook by ...,21,111
2,0,@Kenichan I dived many times for the ball. Man...,18,89
3,0,my whole body feels itchy and like its on fire,10,47
4,0,"@nationwideclass no, it's not behaving at all....",21,111


### Average word length

In [10]:
# Average word length of every message, computed by the project helper pre.get_avg_word_len
twitter_df.loc[:, "avg_word_len"] = twitter_df["message"].apply(pre.get_avg_word_len)

In [11]:
# Preview the first five rows to check the new avg_word_len column
twitter_df.head(n=5)

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19,115,5.052632
1,0,is upset that he can't update his Facebook by ...,21,111,4.285714
2,0,@Kenichan I dived many times for the ball. Man...,18,89,3.944444
3,0,my whole body feels itchy and like its on fire,10,47,3.7
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714


### Stop words count

In [12]:
# Count the stop words in each message.
# Tokens are lowercased before the lookup; punctuation is still attached at this stage,
# so a token such as "the," is not counted.
twitter_df.loc[:, "stop_words_count"] = twitter_df["message"].apply(
    lambda message: sum(token in STOP_WORDS for token in message.lower().split())
)

In [13]:
# Preview the first five rows to check the new stop_words_count column
twitter_df.head(n=5)

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19,115,5.052632,6
1,0,is upset that he can't update his Facebook by ...,21,111,4.285714,9
2,0,@Kenichan I dived many times for the ball. Man...,18,89,3.944444,9
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714,11


### #HashTags count

In [14]:
# Number of hashtags in each message, computed by the project helper pre.get_hashtags_count
twitter_df.loc[:, "hashtags_count"] = twitter_df["message"].apply(pre.get_hashtags_count)

In [15]:
# Preview the first five rows to check the new hashtags_count column
twitter_df.head(n=5)

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19,115,5.052632,6,0
1,0,is upset that he can't update his Facebook by ...,21,111,4.285714,9,0
2,0,@Kenichan I dived many times for the ball. Man...,18,89,3.944444,9,0
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5,0
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714,11,0


### Numeric counts

In [16]:
# Count the purely numeric tokens in each message.
# str.isnumeric() is True only when every character of the token is numeric,
# so "50" counts but "50%" or "3.5" do not.
twitter_df.loc[:, "numeric_count"] = twitter_df["message"].apply(
    lambda message: sum(token.isnumeric() for token in message.split())
)

In [17]:
# Preview the first five rows to check the new numeric_count column
twitter_df.head(n=5)

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,numeric_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19,115,5.052632,6,0,0
1,0,is upset that he can't update his Facebook by ...,21,111,4.285714,9,0,0
2,0,@Kenichan I dived many times for the ball. Man...,18,89,3.944444,9,0,0
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5,0,0
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714,11,0,0


### UPPER case words count

In [18]:
# Number of upper case words in each message, computed by the project helper
# pre.get_upper_case_count. This has to run before the messages are lowercased.
twitter_df.loc[:, "upper_case_count"] = twitter_df["message"].apply(pre.get_upper_case_count)

In [19]:
# Preview the first five rows to check the new upper_case_count column
twitter_df.head(n=5)

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,numeric_count,upper_case_count
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",19,115,5.052632,6,0,0,0
1,0,is upset that he can't update his Facebook by ...,21,111,4.285714,9,0,0,0
2,0,@Kenichan I dived many times for the ball. Man...,18,89,3.944444,9,0,0,0
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5,0,0,0
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714,11,0,0,0


### Random check to see if the preprocessing is OK so far

In [20]:
# Spot-check one arbitrary row: print its full message text, then all of its feature values
sample_row = twitter_df.loc[632]
print(sample_row["message"], "\n\n", sample_row)

@raymondroman oh noes. how did you manage to send something to the trash can AND empty it without noticing! i feel for ya  

 sentiment                                                           0
message             @raymondroman oh noes. how did you manage to s...
word_count                                                         23
character_count                                                   122
avg_word_len                                                 4.304348
stop_words_count                                                   14
hashtags_count                                                      0
numeric_count                                                       0
upper_case_count                                                    0
Name: 632, dtype: object


## Data cleaning and preprocessing

### Lowercase conversion

In [21]:
# Lowercase every message (the upper-case statistics were collected before this step)
twitter_df.loc[:, "message"] = twitter_df["message"].map(str.lower)

In [None]:
# Preview the first five rows to check that the messages are now lowercase
twitter_df.head(n=5)

### Contraction to expansion

In [None]:
%%time
twitter_df.loc[:, "message"] = twitter_df["message"].apply(lambda x: pre.expand_contractions(x))

In [22]:
# Preview the first five rows to inspect the messages after the contraction-expansion step
twitter_df.head(n=5)

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,numeric_count,upper_case_count
0,0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",19,115,5.052632,6,0,0,0
1,0,is upset that he can't update his facebook by ...,21,111,4.285714,9,0,0,0
2,0,@kenichan i dived many times for the ball. man...,18,89,3.944444,9,0,0,0
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5,0,0,0
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714,11,0,0,0


### Count and remove emails

In [23]:
# Number of e-mail addresses in each message, computed by pre.get_email_count.
# Counted before the addresses are stripped from the text.
twitter_df.loc[:, "email_count"] = twitter_df["message"].apply(pre.get_email_count)

In [24]:
# Strip e-mail addresses from every message with pre.remove_emails
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_emails)

In [25]:
# Preview the first five rows to check the new email_count column
twitter_df.head(n=5)

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,numeric_count,upper_case_count,email_count
0,0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",19,115,5.052632,6,0,0,0,0
1,0,is upset that he can't update his facebook by ...,21,111,4.285714,9,0,0,0,0
2,0,@kenichan i dived many times for the ball. man...,18,89,3.944444,9,0,0,0,0
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5,0,0,0,0
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714,11,0,0,0,0


### Count and remove URLs

In [26]:
# Number of URLs in each message, computed by pre.get_url_count.
# Counted before the URLs are stripped from the text.
twitter_df["url_count"] = twitter_df["message"].apply(pre.get_url_count)

In [27]:
# Strip URLs from every message with pre.remove_urls
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_urls)

In [28]:
# Preview the first five rows to check the new url_count column and the URL-free messages
twitter_df.head(n=5)

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,numeric_count,upper_case_count,email_count,url_count
0,0,"@switchfoot - awww, that's a bummer. you sho...",19,115,5.052632,6,0,0,0,0,1
1,0,is upset that he can't update his facebook by ...,21,111,4.285714,9,0,0,0,0,0
2,0,@kenichan i dived many times for the ball. man...,18,89,3.944444,9,0,0,0,0,0
3,0,my whole body feels itchy and like its on fire,10,47,3.7,5,0,0,0,0,0
4,0,"@nationwideclass no, it's not behaving at all....",21,111,4.285714,11,0,0,0,0,0


### Remove RT = Retweet, count and remove reply targets and mentions

In [29]:
# Strip the retweet marker (RT) from every message with pre.remove_retweet
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_retweet)

In [30]:
# Number of mentions in each message, computed by pre.get_mention_count.
# Counted before the mentions are stripped from the text.
twitter_df["mention_count"] = twitter_df["message"].apply(pre.get_mention_count)

In [31]:
# Strip mentions from every message with pre.remove_mentions
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_mentions)

In [32]:
# Flag whether each tweet is a reply, as decided by pre.is_reply.
# Computed before the reply target is stripped from the text.
twitter_df["is_reply"] = twitter_df["message"].apply(pre.is_reply)

In [33]:
# Strip the reply target from every message with pre.remove_reply_target
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_reply_target)

### Remove accents

In [34]:
# Replace accented characters with their unaccented form using pre.remove_accents
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_accents)

### Replace emoticons

In [35]:
# Swap emoticons for descriptive words using pre.replace_emoticons.
# This runs before special characters are removed.
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.replace_emoticons)

### Remove special characters from text, single characters and stop words

In [36]:
%%time
# Remove all characters that are not alphanumeric, hyphen or space from message
twitter_df.loc[:, "message"] = twitter_df["message"].apply(lambda x: pre.remove_special_characters(x))

CPU times: total: 6.84 s
Wall time: 6.85 s


In [37]:
# Remove single characters from the message with pre.remove_single_characters
# NOTE(review): the helper's exact behaviour is not visible here - confirm whether it
# also collapses the extra spaces left behind by the earlier removals.
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_single_characters)

In [38]:
%%time
# Remove stop words from the message
twitter_df.loc[:, "message"] = twitter_df["message"].apply(lambda x: " ".join([word for word in x.split() if word not in STOP_WORDS]))

CPU times: total: 4.94 s
Wall time: 4.98 s


In [39]:
# Preview the first five rows to inspect the cleaned messages and the mention_count / is_reply columns
twitter_df.head(n=5)

Unnamed: 0,sentiment,message,word_count,character_count,avg_word_len,stop_words_count,hashtags_count,numeric_count,upper_case_count,email_count,url_count,mention_count,is_reply
0,0,awww bummer shoulda got david carr day wink_smirk,19,115,5.052632,6,0,0,0,0,1,0,1
1,0,upset update facebook texting cry result schoo...,21,111,4.285714,9,0,0,0,0,0,0,0
2,0,dived times ball managed save 50 rest bounds,18,89,3.944444,9,0,0,0,0,0,0,1
3,0,body feels itchy like fire,10,47,3.7,5,0,0,0,0,0,0,0
4,0,behaving mad,21,111,4.285714,11,0,0,0,0,0,0,1


### Stemming the words

In [41]:
%%time
# Remove stop words from the message
stemmer = PorterStemmer()
twitter_df.loc[:, "message"] = twitter_df["message"].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))

CPU times: total: 4min 37s
Wall time: 4min 39s


In [42]:
# Persist the preprocessed frame for the next step.
# index=True (the pandas default) writes the row index as the first, unnamed CSV column.
twitter_df.to_csv("../data/preprocessed_step_1.csv", index=True)