In [1]:
# Import data processing libraries
import pandas as pd
import numpy as np

# Import text processing libraries
import re
import urllib

# Import NLP libraries
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import contractions

# Import functions from local scripts
import sys
sys.path.insert(1, './scripts/development')
import scripts.development.preprocessing as pre



In [None]:
# Determine the file encoding of the dataset
# This only needs to be run once

#get_file_encoding("../data/twitter16m.csv")

# The detector reported utf-8, but reading the file with that encoding raises an error, so I will use the encoding from the tutorial instead

In [None]:
# Load the tweets from the CSV file into a DataFrame.
# header=None is passed, so the column names are supplied through names=.
twitter_df = pd.read_csv(
    "../data/twitter16m.csv",
    encoding="latin1",
    header=None,
    names=["sentiment", "id", "date", "query_type", "author", "message"],
)

In [None]:
# Preview the first five rows of the freshly loaded data
twitter_df.head(n=5)

In [None]:
# Keep only the data we are interested in.
# .copy() makes the result an independent DataFrame, so the feature columns
# assigned in the following cells do not trigger pandas' SettingWithCopyWarning.
twitter_df = twitter_df[["sentiment", "message"]].copy()
twitter_df.head()

In [None]:
# Count how many messages carry each sentiment value
twitter_df["sentiment"].value_counts()

### Word counts

In [None]:
# Number of whitespace-separated words in every message
twitter_df.loc[:, "word_count"] = twitter_df["message"].apply(
    lambda message: len(message.split())
)

In [None]:
# Preview the first five rows, now including word_count
twitter_df.head(n=5)

### Character count

In [None]:
# Length of every message in characters (len is applied to each message directly)
twitter_df.loc[:, "character_count"] = twitter_df["message"].apply(len)

In [None]:
# Preview the first five rows, now including character_count
twitter_df.head(n=5)

### Average word length

In [None]:
# Average word length of every message, computed by pre.get_avg_word_len
twitter_df.loc[:, "avg_word_len"] = twitter_df["message"].apply(pre.get_avg_word_len)

In [None]:
# Preview the first five rows, now including avg_word_len
twitter_df.head(n=5)

### Stop words count

In [None]:
# Count the whitespace-separated tokens of each lowercased message that are in STOP_WORDS
twitter_df.loc[:, "stop_words_count"] = twitter_df["message"].apply(
    lambda message: sum(token in STOP_WORDS for token in message.lower().split())
)

In [None]:
# Preview the first five rows, now including stop_words_count
twitter_df.head(n=5)

### #HashTags count

In [None]:
# Number of hashtags in every message, computed by pre.get_hashtags_count
twitter_df.loc[:, "hashtags_count"] = twitter_df["message"].apply(pre.get_hashtags_count)

In [None]:
# Preview the first five rows, now including hashtags_count
twitter_df.head(n=5)

### Numeric counts

In [None]:
# Count the whitespace-separated tokens of each message for which str.isnumeric() is true
twitter_df.loc[:, "numeric_count"] = twitter_df["message"].apply(
    lambda message: sum(token.isnumeric() for token in message.split())
)

In [None]:
# Preview the first five rows, now including numeric_count
twitter_df.head(n=5)

### UPPER case words count

In [None]:
# Number of upper case words in every message, computed by pre.get_upper_case_count
twitter_df.loc[:, "upper_case_count"] = twitter_df["message"].apply(pre.get_upper_case_count)

In [None]:
# Preview the first five rows, now including upper_case_count
twitter_df.head(n=5)

### Random check to see if preprocessing is OK so far

In [None]:
# Spot-check the row labelled 632: print its message, then every column of that row.
# A single .loc[row, column] lookup is used instead of chained indexing.
print(
    twitter_df.loc[632, "message"],
    "\n\n",
    twitter_df.loc[632],
)

## Data cleaning and preprocessing

### Lowercase conversion

In [None]:
# Lowercase every message (str.lower is applied to each message directly)
twitter_df.loc[:, "message"] = twitter_df["message"].apply(str.lower)

In [None]:
# Preview the first five rows after the lowercase conversion
twitter_df.head(n=5)

### Contraction to expansion

In [None]:
%%time
twitter_df.loc[:, "message"] = twitter_df["message"].apply(lambda x: pre.expand_contractions(x))

In [None]:
# Preview the first five rows after the contraction step
twitter_df.head(n=5)

### Count and remove emails

In [None]:
# Number of emails in every message, computed by pre.get_email_count
twitter_df.loc[:, "email_count"] = twitter_df["message"].apply(pre.get_email_count)

In [None]:
# Strip the emails from every message via pre.remove_emails
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_emails)

In [None]:
# Preview the first five rows after counting and removing emails
twitter_df.head(n=5)

### Count and remove URLs

In [None]:
# Number of URLs in every message, computed by pre.get_url_count
twitter_df["url_count"] = twitter_df["message"].apply(pre.get_url_count)

In [None]:
# Strip the URLs from every message via pre.remove_urls
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_urls)

In [None]:
# Preview the first five rows after counting and removing URLs
twitter_df.head(n=5)

### Remove RT (retweet) markers, then count and remove reply targets and mentions

In [None]:
# Strip the RT (retweet) marker from every message via pre.remove_retweet.
# NOTE(review): the messages were already lowercased in the "Lowercase conversion"
# step, so this only works if pre.remove_retweet matches "rt" case-insensitively — confirm.
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_retweet)

In [None]:
# Number of mentions in every message, computed by pre.get_mention_count
twitter_df["mention_count"] = twitter_df["message"].apply(pre.get_mention_count)

In [None]:
# Strip every mention from the messages via pre.remove_mentions
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_mentions)

In [None]:
# Flag every message as reply / not reply using pre.is_reply.
# NOTE(review): all mentions have already been removed from the messages at this
# point; if pre.is_reply relies on a leading @mention it can never fire here —
# confirm the intended order of these steps.
twitter_df["is_reply"] = twitter_df["message"].apply(pre.is_reply)

In [None]:
# Strip the reply target from every message via pre.remove_reply_target
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_reply_target)

### Remove accents

In [None]:
# Normalise accented characters in every message via pre.remove_accents
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_accents)

### Remove special characters and extra spaces from text

In [None]:
# Run every message through pre.remove_special_characters, which is meant to keep
# only alphanumeric characters, hyphens and spaces
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_special_characters)

In [None]:
# Collapse the extra spaces in every message via pre.remove_extra_spaces
twitter_df.loc[:, "message"] = twitter_df["message"].apply(pre.remove_extra_spaces)