# Assignment 3: Sentiment Classification of Tweets
## Feature Preprocessing/Engineering

# Import Data
Import the raw Twitter data so that it can be compared against the custom feature-engineered versions.

In [1]:
import pandas as pd
from ftfy import fix_encoding


def load_data(filename):
    """Read a tweet CSV and split it into tweets, labels and tweet ids.

    The columns of the loaded file are renamed, in order, to
    ``sentiment``, ``tweet_id`` and ``tweet``.

    Args:
        filename (str): filename/filepath of csv

    Returns:
        tuple: ``(features, labels, tweet_id)`` where ``features`` is the
        ``tweet`` column after ``fix_encoding`` has been applied to every
        entry, ``labels`` is the ``sentiment`` column and ``tweet_id`` is
        the ``tweet_id`` column.
    """
    frame = pd.read_csv(filename)
    frame.columns = ['sentiment', 'tweet_id', 'tweet']

    # repair mis-encoded text (e.g. emoji) before any feature engineering
    tweets = frame['tweet'].apply(fix_encoding)

    return tweets, frame['sentiment'], frame['tweet_id']

In [2]:
import numpy as np
from ast import literal_eval
from scipy.sparse import csr_matrix

# Read the train / development / test splits. Each call yields the raw
# tweets, their sentiment labels and the tweet ids, in that order.
full_train_f_raw, full_train_l, train_id = load_data("data/train_full.csv")
full_dev_f_raw, full_dev_l, dev_id = load_data("data/dev_full.csv")
full_test_f_raw, full_test_l, test_id = load_data("data/test_full.csv")

# Keep the labels as plain numpy arrays.
full_train_l, full_dev_l, full_test_l = (
    np.array(full_train_l),
    np.array(full_dev_l),
    np.array(full_test_l),
)

# Sanity check: every split has exactly one label per tweet.
assert len(full_train_l) == len(full_train_f_raw)
assert len(full_dev_l) == len(full_dev_f_raw)
assert len(full_test_l) == len(full_test_f_raw)


## Preprocessing the tweets / Feature Engineering
Data preprocessing is one of the critical steps in any machine learning project. It involves cleaning and formatting the data before feeding it into a machine learning algorithm. For NLP, the preprocessing steps comprise the following tasks:
* Clean the tweets by:
  * lowercasing
  * removing urls & hyperlinks e.g. https, www, http
  * removing user @ references
  * removing standalone #
  * removing numeric terms
* Tokenising the tweet
* Replace emoticons & emojis with word/meaning
* Removing stop words (optional)
* Removing punctuations
* Lemmatise the tweet
* Reduce words with extra characters at the end such as 'happyyyy' to 'happy'

In [3]:
import re

# clean the tweets
def clean_tweet(tweet):
    """Normalise the raw text of a single tweet.

    The following steps are applied in order:

    1. lowercase the text
    2. remove urls and hyperlinks (anything starting with ``http``,
       ``https`` or ``www`` up to the next whitespace)
    3. remove ``@user`` references
    4. remove every ``#`` character (the hashtag text itself is kept)
    5. remove every digit
    6. strip leading/trailing whitespace

    Args:
        tweet (str): tweet

    Returns:
        str: cleaned tweet
    """
    # str.lower() returns a new string, so the result has to be re-assigned;
    # calling it without assignment leaves the tweet unchanged
    tweet = tweet.lower()

    # remove urls and hyperlinks
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)

    # remove user @ references
    tweet = re.sub(r'\@\w+', '', tweet)

    # remove hash # sign from the word but don't remove the text after the #
    tweet = re.sub(r'#', '', tweet)

    # remove numeric terms (every individual digit is dropped)
    tweet = re.sub(r'[0-9]', '', tweet)

    return tweet.strip()


## Helper functions to replace emoticons and emoji to their description

In [4]:
# import libraries
import spacy
from emot.emo_unicode import EMOTICONS_EMO


def replace_emoticons(tweet, enable_emo, emoticons=None):
    """Replace (or remove) all emoticons in a tokenised tweet.

    A token counts as an emoticon when either its upper-cased form
    (``XD``, ``:O``, ``:D``) or the token exactly as written (``:c``,
    ``:o``, ``:x``) is a key of the emoticon table. The upper-cased form
    takes precedence when both are present.

    Args:
        tweet ([str]): tokenised tweet; it is not modified
        enable_emo (bool): if truthy, every emoticon is replaced with its
            lower-cased description; otherwise every emoticon is removed
        emoticons (dict, optional): mapping of emoticon to description.
            Defaults to ``EMOTICONS_EMO``.

    Returns:
        [str]: new list of tokens with the emoticons replaced or removed
    """
    if emoticons is None:
        emoticons = EMOTICONS_EMO

    cleaned = []
    for token in tweet:
        # look the token up in upper case first, then exactly as written
        key = token.upper()
        if key not in emoticons:
            key = token

        if key not in emoticons:
            # ordinary token: keep untouched
            cleaned.append(token)
        elif enable_emo:
            # The character class keeps word characters plus the literal
            # characters '|', 's' and '+'; everything else (spaces and
            # punctuation included) is dropped, so the description
            # collapses into a single token.
            description = re.sub(r'[^\w|s+]', "", emoticons[key])
            cleaned.append(description.lower())
        # else: emoticons are disabled, so the token is dropped

    return cleaned


def replace_emojis(tweet_doc, enable_emo):
    """Turn a tokenised tweet into a list of strings, handling emoji.

    Args:
        tweet_doc: sequence of tokens exposing ``.text``, ``._.is_emoji``
            and ``._.emoji_desc``; the sequence itself exposes
            ``._.has_emoji``
        enable_emo (bool): if truthy, each emoji token is replaced with its
            description (spaces stripped out); otherwise emoji tokens are
            left out of the result

    Returns:
        [str]: the token texts with emoji replaced or removed
    """
    # nothing to do when the tweet is flagged as emoji-free
    if tweet_doc._.has_emoji is False:
        return [token.text for token in tweet_doc]

    texts = []
    for token in tweet_doc:
        if not token._.is_emoji:
            texts.append(token.text)
        elif enable_emo:
            # squash the description into a single word
            texts.append(token._.emoji_desc.replace(' ', ''))
        # emoji with enable_emo switched off: skipped entirely

    return texts

## Preprocessor function

In [5]:
# import libraries
import string
from emot.emo_unicode import EMOTICONS_EMO
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from emot.emo_unicode import EMOTICONS_EMO

# set the stopwords
stop_words = stopwords.words('english')

def preprocess_tweets(tweets, enable_emo=True, enable_stopwords=False):
    """Preprocess tweets (feature engineering).

    Every tweet is cleaned with ``clean_tweet``, tokenised by the spaCy
    pipeline, has its emoji and emoticons replaced (or removed), is
    optionally stripped of stop words, has punctuation and tokens shorter
    than two characters removed, is lemmatised and is finally joined back
    into one string in which repeated character sequences are collapsed.

    Args:
        tweets (pd.Series): raw tweets, one string per entry
        enable_emo (bool): if True, emoji and emoticons are replaced with
            their description; if False they are removed
        enable_stopwords (bool): if True, tokens found in ``stop_words``
            are removed (compared case-insensitively)

    Returns:
        [str]: preprocessed tweets, one space-joined string per input tweet
    """
    # clean the tweet
    tweets = tweets.apply(clean_tweet)

    # load the spacymoji nlp pipeline
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("emoji", first=True)

    # The helpers below do not depend on the individual tweet, so they are
    # built once instead of once per tweet.
    # translator for replacing punctuation also adding empty space as punct
    translator = str.maketrans('', '', string.punctuation + ' ')
    lem = WordNetLemmatizer()
    # set membership is O(1); the stop word list is all lower case, so
    # tokens are lower-cased before the lookup
    stop_set = frozenset(stop_words) if enable_stopwords else frozenset()

    replacement = []

    # add tweets into the spacy pipeline
    # NOTE(review): any pipeline component not listed in `disable` still
    # runs on every tweet -- confirm whether the remaining ones are needed.
    for doc in nlp.pipe(tweets,
        disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):

        # replace emojis with description
        tokens = replace_emojis(doc, enable_emo)

        # replace emoticons with description
        tokens = replace_emoticons(tokens, enable_emo)

        # remove stopwords
        if enable_stopwords:
            tokens = [w for w in tokens if w.lower() not in stop_set]

        # remove punctuations
        tokens = [w.translate(translator) for w in tokens]

        # remove empty and single-character tokens
        tokens = [w for w in tokens if len(w) > 1]

        # lemmatise tweet (pos='a': tokens are lemmatised as adjectives)
        tokens = [lem.lemmatize(w, pos='a') for w in tokens]

        # Collapse any sequence that is repeated three or more times in a
        # row into a single copy, e.g. happyyyyy -> happy. The pattern runs
        # on the joined string and matches any repeated substring, not only
        # trailing characters.
        replacement.append(re.sub(r'(.*)\1{2,}', r'\1', ' '.join(tokens)))

    return replacement

In [6]:
# Preprocess tweets/features on full data
# Default settings: emoji/emoticons enabled, stop words kept.
train_f_emo, dev_f_emo, test_f_emo = [
    preprocess_tweets(split)
    for split in (full_train_f_raw, full_dev_f_raw, full_test_f_raw)
]

# Emoji/emoticon handling switched off.
train_f_no_emo, dev_f_no_emo, test_f_no_emo = [
    preprocess_tweets(split, enable_emo=False)
    for split in (full_train_f_raw, full_dev_f_raw, full_test_f_raw)
]

# Stop word removal switched on.
train_f_sw, dev_f_sw, test_f_sw = [
    preprocess_tweets(split, enable_stopwords=True)
    for split in (full_train_f_raw, full_dev_f_raw, full_test_f_raw)
]

print("finished preprocess")

finished preprocess


# Save Preprocessed Data

In [7]:
import pandas as pd

# column names of every exported file
headers = ['sentiment', 'tweet_id', 'tweet']
file_exe = '.csv'

# output name -> [labels, tweet ids, preprocessed tweets]; each entry must
# pair a split's labels and ids with the features of the SAME variant
processed_data = {
    'train_f_emo':      [full_train_l, train_id, train_f_emo],
    'dev_f_emo':        [full_dev_l, dev_id, dev_f_emo],
    'test_f_emo':       [full_test_l, test_id, test_f_emo],

    'train_f_no_emo':   [full_train_l, train_id, train_f_no_emo],
    # was dev_f_emo, which exported the emoji-enabled features under the
    # no-emoji file name
    'dev_f_no_emo':     [full_dev_l, dev_id, dev_f_no_emo],
    'test_f_no_emo':    [full_test_l, test_id, test_f_no_emo],

    'train_f_sw':       [full_train_l, train_id, train_f_sw],
    'dev_f_sw':         [full_dev_l, dev_id, dev_f_sw],
    'test_f_sw':        [full_test_l, test_id, test_f_sw],
}

# export each preprocessed data set to data/<name>.csv
for name, columns in processed_data.items():
    filename = 'data/' + name + file_exe
    # stack the three sequences side by side, one per header
    df = pd.DataFrame(np.column_stack(columns), columns=headers)
    df.to_csv(filename, index=False)