# Hate Speech Detection - PreProcessing

### Steps:
#### 1. Import dataset, decode HTML character references in the tweet text (e.g. `&amp;` -> `&`) to Unicode
#### 2. Preprocessing (ordered):
##### 2.1 remove whitespaces;
##### 2.2 remove urls;
##### 2.3 remove mentions;
##### 2.4 remove retweet (RT);
##### 2.5 convert abbreviations to formal words
##### 2.6 convert contractions;
##### 2.7 convert emoticons to words OR remove emoticons
##### 2.8 convert emojis to words OR remove emojis
##### 2.9 convert hashtags to words OR remove hashtags
##### 2.10 remove punctuation
##### 2.11 lowercase letters
##### 2.12 spell checker (select 1 of 3 functions with different libraries)

#### 3. Split dataset into train and test
#### 4. EDA of original dataset

### Import dataset and split into train and test

In [None]:
# Load the dataset as pandas.dataframe object, set the first column as the index column
import pandas as pd
df = pd.read_csv("labeled_data.csv", index_col=0)

In [None]:
# Convert original tweets from ascii to unicode
import html
def convert_to_unicode(text):
    """Return ``text`` with HTML character references decoded.

    Named (``&amp;``) and numeric (``&#128553;``) references are replaced by
    the characters they stand for, via :func:`html.unescape`.
    """
    decoded = html.unescape(text)
    return decoded

df['tweet'] = df['tweet'].apply(convert_to_unicode)

In [None]:
# Set column width for long text display
pd.set_option('display.max_colwidth', 350)

In [None]:
df.head(5)

In [None]:
# Visualize class distribution of dataset
import matplotlib.pyplot as plt
import seaborn as sns
# Two side-by-side axes: a pie chart (left) and a count plot (right) of df['class'].
fig, axes = plt.subplots(ncols=2, figsize=(8, 3), dpi=100)
plt.tight_layout()

# Boolean masks per class; they are defined here but not used in this cell.
# NOTE(review): the masks and the label order below treat class 0 as "Neither"
# and class 2 as "HateSpeech". The public Davidson et al. labeled_data.csv
# documents 0 = hate speech, 1 = offensive, 2 = neither -- confirm against the CSV.
HATESPEECH_TWEETS = df['class'] == 2
OFFENSIVE_TWEETS = df['class'] == 1
NEITHER_TWEETS = df['class'] == 0

# Rows per class are taken from the non-null count of the 'count' column;
# the labels are applied positionally to the sorted class values.
df.groupby('class').count()['count'].plot(kind='pie', ax=axes[0], labels=['Neither', 'Offensive', 'HateSpeech'], subplots=True)
sns.countplot(x=df['class'], hue=df['class'], ax=axes[1])

axes[0].set_ylabel('')
axes[1].set_ylabel('')
# Tick labels are assigned by position, in the same order as the pie labels.
axes[1].set_xticklabels(['Neither', 'Offensive', 'HateSpeech'])
axes[0].tick_params(axis='x', labelsize=6)
axes[0].tick_params(axis='y', labelsize=6)
axes[1].tick_params(axis='x', labelsize=6)
axes[1].tick_params(axis='y', labelsize=6)

# The tweet total in the titles is hard-coded, not computed from df.
axes[0].set_title('Target Distribution in dataset (24783 tweets)', fontsize=9)
axes[1].set_title('Target Count in dataset (24783 tweets)', fontsize=9)

plt.show()

In [None]:
# Record the meta features of original tweets
import string
import numpy as np
from wordcloud import STOPWORDS

def _words(tweet):
    # Whitespace-separated tokens of the tweet (coerced to str first).
    return str(tweet).split()


# Per-tweet meta features, all computed from the original (uncleaned) text.
tweets = df['tweet']
df['word_count'] = tweets.apply(lambda tweet: len(_words(tweet)))
df['unique_word_count'] = tweets.apply(lambda tweet: len(set(_words(tweet))))
df['stop_word_count'] = tweets.apply(
    lambda tweet: sum(1 for token in str(tweet).lower().split() if token in STOPWORDS)
)
df['mean_word_length'] = tweets.apply(
    lambda tweet: np.mean([len(token) for token in _words(tweet)])
)
df['char_count'] = tweets.apply(lambda tweet: len(str(tweet)))
df['punctuation_count'] = tweets.apply(
    lambda tweet: sum(1 for char in str(tweet) if char in string.punctuation)
)

In [None]:
# Count mentions, urls and hashtags
import re # regular expression library
# Patterns are raw strings so that backslash sequences such as \( \w \d reach
# the regex engine untouched instead of being (invalid) string escapes.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
mention_regex = r'@[\w\-]+'
hashtag_regex = r'(?!&)#(?!\d\d)[\w-]+'

# Number of non-overlapping matches of each pattern in the original tweet text.
df['mention_count'] = df['tweet'].apply(lambda x: len(re.findall(mention_regex, x)))
df['url_count'] = df['tweet'].apply(lambda x: len(re.findall(url_regex, x)))
df['hashtag_count'] = df['tweet'].apply(lambda x: len(re.findall(hashtag_regex, x)))

In [None]:
# Visualize df before preprocessing
df.head(3)

## Preprocessing

In [None]:
# Replace each run of whitespace ('\s+') with a single space
import re
def remove_whitespaces(text):
    """Collapse every run of whitespace in ``text`` into one space.

    Tabs and newlines count as whitespace. Leading and trailing whitespace is
    reduced to a single space, not stripped.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', text)

df['cleaned_tweet'] = df['tweet'].apply(remove_whitespaces)
#df.head(5) # tips: uncomment this line to visualize result

In [None]:
# Remove urls
import re
def remove_urls(text):
    """Return ``text`` with every http:// or https:// URL deleted.

    Whitespace around a removed URL is left in place.
    """
    # Raw string so that \( and \) are regex escapes, not invalid string escapes.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.sub(url_regex, '', text)
# Example:
text = "Beautiful color combination of pink, orange, yellow & white. A Coll http://t.co/H0dYEBvnZB"
print("Example")
print("before: ", text)
print("after : ", remove_urls(text))

#df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_urls)
#df.head(5) # tips: uncomment this line to visualize result

In [None]:
# Remove mentions
import re
def remove_mentions(text):
    """Return ``text`` with every ``@handle`` deleted.

    A handle is ``@`` followed by word characters or hyphens. Punctuation
    after the handle (such as the ``:`` in ``@user:``) is kept.
    """
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    mention_regex = r'@[\w\-]+'
    return re.sub(mention_regex, '', text)
# Example:
text = "RT @C_G_Anderson: @viva_based she look like a tranny"
print("Example")
print("before: ", text)
print("after : ", remove_mentions(text))

#df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_mentions)
#df.head(5) # tips: uncomment this line to visualize result

In [None]:
# Remove 'RT'
import re
def remove_rt(text):
    """Return ``text`` with the standalone retweet marker ``RT`` deleted.

    Only the upper-case token ``RT`` is removed. The word boundaries keep
    words that merely contain those letters (``SMART``, ``ARTIST``) intact;
    a plain ``'RT'`` pattern would cut them apart.
    """
    return re.sub(r'\bRT\b', '', text)

df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_rt)
#df.head(5) # tips: uncomment this line to visualize result

#### The following function [convert_abbreviations_to_words] takes a very long time!
##### Please use the pickle cell (at the end) to save your preprocessed data

In [None]:
# Convert abbreviations to formal words
import csv
import functools
import re


@functools.lru_cache(maxsize=None)
def _load_slang_dict(dict_file_name):
    """Parse the ``ABBREVIATION=meaning`` file once and return it as a dict.

    The result is cached per file name, so edits made to the file after the
    first call are not picked up until the cache is cleared.
    """
    slang = {}
    with open(dict_file_name, "r") as dict_file:
        # Abbreviations are stored in row[0] and meanings in row[1].
        for row in csv.reader(dict_file, delimiter="="):
            if len(row) >= 2:  # skip blank or malformed lines
                slang[row[0]] = row[1]  # the last duplicate entry wins
    return slang


def convert_abbreviations_to_words(text, dict_file_name="slang_dict.txt"):
    """Replace slang abbreviations in ``text`` with their meanings.

    ``text`` is split on single spaces. Each token is stripped of characters
    other than letters, digits, ``-``, ``_`` and ``.``, upper-cased and looked
    up in the slang dictionary. A token that matches is replaced entirely by
    the meaning (attached punctuation is dropped); other tokens are left
    untouched.

    Args:
        text: The tweet text.
        dict_file_name: Path of the ``ABBREVIATION=meaning`` file.

    Returns:
        The text with known abbreviations expanded.
    """
    slang = _load_slang_dict(dict_file_name)
    words = text.split(" ")
    for position, word in enumerate(words):
        # Remove special characters before the lookup.
        key = re.sub('[^a-zA-Z0-9-_.]', '', word).upper()
        if key in slang:
            words[position] = slang[key]
    return ' '.join(words)
# Example:
text = "idk afk idunno"
print("Example")
print("before: ", text)
print("after : ", convert_abbreviations_to_words(text))

df['cleaned_tweet'] = df['cleaned_tweet'].apply(convert_abbreviations_to_words)
#df.head(5) # tips: uncomment this line to visualize result

In [1]:
# Convert informal contraction to formal writing 
# e.g. isn't -> is not, mayn't -> may not, she'd -> she would, etc.
import contractions # library for deal with informal contractions, installation: pip install contractions
def convert_contraction(text):
    """Expand contractions in ``text`` using ``contractions.fix``.

    Example from this notebook: "isn't mayn't she'd" -> "is not may not she would".
    """
    expanded = contractions.fix(text)
    return expanded
# Example:
text = "isn't mayn't she'd yall asap!"
print("Example")
print("before: ", text)
print("after : ", convert_contraction(text))

#df['cleaned_tweet'] = df['cleaned_tweet'].apply(convert_contraction)
#df.head(5) # tips: uncomment this line to visualize result

Example
before:  isn't mayn't she'd yall asap!
after :  is not may not she would you all as soon as possible!


### Please use one of the following functions:
#### (1) convert emoticons to words
#### (2) remove emoticons

In [None]:
# Convert emoticons to words
import html
import emot # library to extract emojis and emoticons, installation: pip install emot
import re
def convert_emoticons_to_words(text):
    """Replace each emoticon in ``text`` with the first meaning emot reports.

    HTML character references are decoded first. Every emoticon found by
    ``emot.emoticons`` is replaced by its meaning, padded with spaces, and
    runs of whitespace are collapsed after each replacement. Text without
    emoticons is returned with only the HTML decoding applied.

    NOTE(review): this relies on the module-level ``emot.emoticons`` call
    returning a mapping with 'value' and 'mean' lists -- confirm against the
    installed emot version.
    """
    text = html.unescape(text)  # convert text to unicode format
    # Some emoticons have multiple meanings; keep the first one by cutting at
    # " or ..." or ", ...". The earlier pattern used the character class
    # [or], which matches a single 'o' OR 'r', so it also cut meanings at any
    # word starting with those letters (e.g. "... sticking out").
    remove_synonym_regex = r'\sor\s[ \w-]+|,\s\w.+'
    emoticons = emot.emoticons(text)  # find all emoticons with the emot library
    if len(emoticons) > 0 and "value" in emoticons and "mean" in emoticons:  # not empty
        for value, meaning in zip(emoticons['value'], emoticons['mean']):
            first_meaning = re.sub(remove_synonym_regex, ' ', meaning)
            text = text.replace(value, " " + first_meaning + " ")
            text = re.sub(r'\s+', ' ', text)  # remove additional spaces
    return text
# Example:
text = "I love python :-):-(:/ yaya ho.. cute avi &#128553;"
text = html.unescape(text) # convert text to unicode format
print("Example")
print("before: ", text)
print("after : ", convert_emoticons_to_words(text))

df['cleaned_tweet'] = df['cleaned_tweet'].apply(convert_emoticons_to_words)
#df.head(5) # tips: uncomment this line to visualize result

In [None]:
# Remove emoticons
import html
import emot # library to extract emojis and emoticons, installation: pip install emot
import re
def remove_emoticons(text):
    """Replace each emoticon found by emot in ``text`` with a space.

    HTML character references are decoded first. Runs of whitespace are
    collapsed after each replacement. Text without emoticons is returned
    with only the HTML decoding applied.

    NOTE(review): this relies on the module-level ``emot.emoticons`` call
    returning a mapping with a 'value' list -- confirm against the installed
    emot version.
    """
    text = html.unescape(text)  # convert text to unicode format
    emoticons = emot.emoticons(text)  # find all emoticons with the emot library
    if len(emoticons) > 0 and "value" in emoticons:  # not empty
        for value in emoticons['value']:
            text = text.replace(value, " ")
            text = re.sub(r'\s+', ' ', text)  # remove additional spaces
    return text
# Example:
text = "I love python :-):-(:/ yaya ho.. cute avi &#128553;"
text = html.unescape(text) # convert text to unicode format
print("Example")
print("before: ", text)
print("after : ", remove_emoticons(text))

df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_emoticons)
#df.head(5) # tips: uncomment this line to visualize result

### Please use one of the following functions:
#### (1) convert emojis to words
#### (2) remove emojis

In [None]:
# Convert emojis to words
import html
import emoji # library to convert emoji to text
def convert_emojis_to_words(text):
    """Replace emojis in ``text`` with the names ``emoji.demojize`` produces.

    HTML character references are decoded first. The result is re-joined on
    single spaces, so whitespace runs are collapsed and the ends are stripped.
    """
    unescaped = html.unescape(text)
    with_names = emoji.demojize(unescaped)
    tokens = with_names.split()
    return ' '.join(tokens)
# Example :
text = "I love python :-):-(:/ yaya ho.. cute avi tho &#128553;"
text = html.unescape(text) # convert text to unicode format
print("Example")
print("before: ", text)
print("after : ", convert_emojis_to_words(text))

df['cleaned_tweet'] = df['cleaned_tweet'].apply(convert_emojis_to_words)
#df.head(5) # tips: uncomment this line to visualize result

In [None]:
# Remove emojis
import html
import emoji
def remove_emojis(text):
    """Return ``text`` with HTML references decoded and all emojis deleted.

    ``emoji.get_emoji_regexp`` no longer exists in emoji 2.x, so
    ``emoji.replace_emoji`` is used when the installed package provides it.
    The regexp-based call is kept as a fallback for older releases.
    """
    text = html.unescape(text)
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub(u'', text)
# Example :
text = "I love python :-):-(:/ yaya ho.. cute avi tho RT @ViVaLa_Ari I had no idea she was sleep &#128553;"
text = html.unescape(text) # convert text to unicode format
print("Example")
print("before: ", text)
print("after : ", remove_emojis(text))

df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_emojis)
#df.head(5) # tips: uncomment this line to visualize result

### Please use one of the following functions:
#### (1) convert hashtags to words
#### (2) remove hashtags

In [2]:
# Convert hashtags to words
import re
import enchant
import wordninja # library to split text into list of words, installation: pip install wordninja

# NOTE(review): the usual locale tag for British English is 'en_GB' -- confirm
# that 'en_UK' resolves to a dictionary on the target machine.
dict_UK = enchant.Dict('en_UK')
dict_US = enchant.Dict('en_US')


def convert_hashtags_to_words(text):
    """Turn every hashtag in ``text`` into plain words.

    A hashtag whose body is a word in the US or UK dictionary just loses its
    ``#``. Any other hashtag is split into words with ``wordninja.split`` and
    joined with spaces.

    The substitution is a single pass over the text with a callback, so each
    hashtag is rewritten exactly where it occurs. Using the tag itself as a
    search pattern would also rewrite the start of longer hashtags that share
    the prefix (``#love`` inside ``#lovely``).
    """
    hashtag_regex = r'(?!&)#(?!\d\d)[\w-]+'

    def _expand(match):
        # Replacement text for one hashtag occurrence.
        tag = match.group(0)
        cleantag = tag[1:]
        if dict_US.check(cleantag) or dict_UK.check(cleantag):
            return cleantag
        return " ".join(wordninja.split(tag))

    return re.sub(hashtag_regex, _expand, text)
# Example:
text = "#thisisthelongesteverhashtagiwouldimagineseeingthatitcanbeaslongasiwant"
print("Example")
print("before: ", text)
print("after : ", convert_hashtags_to_words(text))

#df['cleaned_tweet'] = df['cleaned_tweet'].apply(convert_hashtags_to_words)
#df.head(5) # tips: uncomment this line to visualize result

Example
before:  #thisisthelongesteverhashtagiwouldimagineseeingthatitcanbeaslongasiwant
after :  this is the longest ever hash tagi would imagine seeing that it can be as long as i want


In [None]:
# Remove hashtags
import re
def remove_hashtags(text):
    """Replace every hashtag in ``text`` with a space and collapse whitespace.

    A ``#`` immediately followed by two digits (e.g. ``#12``) is not treated
    as a hashtag and is left in place.
    """
    # Raw strings: '\d' and '\s' in normal literals are invalid escape sequences.
    hashtag_regex = r'(?!&)#(?!\d\d)[\w-]+'
    text = re.sub(hashtag_regex, ' ', text)
    return re.sub(r'\s+', ' ', text)  # remove additional spaces
# Example :
text = "kk #lol #whatyoudo;"
print("Example")
print("before: ", text)
print("after : ", remove_hashtags(text))

df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_hashtags)
#df.head(5) # tips: uncomment this line to visualize result

### Remove punctuations and lowercase letters

In [None]:
# Remove punctuations ". , ! ? : ; - ="
def remove_punctuations(text):
    """Replace each of ``. , ! ? : ; - =`` in ``text`` with a space.

    Other symbols (quotes, ``#``, ``@``, ...) are left untouched, and the
    inserted spaces are not collapsed.
    """
    # Inside a character class only '-' needs escaping; a raw string avoids
    # the invalid string escapes of the previous literal.
    punc_regex = r'[.,!?:;\-=]'
    return re.sub(punc_regex, ' ', text)

df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_punctuations)
#df.head(5) # tips: uncomment this line to visualize result

In [None]:
# Lowercase letters
df['cleaned_tweet'] = df['cleaned_tweet'].str.lower()
#df.head(5) # tips: uncomment this line to visualize result

## Stopword removal can be applied before the spell checker; it reduces the dimensionality of the vocabulary
### For benchmarking, please apply this function at the end of preprocessing, because the spell checker might correct misspelled words into stopwords

In [3]:
# Remove stopwords
from nltk.corpus import stopwords
def remove_stopwords(text):
    """Drop English stopwords and one-character tokens from ``text``.

    ``text`` is split on whitespace and re-joined with single spaces. The
    comparison is case-sensitive against NLTK's English stopword list.
    """
    # Words that might indicate a certain sentiment are kept via a whitelist.
    whitelist = {"n't", "not", "no"}
    # A set gives constant-time membership tests instead of scanning the
    # stopword list once per word.
    removable = set(stopwords.words('english')) - whitelist
    return " ".join(
        word for word in text.split()
        if len(word) > 1 and word not in removable
    )
# Example:
text = "I wanna be a good student in my class"
print("Example")
print("before: ", text)
print("after : ", remove_stopwords(text))

#df['cleaned_tweet'] = df['cleaned_tweet'].apply(remove_stopwords)
#df.head(5) # tips: uncomment this line to visualize result

Example
before:  I wanna be a good student in my class
after :  wanna good student class


### Please use one of the following functions:
#### (1) spellchecker with pyspellchecker
#### (2) spellchecker with TextBlob
#### (3) spellchecker with autocorrect

In [None]:
# SpellChecker 1: pyspellchecker
from spellchecker import SpellChecker # installation: pip install pyspellchecker
# Shared checker, created on first use so it is not rebuilt for every tweet.
_PYSPELLCHECKER = None


def spellChecker_pyspellchecker(text):
    """Return ``text`` with each word replaced by pyspellchecker's correction.

    The text is tokenised with ``SpellChecker.split_words`` and the corrected
    words are joined with single spaces. When ``correction`` yields nothing
    for a word (recent pyspellchecker releases can return ``None``), the
    original word is kept so the join cannot fail.
    """
    global _PYSPELLCHECKER
    if _PYSPELLCHECKER is None:
        _PYSPELLCHECKER = SpellChecker()
    spell = _PYSPELLCHECKER
    words = spell.split_words(text)
    return ' '.join(spell.correction(word) or word for word in words)
# Example:
text = "I havv goood speling int aanglish lol idunno cmputr spellechecke"
print("Example")
print("before: ", text)
print("after : ", spellChecker_pyspellchecker(text))

#df['cleaned_tweet'] = df['cleaned_tweet'].apply(spellChecker_pyspellchecker)
#df.head(5) # tips: uncomment this line to visualize result

In [None]:
# SpellChecker 2: TextBlob
# for more information: https://textblob.readthedocs.io/en/dev/quickstart.html
from textblob import TextBlob # installation: pip install textblob
def spellChecker_TextBlob(text):
    """Return the spelling-corrected version of ``text`` as a plain ``str``.

    ``TextBlob.correct()`` yields a TextBlob object, so the result is
    converted back to ``str``. This keeps the ``cleaned_tweet`` column as
    text for the string-based steps that follow.
    """
    corrected = TextBlob(text).correct()
    return str(corrected)
# Example:
text = "I havv goood speling int aanglish lol idunno cmputr spellechecke"
print("Example")
print("before: ", text)
print("after : ", spellChecker_TextBlob(text))

df['cleaned_tweet'] = df['cleaned_tweet'].apply(spellChecker_TextBlob)
#df.head(5) # tips: uncomment this line to visualize result

In [None]:
# SpellChecker 3: autocorrect
from autocorrect import Speller # installation: pip install autocorrect
# Shared English speller, created on first use so it is not rebuilt for every tweet.
_AUTOCORRECT_SPELLER = None


def spellChecker_autocorrect(text):
    """Return ``text`` corrected by autocorrect's English ``Speller``."""
    global _AUTOCORRECT_SPELLER
    if _AUTOCORRECT_SPELLER is None:
        _AUTOCORRECT_SPELLER = Speller(lang='en')
    return _AUTOCORRECT_SPELLER(text)
# Example:
text = "I havv goood speling int aanglish lol idunno cmputr spellechecke"
print("Example")
print("before: ", text)
print("after : ", spellChecker_autocorrect(text))

df['cleaned_tweet'] = df['cleaned_tweet'].apply(spellChecker_autocorrect)
#df.head(5) # tips: uncomment this line to visualize result

### Please use one of the following functions:
#### (1) stemmer with Porter : basic
#### (2) stemmer with Snowball (Porter2) : recommended
#### (3) stemmer with Lancaster : aggressive

In [None]:
# Stemmer 1: Porter
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
def stemmer_porter(text):
    """Stem every whitespace-separated token of ``text`` with the Porter stemmer.

    The stems are re-joined with single spaces.
    """
    stemmer = PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)
# Example:
text = "Such an analysis can reveal features that are not easily visible from the variations in the individual genes."
print("Example")
print("before: ", text)
print("after : ", stemmer_porter(text))

#df['cleaned_tweet'] = df['cleaned_tweet'].apply(stemmer_porter)
#df.head(5) # tips: uncomment this line to visualize result

In [None]:
# Stemmer 2: Snowball (Porter2)
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
def stemmer_snowball(text):
    """Stem every whitespace-separated token of ``text`` with the English Snowball stemmer.

    The stemmer is created with ``ignore_stopwords=True``. The stems are
    re-joined with single spaces.
    """
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)
# Example:
text = "Such an analysis can reveal features that are not easily visible from the variations in the individual genes."
print("Example")
print("before: ", text)
print("after : ", stemmer_snowball(text))

#df['cleaned_tweet'] = df['cleaned_tweet'].apply(stemmer_snowball)
#df.head(5) # tips: uncomment this line to visualize result

In [None]:
# Stemmer 3: Lancaster
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
def stemmer_lancaster(text):
    """Stem every whitespace-separated token of ``text`` with the Lancaster stemmer.

    The stems are re-joined with single spaces.
    """
    stemmer = LancasterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)
# Example:
text = "Such an analysis can reveal features that are not easily visible from the variations in the individual genes."
print("Example")
print("before: ", text)
print("after : ", stemmer_lancaster(text))

#df['cleaned_tweet'] = df['cleaned_tweet'].apply(stemmer_lancaster)
#df.head(5) # tips: uncomment this line to visualize result

### Save cleaned dataframe into pickle file

In [None]:
import datetime
# Build a timestamped file name (./YYYYMMDD-HHMMSS_cleanedDataset_.pkl) and
# write the whole dataframe to it.
now = datetime.datetime.now()
currentTime = now.strftime("%Y%m%d-%H%M%S")
pickleName = f"./{currentTime}_cleanedDataset_.pkl"
df.to_pickle(pickleName)

### Read saved pickle file

In [None]:
import pandas as pd
unpickled_df = pd.read_pickle(pickleName)

### Divide df into train and test

In [None]:
# Divide the dataset into train (80%) and test (20%)
# A fixed seed makes the 80/20 split identical on every run, so results
# computed on df_train / df_test stay comparable between sessions.
df_train = df.sample(frac=0.8, random_state=42)
# Everything not sampled into the training set becomes the test set.
# NOTE(review): this assumes the index values of df are unique -- confirm.
df_test = df.drop(df_train.index)
print("Shape of training and testing dataset: ", df_train.shape, df_test.shape)

In [None]:
# Visualize META FEATURES
import seaborn as sns
import matplotlib.pyplot as plt
# Meta-feature columns to plot, one row of axes per feature.
METAFEATURES = ['word_count', 'unique_word_count', 'stop_word_count', 
                'mean_word_length', 'char_count', 'punctuation_count']

# Boolean masks selecting the rows of df_train for each class value.
# NOTE(review): class 2 is treated as hate speech and 0 as neither here;
# confirm this matches the label coding of the CSV.
HATESPEECH_TWEETS = df_train['class'] == 2
OFFENSIVE_TWEETS = df_train['class'] == 1
NEITHER_TWEETS = df_train['class'] == 0

fig, axes = plt.subplots(ncols=2, nrows=len(METAFEATURES), figsize=(12, 18), dpi=100)

# NOTE(review): sns.distplot is deprecated in recent seaborn releases --
# confirm the installed version still provides it.
for i, feature in enumerate(METAFEATURES):
    # Left column: per-class distributions of the feature within df_train.
    sns.distplot(df_train.loc[HATESPEECH_TWEETS][feature], label='HateSpeech', ax=axes[i][0], color='red')
    sns.distplot(df_train.loc[OFFENSIVE_TWEETS][feature], label='Offensive', ax=axes[i][0], color='yellow')
    sns.distplot(df_train.loc[NEITHER_TWEETS][feature], label='Neither', ax=axes[i][0], color='green')

    # Right column: the feature's distribution in df_train versus df_test.
    sns.distplot(df_train[feature], label='df_train', ax=axes[i][1])
    sns.distplot(df_test[feature], label='df_test', ax=axes[i][1])
    
    # Same cosmetic settings for both axes of this row.
    for j in range(2):
        axes[i][j].set_xlabel('')
        axes[i][j].tick_params(axis='x', labelsize=6)
        axes[i][j].tick_params(axis='y', labelsize=6)
        axes[i][j].legend()
    
    axes[i][0].set_title(f'[{feature}] Distribution in df_train', fontsize=9)
    axes[i][1].set_title(f'[{feature}] Distribution in df_train & df_test', fontsize=9)

plt.show()

In [None]:
# Visualize
# Two side-by-side axes: a pie chart (left) and a count plot (right) of df_train['class'].
fig, axes = plt.subplots(ncols=2, figsize=(8, 3), dpi=100)
plt.tight_layout()

# Boolean masks per class; they are redefined here but not used in this cell.
# NOTE(review): the masks and the label order below treat class 0 as "Neither"
# and class 2 as "HateSpeech" -- confirm this matches the label coding of the CSV.
HATESPEECH_TWEETS = df_train['class'] == 2
OFFENSIVE_TWEETS = df_train['class'] == 1
NEITHER_TWEETS = df_train['class'] == 0

# Rows per class are taken from the non-null count of the 'count' column;
# the labels are applied positionally to the sorted class values.
df_train.groupby('class').count()['count'].plot(kind='pie', ax=axes[0], labels=['Neither', 'Offensive', 'HateSpeech'], subplots=True)
sns.countplot(x=df_train['class'], hue=df_train['class'], ax=axes[1])

axes[0].set_ylabel('')
axes[1].set_ylabel('')
# Tick labels are assigned by position, in the same order as the pie labels.
axes[1].set_xticklabels(['Neither', 'Offensive', 'HateSpeech'])
axes[0].tick_params(axis='x', labelsize=6)
axes[0].tick_params(axis='y', labelsize=6)
axes[1].tick_params(axis='x', labelsize=6)
axes[1].tick_params(axis='y', labelsize=6)

axes[0].set_title('Target Distribution in Training Set', fontsize=9)
axes[1].set_title('Target Count in Training Set', fontsize=9)

plt.show()