<a href="https://colab.research.google.com/github/aaubs/ds-master/blob/main/notebooks/M4_TFIDF_W2V_multiclass_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hate Speech Detection

<img src="https://blog.colinbreck.com/content/images/size/w2000/2021/01/twitter-a-love-hate-relationship-1.png" width="500">


# Importing Libraries

In [1]:
# !pip install spacy
!python -m spacy download en_core_web_md -q

2023-02-15 09:09:57.888133: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-15 09:10:00.393175: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-15 09:10:00.393356: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-15 09:10:03.882110: E tensorfl

In [2]:
import pandas as pd
import numpy as np

import spacy
import re

#First download SpaCy's en_core_web_md model then load 
#!python -m spacy download en_core_web_md
import en_core_web_md
nlp = en_core_web_md.load() 

import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English


import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier,XGBRFClassifier

%matplotlib inline
import warnings
warnings.filterwarnings(action = 'ignore')



Data Description:
==============

* count = number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were     determined to be unreliable by CF).

* hate_speech = number of CF users who judged the tweet to be hate speech.

* offensive_language = number of CF users who judged the tweet to be offensive.

* neither = number of CF users who judged the tweet to be neither offensive nor non-offensive.

* class = class label for majority of CF users.

   0 - hate speech
   1 - offensive language 
   2 - neither

* tweet = text data(tweet)

In [7]:
dataset = pd.read_csv("https://raw.githubusercontent.com/aaubs/ds-master/main/data/HateSpeechData.csv")
dataset.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


# Preprocessing of the tweets

In [8]:
# Create our list of stopwords
stop_words = spacy.lang.en.stop_words.STOP_WORDS

#extending the stopwords to include other words used in twitter such as retweet(rt) etc.
#retweet, fav, follow friday

nlp.Defaults.stop_words |= {"#ff", "ff", "rt",}

In [9]:
def preprocess(tweet):  
    
    # removal of extra spaces
    regex_pat = re.compile(r'\s+')
    tweet_space = tweet.str.replace(regex_pat, ' ')

    # removal of @name[mention]
    regex_pat = re.compile(r'@[\w\-]+')
    tweet_name = tweet_space.str.replace(regex_pat, '')

    # removal of links[https://abc.com]
    giant_url_regex =  re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
            '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    tweets = tweet_name.str.replace(giant_url_regex, '')
    # removal of punctuations and numbers
    punc_remove = tweets.str.replace("[^a-zA-Z]", " ") 
    # remove whitespace with a single space
    newtweet=punc_remove.str.replace(r'\s+', ' ')                   
    # remove leading and trailing whitespace 
    newtweet=newtweet.str.replace(r'^\s+|\s+?$','')
    # replace normal numbers with number  
    newtweet=newtweet.str.replace(r'\d+(\.\d+)?','numbr')
 
    # tokenizing
    tokenized_tweet = newtweet.apply(lambda x: x.split()) 
    
 
    for i in range(len(tokenized_tweet)):
        tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
        tweets_p= tokenized_tweet
    
    return tweets_p
   
processed_tweets = preprocess(dataset['tweet'])

In [10]:
# Creating our tokenizer function
def spacy_tokenizer(processed_tweets):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(processed_tweets)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words ]

    # return preprocessed tokens
    return ' '.join(mytokens)

In [11]:
%%time
dataset['final_tweet'] = processed_tweets.apply(lambda x: spacy_tokenizer(x))

CPU times: user 3min 21s, sys: 775 ms, total: 3min 22s
Wall time: 3min 28s


In [12]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,final_tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,woman shouldn t complain clean house amp man t...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,boy dats cold tyga dwn bad cuffin dat hoe st p...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,dawg fuck bitch start cry confuse shit
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,look like tranny
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shit hear true faker bitch tell ya


# TF - IDF   Implementation

TF-IDF weight is composed by two terms: the first computes the normalized Term Frequency (TF), aka. the number of times a word appears in a document, divided by the total number of words in that document; the second term is the Inverse Document Frequency (IDF), computed as the logarithm of the number of the documents in the corpus divided by the number of documents where the specific term appears.

TF: Term Frequency, which measures how frequently a term occurs in a document. Since every document is different in length, it is possible that a term would appear much more times in long documents than shorter ones. Thus, the term frequency is often divided by the document length (aka. the total number of terms in the document) as a way of normalization:

* TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

IDF: Inverse Document Frequency, which measures how important a term is. While computing TF, all terms are considered equally important. However it is known that certain terms, such as "is", "of", and "that", may appear a lot of times but have little importance. Thus we need to weigh down the frequent terms while scale up the rare ones, by computing the following:

* IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

In [13]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
final_tweet = dataset['final_tweet']
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),max_features=15000)

# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(final_tweet)
tfidf

<24783x15000 sparse matrix of type '<class 'numpy.float64'>'
	with 189755 stored elements in Compressed Sparse Row format>

In [14]:
X = tfidf # the features we want to analyze
y = dataset['class'] # the labels, or answers, we want to test against

In [15]:
dense_matrix = X.toarray()
dense_matrix.shape

(24783, 15000)

Stratified k-fold cross-validation
==========================
Specifically, we can split a dataset randomly, although in such a way that maintains the same class distribution in each subset. This is called stratification or stratified sampling and the target variable (y), the class, is used to control the sampling process.

For example, we can use a version of k-fold cross-validation that preserves the imbalanced class distribution in each fold. It is called stratified k-fold cross-validation and will enforce the class distribution in each split of the data to match the distribution in the complete training dataset.

It is common, in the case of class imbalances in particular, to use stratified 10-fold cross-validation, which ensures that the proportion of positive to negative examples found in the original distribution is respected in all the folds.

In [16]:
X_train = dense_matrix[:2000]
y_train = dataset['class'][:2000]
X_test = dense_matrix[2000:2200]
y_test = dataset['class'][2000:2200]

# Running the models Using TFIDF

In [17]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor

In [22]:
lr = LogisticRegression()
rf = RandomForestClassifier()
xgb = XGBClassifier()

In [23]:
%%time
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)

CPU times: user 3min 3s, sys: 4.24 s, total: 3min 8s
Wall time: 3min 13s


XGBClassifier(objective='multi:softprob')

In [24]:
%%time
print('Model LogisticRegression' + ' ' + str(lr.score(X_test, y_test)))
print('Model RandomForestClassifier' + ' ' + str(rf.score(X_test, y_test)))
print('Model XGBClassifier' + ' ' + str(xgb.score(X_test, y_test)))

Model OLS 0.815
Model EL 0.855
Model RF 0.925
CPU times: user 151 ms, sys: 113 ms, total: 264 ms
Wall time: 149 ms


# Word2vec Implementation

* Word2vec groups the vector of similar words together in the vector space.
* That is it detects similarities mathematically.
* Given enough data, usage and contexts, word2vec can make highly accurate guesses about a word’s meaning based on past           appearances.
* Those guesses can be used to establish a word’s association with other words eg. “man” is to “boy” what “woman” is to “girl”.
* Each word is represented by a vector and in spaCy each vector has 300 dimensions [300 different properties associated,helpful while similarity matching]

In [25]:
def get_vec(x):
  doc = nlp(x)
  return doc.vector

In [26]:
%%time
# Compute the sentence embeddings
sentence_embeddings = []
for index, row in dataset.iterrows():
  sentence_embeddings.append(get_vec(row['final_tweet']))

# Convert the sentence embeddings to a tensor
sentence_embeddings = np.stack(sentence_embeddings)

CPU times: user 2min 56s, sys: 749 ms, total: 2min 57s
Wall time: 3min 2s


In [29]:
X_train = sentence_embeddings[:2000]
y_train = dataset['class'][:2000]
X_test = sentence_embeddings[2000:2200]
y_test = dataset['class'][2000:2200]

# Running the models Using Word2vec

In [30]:
lr = LogisticRegression()
rf = RandomForestClassifier()
xgb = XGBClassifier()

In [31]:
%%time
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)

CPU times: user 13.2 s, sys: 295 ms, total: 13.5 s
Wall time: 13.7 s


XGBClassifier(objective='multi:softprob')

In [32]:
%%time
print('Model LogisticRegression' + ' ' + str(lr.score(X_test, y_test)))
print('Model RandomForestClassifier' + ' ' + str(rf.score(X_test, y_test)))
print('Model XGBClassifier' + ' ' + str(xgb.score(X_test, y_test)))

Model LogisticRegression 0.78
Model RandomForestClassifier 0.815
Model XGBClassifier 0.82
CPU times: user 24.5 ms, sys: 2.01 ms, total: 26.5 ms
Wall time: 27.2 ms
