# Hate Speech Modelling Notebook

Ashwin U Iyer

19BAI1118

In [1]:
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import gensim

import regex as re

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df=pd.read_csv("../input/cleaned_data.csv")

In [3]:
TOPICS = 100

def strip_newline(series):
    return [review.replace('\n','') for review in series]

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod
    
def get_corpus(df):
    df['body'] = strip_newline(df.body)
    words = list(sent_to_words(df.body))
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

train_corpus, train_id2word, bigram_train = get_corpus(df)

## Making Topic Modelling based features using LDA

In [4]:
lda_train = gensim.models.ldamulticore.LdaMulticore(
                       corpus=train_corpus,
                       num_topics=TOPICS,
                       id2word=train_id2word,
                       chunksize=100,
                       workers=7, # Num. Processing Cores - 1
                       passes=50,
                       eval_every = 1,
                       per_word_topics=True)

lda_train.save('../models/LDA/lda_train.model')

lda_train.print_topics(TOPICS, num_words=15)[:3]


[(0,
  '0.149*"person" + 0.101*"great" + 0.065*"means" + 0.063*"people" + 0.058*"culture" + 0.040*"nt" + 0.039*"like" + 0.038*"mean" + 0.038*"would" + 0.037*"someone" + 0.026*"protest" + 0.024*"assume" + 0.023*"time" + 0.021*"feeling" + 0.020*"ex"'),
 (1,
  '0.146*"tax" + 0.117*"power" + 0.078*"taxes" + 0.076*"pay" + 0.050*"destroy" + 0.049*"religion" + 0.042*"organization" + 0.041*"rate" + 0.039*"ready" + 0.036*"bought" + 0.031*"idea" + 0.024*"nt" + 0.023*"committed" + 0.023*"income" + 0.022*"every_day"'),
 (2,
  '0.117*"free" + 0.114*"live" + 0.074*"happy" + 0.065*"get" + 0.057*"provide" + 0.039*"could" + 0.036*"time" + 0.036*"clear" + 0.034*"value" + 0.031*"degree" + 0.029*"yea" + 0.023*"taste" + 0.022*"eye" + 0.022*"forced" + 0.021*"battle"')]

In [5]:
train_vecs = []
for i in range(len(df)):
    top_topics = (
        lda_train.get_document_topics(train_corpus[i],
                                      minimum_probability=0.0)
    )
    topic_vec = [top_topics[i][1] for i in range(TOPICS)]
    train_vecs.append(topic_vec)

In [7]:
X = np.array(train_vecs)
y = np.array(df.score)

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [13]:
def evaluate(y_true,predictions):
    print("MAE: ",metrics.mean_absolute_error(y_true,predictions))
    print("RMSE: ",np.sqrt(metrics.mean_absolute_error(y_true,predictions)))
    print("R2: ",metrics.r2_score(y_true,predictions))

In [14]:
lr=LinearRegression()
lr.fit(X_train,y_train)
predictions=lr.predict(X_test)
evaluate(y_test,predictions)

MAE:  0.2406378729371738
RMSE:  0.4905485428957809
R2:  0.17116118577909012


## Word Embedding based Features

In [16]:
DIMS = 100

def getGloveCorpus(dims=300):
    # Set path and load corpus
    path = '../input/'
    filename = f'glove.6B.{dims}d.txt'
    f = open(path+filename, 'r', encoding='latin2')
    vec_txt = f.read()

    vec_data = {}
    words = vec_txt.split('\n')
    for word in words:
        vec = word.split()
        if len(vec) == dims+1:
            vec_data[vec[0]] = np.array([np.float16(x) for x in vec[1:]])
    vec = pd.DataFrame(vec_data, columns=None).transpose()
    return vec

VEC = getGloveCorpus(dims=DIMS)
VEC

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
the,-0.038208,-0.244873,0.728027,-0.399658,0.083191,0.043945,-0.391357,0.334473,-0.575684,0.087463,...,0.016220,-0.017105,-0.389893,0.874023,-0.725586,-0.510742,-0.520508,-0.145874,0.827637,0.270508
",",-0.107666,0.110535,0.598145,-0.543457,0.673828,0.106628,0.038879,0.354736,0.063538,-0.094177,...,0.349609,-0.722656,0.375488,0.444092,-0.990723,0.612305,-0.351074,-0.831543,0.452881,0.082581
.,-0.339844,0.209351,0.463379,-0.647949,-0.383789,0.038025,0.171265,0.159790,0.466309,-0.019165,...,-0.063354,-0.674316,-0.068909,0.536133,-0.877930,0.318115,-0.392334,-0.233887,0.472900,-0.028809
of,-0.152954,-0.242798,0.898438,0.169922,0.535156,0.487793,-0.588379,-0.179810,-1.358398,0.425293,...,0.187134,-0.018494,-0.267578,0.727051,-0.593750,-0.348389,-0.561035,-0.590820,1.003906,0.206665
to,-0.189697,0.050018,0.190796,-0.049194,-0.089722,0.210083,-0.549316,0.098389,-0.201294,0.342529,...,-0.131348,0.058624,-0.318604,-0.614258,-0.624023,-0.415527,-0.038177,-0.397949,0.476562,-0.159790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chanty,-0.155762,-0.049194,-0.064392,0.223633,-0.201416,-0.038971,0.129761,-0.294434,0.003590,-0.098389,...,0.093323,0.094482,-0.023468,-0.480957,0.623535,0.024323,-0.275879,0.075073,-0.563965,0.145020
kronik,-0.094421,0.147217,-0.157349,0.071960,-0.298340,0.039429,0.021866,0.008041,-0.186768,-0.311035,...,-0.305420,-0.011086,0.118530,-0.113098,0.339600,-0.224487,0.257324,0.631348,-0.200928,-0.105408
rolonda,0.360840,-0.169189,-0.327148,0.098328,-0.429688,-0.188721,0.455566,0.285400,0.303467,-0.366943,...,-0.044067,0.140015,0.300049,-0.127319,-0.143066,-0.069397,0.281494,0.271484,-0.291992,0.161133
zsombor,-0.104614,-0.504883,-0.493408,0.135132,-0.363770,-0.447510,0.184326,-0.056519,0.404785,-0.725586,...,0.151489,-0.108398,0.340576,-0.409180,-0.081238,0.095337,0.150146,0.425293,-0.512695,-0.170532


In [18]:
def getGloveVec(word, vec, dims=300):
    vc = np.zeros(dims)
    try:
        vc = np.array(vec.loc[word])
    except:
        vc = np.zeros(dims)
    return vc

def getDocVec(sentence, dims, vec):
    tokens = word_tokenize(sentence)
    vecs = np.zeros(dims)
    
    for word in tokens:
        vecs += getGloveVec(word, vec, dims)

    return vecs

def getVecForm(X, Y, dims, vec):
    vecList = []
    
    for i in X:
        vecList.append(getDocVec(i, dims, vec))
    X = np.asarray(vecList).astype(np.float16)
    Y = np.asarray(Y).astype(np.float16)

    return X, Y

X, y = getVecForm(
    X = df['body'],
    Y = df['score'],
    vec = VEC,
    dims=DIMS
)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
lr=LinearRegression()
lr.fit(X_train,y_train)
predictions=lr.predict(X_test)
evaluate(y_test,predictions)

MAE:  0.2227483
RMSE:  0.47196218
R2:  0.28568916763149976


Thus, we can see that while the performance is quite similar when the errors (MAE and RMSE) are considered, there is a substantial increase in R2 score for a Linear Regression Model. Thus, we can make an assertion that word embeddings are a slightly better approach than topic modelling with regards to this particular dataset. However, the field of machine learning works on the 'No Free Lunch' theorem, which states that no model or appraoch is applicable to everything. Thus, for this particular model and data split, word embeddings are a better choice.