# M2.2 Hate Speech


## Pre processing

### Fetch Data

In [0]:
!wget -q https://transfer.sh/Zgwhy/twitter_hate_speech.csv -O hate

### Load and install packages

In [0]:
# Filtering annmoying warning spam
from warnings import simplefilter
simplefilter(action='ignore', category=(FutureWarning,DeprecationWarning,UserWarning))

In [0]:
# The usual suspects
import pandas as pd
import numpy as np

In [0]:
# Tools
import re
import multiprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, make_pipeline
from xgboost import XGBClassifier

In [0]:
# Natural language packages, part 1
!pip -q install eli5
import eli5
from eli5.lime import TextExplainer

In [234]:
# Natural language packages, part 2
import nltk as n
from nltk.tokenize import word_tokenize, sent_tokenize
n.download('punkt')
n.download('stopwords')
import gensim
import gensim.downloader as gapi
from gensim.corpora.dictionary import Dictionary
from gensim.models import FastText, LdaMulticore, Word2Vec
import spacy

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Visualization packages
!pip -q install pyLDAvis
import pyLDAvis.gensim

## Opgave 1

### Inspect data

In [236]:
hs = pd.read_csv('hate')
hs.info()
hs.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 3 columns):
Unnamed: 0    24783 non-null int64
class         24783 non-null int64
tweet         24783 non-null object
dtypes: int64(2), object(1)
memory usage: 580.9+ KB


Unnamed: 0.1,Unnamed: 0,class,tweet
0,0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


### Process data

Data will be vectorized, split into test and train sets at a arbitrary 2:1 ratio.

In [0]:
# Vectorization and truncation
vectorizer = TfidfVectorizer()
classifier = LogisticRegression()
model = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', classifier)
])
model.fit(hs['tweet'], hs['class'])

vec = TfidfVectorizer()
svd = TruncatedSVD(n_components=100, n_iter=10, random_state=69)
lsa = make_pipeline(vec, svd)

In [0]:
# Splitting data into training and testing data at 2:1 ratio.
a = 2 * int(len(hs)/3)
train = hs.iloc[:a]
test = hs.iloc[a:]

In [239]:
# Inspecting the split
print('== Train data ==')
train.info()
print('===================================')
print('== Test data ==')
test.info()

== Train data ==
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16522 entries, 0 to 16521
Data columns (total 3 columns):
Unnamed: 0    16522 non-null int64
class         16522 non-null int64
tweet         16522 non-null object
dtypes: int64(2), object(1)
memory usage: 387.3+ KB
== Test data ==
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8261 entries, 16522 to 24782
Data columns (total 3 columns):
Unnamed: 0    8261 non-null int64
class         8261 non-null int64
tweet         8261 non-null object
dtypes: int64(2), object(1)
memory usage: 193.7+ KB


### Tokenization

In [0]:
# Create list of review from datasets for further processing
texts = list(train.tweet)
texts.extend(test.tweet) 

# Split reviews into sentences
sentences = []
for text in texts:
  sentences.extend(sent_tokenize(text))
  
# Create word tokens
tokenized_texts = [word_tokenize(text) for text in sentences]
tokenized_texts = list(map(lambda x: [y.lower() for y in x], tokenized_texts))

In [0]:
# Word2Vec model
model = Word2Vec(tokenized_texts, 
                 size=250,     # embedding vector size
                 min_count=7,  # consider words that occured at least 7 times
                 window=7,     # define context as a 7-word window around the target word
                 max_final_vocab = 3000).wv 

In [0]:
# All the tokenized words
words = sorted(model.vocab.keys(), 
               key=lambda word: model.vocab[word].count,
               reverse=True)[:1000]

In [0]:
# Return word-vectors
word_vectors = model.vectors[[model.vocab[word].index for word in words]]

In [244]:
# Average-vector-representations
word_vectors

array([[ 4.5234323e-02,  6.4165547e-02,  3.8024735e-01, ...,
        -4.9930014e-02, -5.8162298e-02, -4.7597063e-01],
       [-9.1445267e-01, -1.2779546e-01,  6.9595933e-01, ...,
        -4.4969469e-01, -4.9344185e-01,  4.0883893e-01],
       [-1.3795123e+00, -1.0621551e-01,  4.3548852e-01, ...,
        -4.3454298e-01, -7.0303893e-01,  2.9149324e-01],
       ...,
       [-1.2874030e-01,  9.4156861e-02,  1.0581856e-01, ...,
        -2.7854444e-02, -1.8184440e-01,  9.3350738e-02],
       [ 1.5211017e-01,  9.3348704e-02, -1.7831408e-04, ...,
        -3.1144835e-02, -1.7322108e-01, -2.2788556e-02],
       [ 1.3812295e-01,  1.4111911e-01, -4.2907625e-02, ...,
        -4.8778083e-02, -2.4063264e-01,  8.2185520e-03]], dtype=float32)

## Opgave 2

Count based operations

In [245]:
pd.options.mode.chained_assignment = None  # Muting panda warnings from chained operations
train['hate'] = train['class'] == 0
train['offensive'] = train['class'] == 1
train['neither'] = train['class'] == 2
train.head()

Unnamed: 0.1,Unnamed: 0,class,tweet,hate,offensive,neither
0,0,2,!!! RT @mayasolovely: As a woman you shouldn't...,False,False,True
1,1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,False,True,False
2,2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,False,True,False
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,False,True,False
4,4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,False,True,False


In [246]:
test['hate'] = test['class'] == 0
test['offensive'] = test['class'] == 1
test['neither'] = test['class'] == 2
test.head()

Unnamed: 0.1,Unnamed: 0,class,tweet,hate,offensive,neither
16522,16522,0,RT @MrHoratioSanz: Vin Scully once called me a...,True,False,False
16523,16523,1,RT @MrLuckyToYou: Eat that pussy from the back...,False,True,False
16524,16524,1,"RT @MrLuckyToYou: If that pussy good, umma moa...",False,True,False
16525,16525,1,"RT @MrNBAallday: Kobe calls Dwight Howard a ""b...",False,True,False
16526,16526,1,RT @MrNationWide: fake eyelashes are okay if t...,False,True,False


In [247]:
print('====================')
print('=== Training set ===')
print(train.groupby('hate').count())
print(train.groupby('offensive').count())
print('====================')
print('=== Test set =======')
print(train.groupby('hate').count())
print(train.groupby('offensive').count())

=== Training set ===
       Unnamed: 0  class  tweet  offensive  neither
hate                                               
False       15453  15453  15453      15453    15453
True         1069   1069   1069       1069     1069
           Unnamed: 0  class  tweet   hate  neither
offensive                                          
False            3859   3859   3859   3859     3859
True            12663  12663  12663  12663    12663
       Unnamed: 0  class  tweet  offensive  neither
hate                                               
False       15453  15453  15453      15453    15453
True         1069   1069   1069       1069     1069
           Unnamed: 0  class  tweet   hate  neither
offensive                                          
False            3859   3859   3859   3859     3859
True            12663  12663  12663  12663    12663


Lets have a look at tweets that contain 'trump' or 'Trump' and their relation to hate or offensive

In [248]:
train['Trump'] = train.tweet.str.contains(('Trump') or ('trump'), na = False)
train[train['Trump'] == True].head()

print('====================')
print('=== Trump Chart ====')
print(train.groupby('hate').Trump.value_counts())
print(train.groupby('offensive').Trump.value_counts())

=== Trump Chart ====
hate   Trump
False  False    15449
       True         4
True   False     1068
       True         1
Name: Trump, dtype: int64
offensive  Trump
False      False     3858
           True         1
True       False    12659
           True         4
Name: Trump, dtype: int64


'Trump' is not particularly related to hate tweets, but rather overrepresented in 'offensive' tweets. Regardless of your political position, this is rather hillarious, either because it implies 'trump' is offensive, or twitter is using 'interesting' algorithms.

Next, let's prepare data for visualization

In [0]:
# Fire up spacy
nlp = spacy.load("en")

In [0]:
# Pipe through spacy, to get tokens
tokens = []
for tweet in nlp.pipe(train['tweet']):
  proj_tok = [token.lemma_.lower() for token in tweet if token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV'] and not token.is_stop] 
  tokens.append(proj_tok)

In [0]:
# Train and filter the tokens
train['tokens'] = tokens
dictionary = Dictionary(train['tokens'])
dictionary.filter_extremes(no_below=4, no_above=0.6, keep_n=200)
corpus = [dictionary.doc2bow(doc) for doc in train['tokens']]

Alright, let's model the data and visualize

In [0]:
# Model construction
%matplotlib inline
pyLDAvis.enable_notebook()
lda_model = LdaMulticore(corpus, id2word=dictionary,  num_topics=10, workers = 2, passes=10)
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics=False)

In [253]:
# Hold on to your butts
pyLDAvis.display(lda_display)

The plot displayed is interactive, so readers viewing the HTML are invited to check out the colab.

From the interactive plot, themes that are unique to topics can be identified by adjusting lambda.

## Opgave 3

Classification model

In [0]:
# Create model
model_gensim_glove = gapi.load('glove-wiki-gigaword-300')
model = model_gensim_glove

In [0]:
# Turn input text into average vector
# Function heavily inspired by Stackoverflow
def get_phrase_embedding(model, phrase):    
    vector = np.zeros([model.vector_size], dtype='float32')
    if type(phrase) == str:
      phrase = list(map(lambda x: x.lower(), word_tokenize(phrase)))
    vecs = [model.get_vector(tok) for tok in phrase if tok in model.wv.vocab]
    if len(vecs) == 0:
      return vector
    else:
      vector = sum(vecs)/len(vecs)
      return vector

In [0]:
# Preb work
b = multiprocessing.Pool()
train_tok = b.map(word_tokenize, train.tweet)
test_tok = b.map(word_tokenize, test.tweet)

In [257]:
# Vectorizing train and test.
text_vectors_train = np.array([get_phrase_embedding(model, phrase) for phrase in train_tok])
text_vectors_test = np.array([get_phrase_embedding(model, phrase) for phrase in test_tok])
print(text_vectors_train.shape)

(16522, 300)


In [258]:
clf1 = LogisticRegression()
clf1.fit(text_vectors_train,train['class'])
print()




In [259]:
clf2 = LogisticRegression()
clf2.fit(text_vectors_train,train['hate'])
print()




In [260]:
clf3 = LogisticRegression()
clf3.fit(text_vectors_train,train['offensive'])
print()




In [261]:
clf4 = LogisticRegression()
clf4.fit(text_vectors_train,train['neither'])
print()




In [262]:
print('================================')
print('=== General precision:')
print(clf1.score(text_vectors_test,test['class']))
print('================================')
print('=== Hate speech precision:')
print(clf2.score(text_vectors_test,test['hate']))
print('================================')
print('=== Offensive speech precision:')
print(clf3.score(text_vectors_test,test['offensive']))
print('================================')
print('=== Neither-class precision:')
print(clf4.score(text_vectors_test,test['neither']))


=== General precision:
0.8807650405519912
=== Hate speech precision:
0.9567848928701126
=== Offensive speech precision:
0.8850018157608037
=== Neither-class precision:
0.9199854739135698
