# Cyberbullying Auto Detection
### Baseline Model and Neural Networks
*Wenqu Wang, Casey Yoon*

### Import Packages

In [1]:
# NumPy, TensorFlow, os
import numpy as np
import pandas as pd
import tensorflow as tf
import os
from tensorflow import keras
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import model_selection, naive_bayes, svm
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout

from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer

### Data Cleaning

In [2]:
os.listdir()

['graph',
 'twitter_sentiment.ipynb',
 'twitter_racism_parsed_dataset.csv',
 '.git',
 'twitter_parsed_dataset.csv',
 'Untitled.ipynb',
 '.ipynb_checkpoints',
 'graph.png',
 'twitter_sexism_parsed_dataset.csv']

In [3]:
### We aggregate all the data into one dataframe

parsed = pd.read_csv('twitter_parsed_dataset.csv')
racism = pd.read_csv('twitter_racism_parsed_dataset.csv')
sexism = pd.read_csv('twitter_sexism_parsed_dataset.csv')

twitter_data = pd.concat([parsed, racism, sexism]).dropna()
twitter_data.head()

Unnamed: 0,index,id,Text,Annotation,oh_label
0,5.74948705591165e+17,5.74948705591165e+17,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0
1,5.71917888690393e+17,5.71917888690393e+17,@ShreyaBafna3 Now you idiots claim that people...,none,0.0
2,3.90255841338601e+17,3.90255841338601e+17,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0
3,5.68208850655916e+17,5.68208850655916e+17,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0
4,5.75596338802373e+17,5.75596338802373e+17,#mkr No No No No No No,none,0.0


In [4]:
import re

def cleaning_tweets(tweet):
    # 1. Remove Twitter handles (@user)
    users = re.findall("@[\w]*", tweet) # tokenizing
    for user in users:
        tweet = re.sub(user, '', tweet)

    # 2. Remove, Punctuations, Numbers, and Special Characters (keep hashtags)
    tweet = re.sub("[^a-zA-Z#]", " ", tweet)

    # 3. Lowercase all
    tweet = tweet.lower()
    
    # 4. Splitting text into tokens
    tweet = tweet.split()
    
    return tweet


twitter_data['tokenized_tweets'] = twitter_data['Text'].apply(cleaning_tweets)
twitter_data['cleaned_tweets'] = twitter_data['tokenized_tweets'].apply(lambda x: ' '.join(x))
twitter_data['num_tokens'] = twitter_data['tokenized_tweets'].apply(len)
twitter_data.head()

Unnamed: 0,index,id,Text,Annotation,oh_label,tokenized_tweets,cleaned_tweets,num_tokens
0,5.74948705591165e+17,5.74948705591165e+17,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0,"[i, read, them, in, context, no, change, in, m...",i read them in context no change in meaning th...,18
1,5.71917888690393e+17,5.71917888690393e+17,@ShreyaBafna3 Now you idiots claim that people...,none,0.0,"[now, you, idiots, claim, that, people, who, t...",now you idiots claim that people who tried to ...,22
2,3.90255841338601e+17,3.90255841338601e+17,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0,"[rt, call, me, sexist, but, when, i, go, to, a...",rt call me sexist but when i go to an auto pla...,19
3,5.68208850655916e+17,5.68208850655916e+17,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0,"[wrong, isis, follows, the, example, of, moham...",wrong isis follows the example of mohammed and...,11
4,5.75596338802373e+17,5.75596338802373e+17,#mkr No No No No No No,none,0.0,"[#mkr, no, no, no, no, no, no]",#mkr no no no no no no,7


In [5]:
X_train, X_test, y_train, y_test = train_test_split(twitter_data['cleaned_tweets'], twitter_data['oh_label'], test_size=0.20, random_state=42)

X_train.head()

9327     there is such a diff between reality amp what ...
14633    katie s a fatty model hahahaha #mkr #killerblo...
4197     it is really funny all the assumptions they ma...
3534                     origin is a flaming piece of shit
4500     no you don t i thought of a really funny joke ...
Name: cleaned_tweets, dtype: object

In [6]:
y_train.head()

9327     0.0
14633    1.0
4197     0.0
3534     0.0
4500     1.0
Name: oh_label, dtype: float64

In [7]:
len(X_test)

9040

### Tf-idf

In [8]:
# vectorize data using Tf-idf
vec = TfidfVectorizer(max_features=500)
X_vectrain = vec.fit_transform(X_train).toarray()
X_vectest = vec.transform(X_test).toarray()

In [9]:
### Baseline accuracy, predicting all of one class.
1 - np.mean(y_test)

0.7609513274336284

### Baseline Logistic Regression

In [10]:
### Logistic Regression
clf = LogisticRegression(random_state=0).fit(X_vectrain, y_train)
pred = clf.predict(X_vectest)
print("f1_score = ", metrics.f1_score(y_test, pred, average="weighted"))
print("accuracy = ", metrics.accuracy_score(y_test, pred))

f1_score =  0.8367223652500073
accuracy =  0.8485619469026549


### Ti-idf with Neural Network

In [11]:
#Build NN model
model = Sequential()
model.add(Dense(64,input_shape=(500,)))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(64))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                32064     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
activation (Activation)      (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 6

In [12]:
model.fit(X_vectrain,y_train,batch_size=32,epochs=10,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f3cd4a23c10>

In [13]:
pred = model.predict_classes(X_vectest)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [14]:
print("f1_score = ", metrics.f1_score(y_test, pred, average="weighted"))
print("accuracy = ", metrics.accuracy_score(y_test, pred))

f1_score =  0.953318747352476
accuracy =  0.9542035398230089


#### Error Analysis

In [15]:
pred = pred.reshape((-1,))
a = pd.DataFrame(X_test[pred!=y_test])
indices = [i for i in a.index]
misclassified = twitter_data.iloc[indices,:]

In [16]:
indices = [i for i in a.index]

In [17]:
misclassified = twitter_data.iloc[indices,[2,4]]
misclassified.tail(20)

Unnamed: 0,Text,oh_label
1150,#mkr I think I just threw up in my mouth :(,0.0
3021,So dissapointed Kat &amp; Andre weren't elimin...,0.0
1839,RT @swagd0ctor: @VileIslam @TRobinsonNewEra @O...,1.0
7085,RT @DTNIraq: DTN Iraq: Iraq vows Tikrit victor...,0.0
1113,@Alfonso_AraujoG @ardiem1m @MaxBlumenthal @old...,1.0
11503,"@iFalasteen No, this is what Muslim brutality ...",0.0
12598,@0xabad1dea unless you are the help desk perso...,0.0
6336,Bye bye basic bitches 👋 #mkr,1.0
2408,RT @Amoka: Video supposedly showing Shia milit...,1.0
11982,@mary__kaye she's always hideous! #mkr,1.0


# CNN + Word2Vec

In [19]:
### Build Word2Vec
from gensim.models import Word2Vec

w2v = Word2Vec(twitter_data['tokenized_tweets'],
               size = 200,
               window = 5,
               min_count = 2,
               sg = 1,
               hs = 0,
               negative = 10,
               workers = 32,
               seed = 1)

len(w2v.wv.vocab.keys())

19645

In [20]:
embeddings_index = {}
for w in w2v.wv.vocab.keys():
    embeddings_index[w] = w2v.wv[w]
print('Found %s word vectors.' % len(embeddings_index))

Found 19645 word vectors.


In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
len(tokenizer.word_index)

19013

In [43]:
length = []
for x in X_train:
    length.append(len(x.split()))
print("The longest sentence has length:",max(length))

The longest sentence has length: 33


In [39]:
x_train_seq = pad_sequences(sequences, maxlen=40)

In [24]:
sequences_test = tokenizer.texts_to_sequences(X_test)
x_test_seq = pad_sequences(sequences_test, maxlen=40)

In [25]:
# build embedding matrix
num_words = 20000
embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [26]:
# print first 5 sentences
for x in X_train[:5]:
    print(x)

there is such a diff between reality amp what certain contestants think of their cooking #mkr
katie s a fatty model hahahaha #mkr #killerblondes
it is really funny all the assumptions they make about how it works and how much they are wrong
origin is a flaming piece of shit
no you don t i thought of a really funny joke and i promise i m not sexist but i have to say it


In [27]:
# first five sentences with embeddings
sequences[:5]

[[52, 8, 283, 6, 2357, 373, 622, 51, 36, 816, 533, 72, 9, 65, 177, 3],
 [228, 17, 6, 12235, 983, 1633, 3, 442],
 [13,
  8,
  84,
  297,
  39,
  1,
  3592,
  21,
  96,
  38,
  56,
  13,
  768,
  7,
  56,
  121,
  21,
  18,
  226],
 [3372, 8, 6, 8063, 681, 9, 134],
 [34,
  11,
  44,
  5,
  2,
  257,
  9,
  6,
  84,
  297,
  458,
  7,
  2,
  2196,
  2,
  27,
  19,
  45,
  22,
  2,
  26,
  4,
  116,
  13]]

In [28]:
# make sure the embedding matches the word
np.array_equal(embedding_matrix[8] ,embeddings_index.get('is'))

True

In [36]:
#build CNN model
model = Sequential()
embedding_dim = 5
model.add(layers.Embedding(20000, 200, weights=[embedding_matrix], input_length=40, trainable=True))
model.add(layers.Conv1D(128, 5, padding='valid',activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['acc'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 200)           4000000   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 36, 128)           128128    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 11        
Total params: 4,129,429
Trainable params: 4,129,429
Non-trainable params: 0
____________________________________________

In [30]:
model.fit(x_train_seq,y_train,batch_size=32,epochs=10,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f3cd4a23be0>

In [31]:
pred = model.predict_classes(x_test_seq)
print("f1_score = ", metrics.f1_score(y_test, pred, average="weighted"))
print("accuracy = ", metrics.accuracy_score(y_test, pred))

f1_score =  0.9802840691056793
accuracy =  0.9804203539823009


#### Error Analysis

In [32]:
pred = pred.reshape((-1,))
a = pd.DataFrame(X_test[pred!=y_test])
indices = [i for i in a.index]
misclassified = twitter_data.iloc[indices,:]
misclassified

Unnamed: 0,index,id,Text,Annotation,oh_label,tokenized_tweets,cleaned_tweets,num_tokens
11265,5.75596510114533E+017,5.75596510114533E+017,RT @mary__kaye: That face Kat just pulled was ...,sexism,1.0,"[rt, that, face, kat, just, pulled, was, hideo...",rt that face kat just pulled was hideous #mkr,9
16732,5.53019765397017E+017,5.53019765397017E+017,@azmoderate @JoeWSJ Be a man. Stop babbling an...,sexism,1.0,"[be, a, man, stop, babbling, and, squirming, a...",be a man stop babbling and squirming and admit...,12
6975,5.62393877504487E+017,5.62393877504487E+017,RT @Joyce_Karam: Back to Mongols? #ISIS destro...,none,0.0,"[rt, back, to, mongols, #isis, destroys, books...",rt back to mongols #isis destroys books in mos...,21
5948,5.75644734963606E+017,5.75644734963606E+017,"@australiacamper Yep, same. But it's also wron...",none,0.0,"[yep, same, but, it, s, also, wrong, of, #mkr,...",yep same but it s also wrong of #mkr to let th...,17
8977,5.76515212511179E+017,5.76515212511179E+017,@BrownBagPantry @LaurieJWillberg I haven't see...,none,0.0,"[i, haven, t, seen, anything, that, falls, und...",i haven t seen anything that falls under the l...,17
...,...,...,...,...,...,...,...,...
2408,5.64487955855573E+017,5.64487955855573E+017,RT @Amoka: Video supposedly showing Shia milit...,racism,1.0,"[rt, video, supposedly, showing, shia, militia...",rt video supposedly showing shia militia killi...,22
13301,5.75957369764454E+017,5.75957369764454E+017,RT @AusPolQuestTime: But wait ... new entries...,none,0.0,"[rt, but, wait, new, entries, from, canberra, ...",rt but wait new entries from canberra in mega ...,17
6312,5.73395089969435E+017,5.73395089969435E+017,@GaminGlennSeto @srhbutts Closer to 200. I wro...,none,0.0,"[closer, to, i, wrote, it, in, minutes, and, w...",closer to i wrote it in minutes and was drunk ...,13
3826,5.7560918512923E+017,5.7560918512923E+017,Don't know about you but I'm soooo over so cal...,none,0.0,"[don, t, know, about, you, but, i, m, soooo, o...",don t know about you but i m soooo over so cal...,16
