## Cyberbullying Auto Detection
*Wenqu Wang, Casey Yoon*

### Import Packages

In [69]:
# NumPy, TensorFlow, os
import numpy as np
import pandas as pd
import tensorflow as tf
import os
from tensorflow import keras
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout

from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer

In [2]:
os.listdir()

['twitter_sentiment.ipynb',
 'twitter_racism_parsed_dataset.csv',
 '.git',
 'twitter_parsed_dataset.csv',
 'Untitled.ipynb',
 '.ipynb_checkpoints',
 'twitter_sexism_parsed_dataset.csv']

In [3]:
### We aggregate all the data into one dataframe

parsed = pd.read_csv('twitter_parsed_dataset.csv')
racism = pd.read_csv('twitter_racism_parsed_dataset.csv')
sexism = pd.read_csv('twitter_sexism_parsed_dataset.csv')

twitter_data = pd.concat([parsed, racism, sexism]).dropna()
twitter_data.head()

Unnamed: 0,index,id,Text,Annotation,oh_label
0,5.74948705591165e+17,5.74948705591165e+17,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0
1,5.71917888690393e+17,5.71917888690393e+17,@ShreyaBafna3 Now you idiots claim that people...,none,0.0
2,3.90255841338601e+17,3.90255841338601e+17,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0
3,5.68208850655916e+17,5.68208850655916e+17,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0
4,5.75596338802373e+17,5.75596338802373e+17,#mkr No No No No No No,none,0.0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(twitter_data['Text'], twitter_data['oh_label'], test_size=0.20, random_state=42)

X_train.head()

9327     There is such a diff between reality &amp; wha...
14633    Katie's a fatty!! Model!!!! Hahahaha #MKR #kil...
4197     @Nibelsnarfabarf @srhbutts @GRIMACHU it is rea...
3534     @MaxOfS2D @StephenAtWar Origin is a flaming pi...
4500     No, you don't. @Shut_Up_Jeff: I thought of a r...
Name: Text, dtype: object

In [5]:
y_train.head()

9327     0.0
14633    1.0
4197     0.0
3534     0.0
4500     1.0
Name: oh_label, dtype: float64

In [61]:
vec = TfidfVectorizer(max_features=500)
X_vectrain = vec.fit_transform(X_train).toarray()
X_vectest = vec.transform(X_test).toarray()

In [62]:
### Baseline accuracy, predicting all of one class.
1 - np.mean(y_test)

0.7609513274336284

### Logistic Regression

In [63]:
### Logistic Regression
clf = LogisticRegression(random_state=0).fit(X_vectrain, y_train)
pred = clf.predict(X_vectest)
print("f1_score = ", metrics.f1_score(y_test, pred, average="weighted"))
print("accuracy = ", metrics.accuracy_score(y_test, pred))

f1_score =  0.8433159753737457
accuracy =  0.8535398230088496


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Neural Network

In [64]:
model = Sequential()
model.add(Dense(64,input_shape=(500,)))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(64))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 64)                32064     
_________________________________________________________________
dropout_13 (Dropout)         (None, 64)                0         
_________________________________________________________________
activation_12 (Activation)   (None, 64)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_14 (Dropout)         (None, 64)                0         
_________________________________________________________________
activation_13 (Activation)   (None, 64)                0         
_________________________________________________________________
dense_21 (Dense)             (None, 1)                

In [65]:
model.fit(X_vectrain,y_train,batch_size=32,epochs=10,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb7d92b0b50>

In [66]:
pred = model.predict_classes(X_vectest)

In [67]:
print("f1_score = ", metrics.f1_score(y_test, pred, average="weighted"))
print("accuracy = ", metrics.accuracy_score(y_test, pred))

f1_score =  0.9468722025892737
accuracy =  0.9484513274336284


### CNN

In [75]:
model = Sequential()
embedding_dim = 5
model.add(layers.Embedding(500, embedding_dim, input_length=500))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 5)            2500      
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 496, 128)          3328      
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_27 (Dense)             (None, 1)                 11        
Total params: 7,129
Trainable params: 7,129
Non-trainable params: 0
_________________________________________________________________


In [76]:
model.fit(X_vectrain,y_train,batch_size=32,epochs=10,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb7ca889df0>

In [77]:
pred = model.predict_classes(X_vectest)
print("f1_score = ", metrics.f1_score(y_test, pred, average="weighted"))
print("accuracy = ", metrics.accuracy_score(y_test, pred))

f1_score =  0.6576523878906878
accuracy =  0.7609513274336284
