In [1]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.models import Model
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
df = pd.read_csv('cleaned_data.csv')

In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,cleaned_comment_text
0,2f52adcf5a111cd3,"That is about it for for now. Primarily, I wor...",0,0,0,0,0,0,1,primarily work citations either add update rel...
1,819b3339c747286f,"""\n I wasn't aware that peer-reviewed studies ...",0,0,0,0,0,0,1,aware peerreviewed study minimal methodologica...
2,b66e5fffbd70f8fe,"""\nIt's fine to edit for personal gain so long...",0,0,0,0,0,0,1,fine edit personal gain long edit accord basic...
3,fd7f2ec6efe0315d,I did not add these words to the PLANS website...,0,0,0,0,0,0,1,add word plan website contact plan plan skepti...
4,f78b624060552c1a,"""\n\n List of recent changes \n\nRequested by ...",0,0,0,0,0,0,1,list recent change request sarge baldy even th...


In [4]:
df.shape

(50000, 10)

In [5]:
#df_sample = df.head(100)

In [6]:
df_labels = df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]
df_labels.head()


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


In [7]:
#Basic cleaning - Milo can add her code here
def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [8]:
#Create X and y dataframe
X = []
sentences = list(df["cleaned_comment_text"])
for sen in sentences:
    X.append(preprocess_text(str(sen)))

y = df_labels.values

In [9]:
X

['primarily work citations either add update relate format reconcile previous contributions npov bpov format feel free smoke',
 'aware peerreviewed study minimal methodological flaw know page link cite constitute bias source restrain self become extremely sarcastic',
 'fine edit personal gain long edit accord basic policies contribute wikipedia hobby people endless number motivations get involve talk',
 'add word plan website contact plan plan skeptic site language quote strong offensive people plan cite evidence balance point view little content present wikipedia article',
 'list recent change request sarge baldy even though already discuss extensively need minutia cromwell intro article political mean anarchism belief form rulership undesireable abolish need gobbletygook degrees commonality conflict please hide fact proudhon anticommunist proudhon detail go proudhon article anarchist school precede sundry issueoriented sects bullshit misc nonanarchist leftie movements belong anarchis

In [10]:
#Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2019)

In [11]:
#Tokenize the sentences
tokenizer = Tokenizer(num_words=5000)#Maximum number of words to keep based on word frequency
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 200

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [12]:
print(X_train)

[[   6  435   80 ...    0    0    0]
 [4774   14  132 ...    0    0    0]
 [  10   60  281 ...    0    0    0]
 ...
 [   6   47 1382 ...    0    0    0]
 [1979 1112   49 ...    0    0    0]
 [ 169   81 1042 ...    0    0    0]]


In [13]:
#Glove Embedding import
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_file = open('glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [14]:
#Define the neural network model
deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
dense_layer_1 = Dense(6, activation='sigmoid')(LSTM_Layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [15]:
print(model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 100)          7802000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 774       
Total params: 7,920,022
Trainable params: 118,022
Non-trainable params: 7,802,000
_________________________________________________________________
None


In [16]:
history = model.fit(X_train, y_train, batch_size=128, epochs=5, verbose=1, validation_split=0.2)

Train on 32000 samples, validate on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
score = model.evaluate(X_test, y_test, verbose=1)

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.1452591383099556
Test Accuracy: 0.9614334106445312


In [18]:
pred_output = model.predict(X_test)
pred_output

array([[0.10219342, 0.01057166, 0.05749056, 0.00316101, 0.05670002,
        0.01142034],
       [0.10219103, 0.0105713 , 0.05748889, 0.00316092, 0.05670041,
        0.01142037],
       [0.10219002, 0.01057112, 0.05748799, 0.00316092, 0.05670062,
        0.01142037],
       ...,
       [0.10219142, 0.01057139, 0.05748901, 0.00316095, 0.05670038,
        0.01142031],
       [0.10219449, 0.0105719 , 0.0574913 , 0.0031611 , 0.05669993,
        0.01142025],
       [0.10218778, 0.01057076, 0.05748641, 0.00316074, 0.05670118,
        0.01142043]], dtype=float32)

In [22]:
y_pred = []

for pred in pred_output:
    rounded = [round(x) for x in pred]
    y_pred.append(rounded)
    
# converting the dataypes

y_pred
y_true = np.array(y_test)
y_pred = np.array(y_pred).astype(np.int64)
print(y_true[0].dtype)
print(y_pred[0].dtype)

int64
int64


In [19]:
#Check Overfitting 
#Perform hyperparameter tuning