In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv
import pandas as pd
import numpy as np

In [2]:
with open('jigsaw-toxic-comment-classification-challenge/train.csv','r') as file:
    csv_reader = csv.reader(file,delimiter=',')
    next(csv_reader)
    comments = []
    for row in csv_reader:
        comments.append(row[1])
        
train = pd.read_csv('jigsaw-toxic-comment-classification-challenge/train.csv')
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
labels = train[list_classes].values
        
## Reading the test data and seperating the comments and the labels

with open('jigsaw-toxic-comment-classification-challenge/test.csv','r') as file:
    csv_reader = csv.reader(file,delimiter=',')
    next(csv_reader)
    test_comments = []
    test_labels = []
    for row in csv_reader:
        test_comments.append(row[1])
test = pd.read_csv('jigsaw-toxic-comment-classification-challenge/test.csv')
test.head()


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [4]:
vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

In [5]:
tokenizer = Tokenizer(num_words=vocab_size,oov_token=oov_tok)
tokenizer.fit_on_texts(comments)


In [6]:
word_index = tokenizer.word_index

In [7]:
training_sequences = tokenizer.texts_to_sequences(comments)
training_padded = pad_sequences(training_sequences,maxlen=max_length,padding=padding_type,truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(test_comments)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length,padding=padding_type,truncating=trunc_type)

# Building a simple model

In [8]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(24,activation='relu'),
    tf.keras.layers.Dense(6,activation='softmax')
    
])

model.summary()
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                20736     
_________________________________________________________________
dense (Dense)                (None, 24)                1560      
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 150       
Total params: 182,446
Trainable params: 182,446
Non-trainable params: 0
_________________________________________________________________


In [None]:
num_epochs = 10
history = model.fit(x=training_padded,y=labels,epochs=num_epochs,verbose=2)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
 - 325s - loss: 0.3018 - acc: 0.9934
Epoch 2/10


In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['acc'])
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(['Accuracy'])
plt.show()