In [77]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

In [78]:
%%time
# The frame is narrowed to these three columns right after loading, so let the
# CSV parser skip every other column instead of reading and discarding them.
KEEP_COLUMNS = ['id', 'target', 'comment_text']
toxic_comments_df = pd.read_csv('train.csv', nrows=50000, usecols=KEEP_COLUMNS)
# usecols keeps the file's own column order; re-select to pin the order to
# id, target, comment_text regardless of how the CSV is laid out.
toxic_comments_df = toxic_comments_df[KEEP_COLUMNS]


CPU times: user 451 ms, sys: 75.5 ms, total: 527 ms
Wall time: 537 ms


In [79]:
# Ground-truth label: 1.0 when the target score is at least 0.5 (toxic),
# 0.0 otherwise. Stored as float32.
toxic_comments_df['truth'] = toxic_comments_df['target'].ge(0.5).astype('float32')
toxic_comments_df.head()


Unnamed: 0,id,target,comment_text,truth
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0
3,59855,0.0,Is this something I'll be able to install on m...,0.0
4,59856,0.893617,haha you guys are a bunch of losers.,1.0


In [80]:
def _lowercase_alpha_words(comment):
    """Return `comment` with letter-initial words lowercased.

    The text is split on whitespace. A word whose first character is a
    letter is lowercased; any other word is replaced by an empty string
    (so its position still contributes a separator). The pieces are then
    re-joined with single spaces.
    """
    pieces = []
    for token in str(comment).split():
        if token[0].isalpha():
            pieces.append(token.lower())
        else:
            pieces.append("")
    return " ".join(pieces)


# Model input text: a normalised copy of the raw comment.
toxic_comments_df['InputData'] = toxic_comments_df['comment_text'].apply(_lowercase_alpha_words)

toxic_comments_df.head()

Unnamed: 0,id,target,comment_text,truth,InputData
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,"this is so cool. it's like, you want your mot..."
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,thank you!! this would make my life a lot less...
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,this is such an urgent design problem; kudos t...
3,59855,0.0,Is this something I'll be able to install on m...,0.0,is this something i'll be able to install on m...
4,59856,0.893617,haha you guys are a bunch of losers.,1.0,haha you guys are a bunch of losers.


In [81]:
# Features are the normalised comment text; labels are the 0/1 truth flags.
X = toxic_comments_df.InputData
Y = toxic_comments_df.truth
# Encode the distinct truth values as integer class ids.
le = LabelEncoder()
Y = le.fit_transform(Y)
# Hold out 30% of the rows for testing. A fixed random_state makes the shuffle,
# and therefore every number computed from the split, repeatable across
# kernel restarts.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=42)


In [82]:
%%time
# max_words caps the tokenizer vocabulary and is also the Embedding input size;
# max_len is the fixed length used when padding the sequences and the width
# of the model's input layer.
max_words = 2000
max_len = 2000

# Build the word index from the training comments only.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

# Convert each training comment to a sequence of word ids, then bring every
# sequence to exactly max_len entries.
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

CPU times: user 3.03 s, sys: 105 ms, total: 3.14 s
Wall time: 3.2 s


In [83]:
def RNN():
    """Build the (uncompiled) comment classifier.

    Architecture: token ids -> 50-dimensional Embedding -> LSTM(64)
    -> Dense(256) + ReLU -> Dropout(0.5) -> Dense(1) + sigmoid.

    Reads the notebook-level ``max_len`` (input length) and ``max_words``
    (embedding vocabulary size).

    Returns:
        A Keras ``Model`` mapping a ``(batch, max_len)`` input to a
        ``(batch, 1)`` sigmoid output.
    """
    token_ids = Input(name='inputs', shape=[max_len])
    embedded = Embedding(max_words, 50, input_length=max_len)(token_ids)
    encoded = LSTM(64)(embedded)

    # Fully connected head with dropout for regularisation.
    hidden = Dense(256, name='FC1')(encoded)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)

    # Single-unit sigmoid output.
    score = Dense(1, name='out_layer')(hidden)
    probability = Activation('sigmoid')(score)

    return Model(inputs=token_ids, outputs=probability)

In [84]:
# Instantiate the network, print its layer summary, then attach the training
# configuration: binary cross-entropy loss, RMSprop with default settings,
# and accuracy as the reported metric.
model = RNN()
model.summary()
model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(),
    metrics=['accuracy'],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 2000)              0         
_________________________________________________________________
embedding_24 (Embedding)     (None, 2000, 50)          100000    
_________________________________________________________________
lstm_17 (LSTM)               (None, 64)                29440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_27 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_14 (Dropout)         (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257       
__________

In [85]:
# Stop training as soon as val_loss fails to improve by at least 1e-4
# (EarlyStopping's default patience is 0). restore_best_weights rolls the model
# back to the epoch with the lowest val_loss; without it the model is left with
# the weights of the last, non-improving epoch.
early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001,
                           restore_best_weights=True)
# The last 40% of the training matrix is held out for validation.
model.fit(sequences_matrix,Y_train,batch_size=256,epochs=10,
          validation_split=0.4,callbacks=[early_stop])

Train on 21000 samples, validate on 14000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.callbacks.History at 0x1a3004ad68>

In [86]:
# Write the fitted model to disk.
# NOTE(review): ".h" is an unusual suffix for a saved Keras model ("*.h5" is
# the common HDF5 one). Confirm it is intended before renaming, since code
# elsewhere may load this exact path.
model_path = "kerasRNN.h"
model.save(model_path)