In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [3]:
from google.colab import files
# Open Colab's upload dialog for kaggle.json. The return value maps each
# uploaded filename to its raw bytes; bind it to a name so it is NOT the cell's
# last expression, otherwise the notebook echoes the API credentials into the
# saved cell output.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"aryan225","key":"<REDACTED>"}'}

In [4]:
# Confirm that the uploaded kaggle.json is present in the working directory.
!ls -lrt kaggle.json

-rw-r--r-- 1 root root 64 Sep  7 17:42 kaggle.json


In [0]:
# Copy the credentials file into ~/.kaggle/ (created if missing) and restrict it
# to owner read/write only (mode 600).
# NOTE(review): the kaggle CLI used in later cells is presumed to read its
# credentials from this location - confirm against the Kaggle API docs.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [7]:
# Install the Kaggle command-line client, then download the archives for the
# jigsaw-toxic-comment-classification-challenge competition into the working
# directory.
!pip install kaggle
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading sample_submission.csv.zip to /content
  0% 0.00/1.39M [00:00<?, ?B/s]
100% 1.39M/1.39M [00:00<00:00, 97.6MB/s]
Downloading test.csv.zip to /content
 38% 9.00M/23.4M [00:00<00:00, 22.5MB/s]
100% 23.4M/23.4M [00:00<00:00, 47.6MB/s]
Downloading train.csv.zip to /content
 65% 17.0M/26.3M [00:00<00:00, 22.6MB/s]
100% 26.3M/26.3M [00:00<00:00, 59.4MB/s]
Downloading test_labels.csv.zip to /content
  0% 0.00/1.46M [00:00<?, ?B/s]
100% 1.46M/1.46M [00:00<00:00, 207MB/s]


In [8]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2019-09-07 17:50:06--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-09-07 17:50:06--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2019-09-07 17:50:06--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-0

In [9]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [10]:
# List the working directory to check which archives and extracted files exist.
!ls

glove.6B.100d.txt  glove.6B.zip		      test.csv.zip
glove.6B.200d.txt  kaggle.json		      test_labels.csv.zip
glove.6B.300d.txt  sample_data		      train.csv.zip
glove.6B.50d.txt   sample_submission.csv.zip


In [11]:
!unzip sample_submission.csv.zip
!unzip test.csv.zip
!unzip test_labels.csv.zip
!unzip train.csv.zip
!ls

Archive:  sample_submission.csv.zip
  inflating: sample_submission.csv   
Archive:  test.csv.zip
  inflating: test.csv                
Archive:  test_labels.csv.zip
  inflating: test_labels.csv         
Archive:  train.csv.zip
  inflating: train.csv               
glove.6B.100d.txt  kaggle.json		      test.csv.zip
glove.6B.200d.txt  sample_data		      test_labels.csv
glove.6B.300d.txt  sample_submission.csv      test_labels.csv.zip
glove.6B.50d.txt   sample_submission.csv.zip  train.csv
glove.6B.zip	   test.csv		      train.csv.zip


In [0]:
# Hyperparameters shared by the tokenizer, the embedding matrix and the model.

# Length of each word vector (matches the 50-dimensional GloVe file).
embed_size = 50

# Vocabulary cap: number of unique words handed to the tokenizer.
max_features = 20000

# Number of words of each comment that are fed to the model.
maxlen = 100

In [0]:
# Read the competition's train and test tables from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Target matrix: one column per label listed in `classes`.
classes = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
y = train[classes].values

# Raw comment text as arrays; missing comments are replaced by the "_na_"
# placeholder.
comment_list_train = train["comment_text"].fillna("_na_").values
comment_list_test = test["comment_text"].fillna("_na_").values

In [0]:
# Build the vocabulary from the training comments only; num_words caps it at
# max_features.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(comment_list_train))

# Training comments -> integer sequences -> arrays with maxlen columns.
tokenized_train = tokenizer.texts_to_sequences(comment_list_train)
x_train = pad_sequences(tokenized_train, maxlen=maxlen)

# Test comments go through the same fitted tokenizer and the same padding.
tokenized_test = tokenizer.texts_to_sequences(comment_list_test)
x_test = pad_sequences(tokenized_test, maxlen=maxlen)

In [0]:
def get_coefs(word, *arr):
  """Turn one split GloVe line into a (word, vector) pair.

  word: the token (first field of the line).
  *arr: the remaining fields, i.e. the vector components as strings.
  Returns a tuple of the word and a float32 numpy array of its components.
  """
  return word, np.asarray(arr, dtype='float32')

# Map every GloVe token to its vector. The file is opened in a `with` block so
# the handle is closed once parsing finishes, and with an explicit UTF-8
# encoding because the vocabulary contains non-ASCII tokens and the platform
# default encoding is not guaranteed to be UTF-8. Blank lines are skipped so
# that get_coefs is never called without a word.
with open("glove.6B.50d.txt", encoding="utf8") as glove_file:
  embeddings_index = dict(
    get_coefs(*line.strip().split()) for line in glove_file if line.strip()
  )

In [16]:
# Stack every GloVe vector into one 2-D array. np.stack requires a real
# sequence (list/tuple); a dict_values view was only accepted with a
# deprecation warning on older NumPy and is rejected by current releases, so
# materialise it as a list first.
all_embeddings = np.stack(list(embeddings_index.values()))
# Mean and standard deviation over all vector components; used to draw random
# vectors for words that have no GloVe entry.
embedding_mean, embedding_std = all_embeddings.mean(), all_embeddings.std()
embedding_mean, embedding_std

  """Entry point for launching an IPython kernel.


(0.020940498, 0.6441043)

In [0]:
# Build the initial weights for the Embedding layer: row i holds the vector of
# the word whose tokenizer index is i.
word_index = tokenizer.word_index
# Keras tokenizer indices start at 1 (0 is reserved and never assigned to a
# word), so a vocabulary of N words needs N + 1 rows. Without the +1 a
# vocabulary smaller than max_features makes the loop below index one row past
# the end of the matrix.
nb_words = min(max_features, len(word_index) + 1)
# Start from random vectors drawn with the same mean/std as the GloVe
# vectors; words without a GloVe entry keep this random initialisation.
embedding_matrix = np.random.normal(embedding_mean, embedding_std, (nb_words, embed_size))
for word, i in word_index.items():
  # Guard on the actual row count, not on max_features, so the index is always
  # valid for the matrix that was allocated.
  if i >= nb_words:
    continue
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

In [18]:
# Bidirectional-LSTM classifier on top of the pretrained embedding matrix.
inp = Input(shape=(maxlen,))
# Take the Embedding dimensions from the weight matrix itself so that
# input_dim/output_dim always agree with the supplied weights, even when the
# matrix has fewer than max_features rows (small vocabulary).
x = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], weights=[embedding_matrix])(inp)
# return_sequences=True keeps one output per timestep for the pooling layer.
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
# Collapse the time axis by taking the maximum of each feature.
x = GlobalMaxPool1D()(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.1)(x)
# Six sigmoid units, one per label in `classes`; trained with binary
# cross-entropy so each label is scored independently.
x = Dense(6, activation='sigmoid')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [19]:
# Train for two epochs in mini-batches of 32, holding out 10% of the training
# rows for validation.
model.fit(
  x_train,
  y,
  batch_size=32,
  epochs=2,
  validation_split=0.1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f2b1aa92ef0>

In [21]:
# Score the padded test comments; the result has one column per label.
y_test = model.predict(
  [x_test],
  batch_size=1024,
  verbose=1,
)

# Fill the label columns of the sample submission with the predictions and
# write the result out without the index column.
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission[classes] = y_test
sample_submission.to_csv('submission.csv', index=False)



In [0]:
# Save the trained model to model.h5 in the working directory.
model.save("model.h5")

In [0]:
import pickle

# Persist the fitted tokenizer next to the model so that new text can later be
# encoded with the same word indices.
with open('tokenizer.pickle','wb') as tokenizer_file:
  pickle.dump(
    tokenizer,
    tokenizer_file,
    protocol=pickle.HIGHEST_PROTOCOL,
  )

In [24]:
# Upload submission.csv to the competition with the message "First Submission".
!kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "First Submission"

100% 21.3M/21.3M [00:04<00:00, 4.63MB/s]
Successfully submitted to Toxic Comment Classification Challenge