In [1]:
import numpy as np 
import pandas as pd
import nltk
nltk.download('stopwords')
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from keras.preprocessing.sequence import pad_sequences

import os
# Print the full path of every file found under the Kaggle input directory
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Shared hyper-parameters:
#   MAX_FEATURES   - vocabulary size limit handed to the Keras tokenizer
#   MAX_WORD       - length every tokenized comment is padded to
#   EMBEDDING_SIZE - number of columns in the embedding matrix
MAX_FEATURES, MAX_WORD, EMBEDDING_SIZE = 20000, 100, 300



[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
/kaggle/input/googles-trained-word2vec-model-in-python/GoogleNews-vectors-negative300.bin
/kaggle/input/googles-trained-word2vec-model-in-python/GoogleNews-vectors-negative300.bin.gz
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip


Using TensorFlow backend.


In [2]:
# Load the competition's training set, test set and submission template
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip',
        '/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip',
        '/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip',
    )
)

In [3]:
# Characters that separate words: replaced by a space.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything outside this whitelist is deleted.
_GOOD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Any run of whitespace (newlines, tabs, carriage returns, ...) becomes one space.
_WHITESPACE_RE = re.compile(r'\s+')

# Spellings not found in the GoogleNews embeddings (determined in data exploration
# and preprocessing). Contractions are matched as whole words so they cannot
# rewrite the inside of an unrelated word; the British spellings are matched
# anywhere so derived forms ("behaviours", "colourful") are normalized too.
_SPELLING_FIXES = (
    (re.compile(r'\bdoesnt\b'), 'does not'),
    (re.compile(r'\bdont\b'), 'do not'),
    (re.compile(r'\bisnt\b'), 'is not'),
    (re.compile(r'\bwasnt\b'), 'was not'),
    (re.compile(r'\bdidnt\b'), 'did not'),
    (re.compile(r'behaviour'), 'behavior'),
    (re.compile(r'colour'), 'color'),
)

_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return the NLTK English stopword set, loading it only on first use."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE


def text_prepare(text):
    """Tokenization and Preprocessing.

    Lowercases the text, replaces separators and whitespace with spaces, drops
    every character outside ``[0-9a-z #+_]``, normalizes a few spellings,
    tokenizes with ``nltk_tokenize`` and removes English stopwords.

    Args:
        text: Raw comment. ``None`` / NaN (a missing cell) is treated as empty;
            any other non-string value is converted with ``str``.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    # Missing values would otherwise crash on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Make everything lowercase
    text = str(text).lower()

    # Turn all whitespace into spaces *before* stripping symbols; otherwise tabs
    # and carriage returns are deleted and the neighbouring words get glued together.
    text = _WHITESPACE_RE.sub(' ', text)
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _GOOD_SYMBOLS_RE.sub('', text)

    # Spelling fixes run after symbol stripping so that "don't" (which has just
    # become "dont") is normalized the same way as a literal "dont".
    for pattern, replacement in _SPELLING_FIXES:
        text = pattern.sub(replacement, text)

    # Tokenize and drop stopwords
    stopwords_set = _english_stopwords()
    kept_tokens = [token for token in nltk_tokenize(text) if token and token not in stopwords_set]

    return ' '.join(kept_tokens).strip()


def nltk_tokenize(text):
    """Split a sentence into tokens by delegating to NLTK's ``word_tokenize``."""
    return word_tokenize(text)


In [12]:
# Split the frames by position: column 1 holds the comment text, columns 2+ the labels.
train_x, train_y = train_df.iloc[:, 1:2], train_df.iloc[:, 2:]
test_x = test_df.iloc[:, 1:2]

# Run every training and test comment through the text cleaner
train_cleaned_x = train_x['comment_text'].map(text_prepare)
test_cleaned_x = test_x['comment_text'].map(text_prepare)

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit a Keras tokenizer on the cleaned training comments; num_words limits the
# vocabulary used when texts are converted to index sequences
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(list(train_cleaned_x))

# Convert comments to index sequences, then pad them to a common length of
# MAX_WORD so they can be fed into the embedding layer
train_tokenized_x = tokenizer.texts_to_sequences(train_cleaned_x)
padded_train_x = pad_sequences(train_tokenized_x, maxlen=MAX_WORD)

test_tokenized_x = tokenizer.texts_to_sequences(test_cleaned_x)
padded_test_x = pad_sequences(test_tokenized_x, maxlen=MAX_WORD)

In [14]:
import gensim

# Use gensim to load GoogleNews pre-trained word embeddings.
# The path is absolute, like the other dataset paths in this notebook, so loading
# does not depend on the kernel's current working directory.
INPUT_FILE = "/kaggle/input/googles-trained-word2vec-model-in-python/GoogleNews-vectors-negative300.bin"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(INPUT_FILE, binary=True)

In [15]:
# Convert gensim object into a python dictionary mapping word -> embedding vector
dict_size = len(tokenizer.word_index)

# Read the vocabulary from the KeyedVectors object itself instead of through the
# deprecated `.wv` attribute (which emitted a DeprecationWarning). Newer gensim
# releases expose the vocabulary as `key_to_index`; older ones as `vocab`.
word2vec_vocab = getattr(word2vec, "key_to_index", None)
if word2vec_vocab is None:
    word2vec_vocab = word2vec.vocab

embedding_index = {word: word2vec.word_vec(word) for word in word2vec_vocab}


  after removing the cwd from sys.path.


In [16]:
# Stack all pre-trained vectors to measure the overall mean and standard deviation
# of their weights, then draw a default (random) embedding matrix from a normal
# distribution with those statistics: one row per tokenizer word, EMBEDDING_SIZE columns
embeddings = np.stack(list(embedding_index.values()))
embeddings_mean, embeddings_std_dev = embeddings.mean(), embeddings.std()
matrix_shape = (len(tokenizer.word_index), EMBEDDING_SIZE)
embedding_matrix = np.random.normal(loc=embeddings_mean, scale=embeddings_std_dev, size=matrix_shape)

In [17]:
# For words from the source text (fed into the tokenizer), change the weights in the
# matrix to match their pre-trained embedding values.
# The Keras tokenizer numbers words starting at 1 (0 is the padding value) and the
# Embedding layer looks up row `index` directly, so each vector has to be stored at
# row `index`. Storing it at `index - 1` gave every token its neighbour's vector.
num_rows = embedding_matrix.shape[0]
for word, index in tokenizer.word_index.items():
    # Rows run from 0 to num_rows - 1; an index beyond that has no row to fill.
    if index >= num_rows:
        continue
    vector = embedding_index.get(word)
    if vector is not None:
        embedding_matrix[index] = vector

In [18]:
# Import model and layers used
from keras.models import Model
from keras.layers import  Embedding, Input, concatenate, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, Bidirectional, GRU, Dense

# Define the network with the Keras functional API:
# frozen pre-trained embeddings -> spatial dropout -> bidirectional GRU -> 1D convolution
# -> concatenated global max/average pooling -> 6 sigmoid outputs
emb = Input(shape=(None,))
embedded = Embedding(
    dict_size,
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=False,
)(emb)
embedded = SpatialDropout1D(0.2)(embedded)

recurrent = Bidirectional(
    GRU(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
)(embedded)
convolved = Conv1D(64, kernel_size=4)(recurrent)

max_pooled = GlobalMaxPooling1D()(convolved)
avg_pooled = GlobalAveragePooling1D()(convolved)
pooled = concatenate([max_pooled, avg_pooled])

x = Dense(6, activation="sigmoid")(pooled)
model = Model(emb, x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [19]:
# Train the model on the padded training sequences and their label columns
classifier = model.fit(
    x=padded_train_x,
    y=train_y.values,
    batch_size=1024,
    epochs=5,
    validation_split=0.1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Score the padded test data, copy the predictions into the submission template's
# label columns and save the result as a csv
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
prediction = model.predict(padded_test_x, batch_size=1024)
sample_submission[label_columns] = prediction
sample_submission.to_csv('submission.csv', index=False)


hello
