In [1]:
# This code is a mix of the Benchmark Kernel provided by Jigsaw, and my own adjustments to the code, 
# including but not limited to adjusting sizes of layers, number of layers, and hyperparameters.
# I also completely changed the training set to only contain boolean values rather than multiple labels,
# to focus on getting this network to identify toxicity alone before anything else.
# This also required adjustments in our accuracy analytics.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import os
import pandas as pd
import numpy as np
import pkg_resources
import seaborn as sns
import time
import scipy.stats as stats

from sklearn import metrics
from sklearn import model_selection

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Dense
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.models import Model
from keras.models import load_model

Using TensorFlow backend.


## Data processing

In [4]:
#PROVIDED BY JIGSAW
# NOTE(review): this path starts at 'input/' while the other reads in this
# notebook use '../input/' — confirm which working directory is intended.
train = pd.read_csv(
    'input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
print('loaded %d records' % len(train))

# Coerce every comment to str so the tokenizer only ever sees string values.
train['comment_text'] = train['comment_text'].astype(str)

# Identity columns that are thresholded to booleans alongside 'target'.
identity_columns = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'black',
    'white',
    'psychiatric_or_mental_illness',
]

# Convert target and identity columns to booleans
def convert_to_bool(df, col_name):
    """Overwrite df[col_name] in place with booleans: True where the value is >= 0.5.

    Values that fail the comparison (including NaN) become False.
    Returns None; the frame passed in is mutated.
    """
    meets_threshold = df[col_name] >= 0.5
    # Assign the raw boolean array, as the column previously received an ndarray.
    df[col_name] = meets_threshold.values
    
def convert_dataframe_to_bool(df):
    """Return a copy of df with 'target' and every identity column thresholded to bool.

    The conversion is applied to a copy, so the frame passed in is not modified.
    """
    converted = df.copy()
    columns_to_convert = ['target'] + identity_columns
    for column in columns_to_convert:
        convert_to_bool(converted, column)
    return converted

#CREATED BY ME TO SIMPLIFY TRAINING
train = convert_dataframe_to_bool(train)

# Build the simplified training frame by dropping every column listed below;
# whatever columns remain in `train` are carried over unchanged.
train_s = train.drop(columns=[
    'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat',
    'toxicity_annotator_count',
    'asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian',
    'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian',
    'intellectual_or_learning_disability', 'jewish', 'latino', 'male',
    'muslim', 'other_disability', 'other_gender',
    'other_race_or_ethnicity', 'other_religion',
    'other_sexual_orientation', 'physical_disability',
    'psychiatric_or_mental_illness', 'transgender', 'white',
    'created_date', 'publication_id', 'parent_id', 'article_id',
    'rating', 'funny', 'wow', 'sad', 'likes', 'disagree',
    'sexual_explicit', 'identity_annotator_count',
])

loaded 1804874 records


## I made the validation set 5% and everything else training

In [5]:
# Hold out 5% of the rows for validation. random_state is fixed so that a
# Restart & Run All reproduces the same split (and therefore comparable
# validation metrics) instead of reshuffling on every run.
train_df, validate_df = model_selection.train_test_split(
    train_s, test_size=0.05, random_state=42)
print('%d train comments, %d validate comments' % (len(train_df), len(validate_df)))


1714630 train comments, 90244 validate comments


## Create a text tokenizer (unedited, provided by Jigsaw)

In [6]:
# Names of the text and label columns read from the data frames.
TEXT_COLUMN = 'comment_text'
TOXICITY_COLUMN = 'target'

# Vocabulary cap handed to the tokenizer as num_words.
MAX_NUM_WORDS = 10000

# Build the tokenizer and fit it on the training split's comments only.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df[TEXT_COLUMN])

# Every comment is padded or truncated to this many tokens so that all
# model inputs share one length.
MAX_SEQUENCE_LENGTH = 250
def pad_text(texts, tokenizer):
    """Tokenize `texts` with `tokenizer` and return sequences of length MAX_SEQUENCE_LENGTH."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

## Code for the CNN

In [None]:
# Pre-trained GloVe vectors used to build the word-embedding matrix.
# The dimension must match the vector length stored in the file.
EMBEDDINGS_DIMENSION = 100
EMBEDDINGS_PATH = '../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt'

# Training hyperparameters.
BATCH_SIZE = 128
NUM_EPOCHS = 10
LEARNING_RATE = 5e-05
DROPOUT_RATE = 0.35

# The only part below that I did not adjust is the word-embedding loading.

def train_model(train_df, validate_df, tokenizer):
    """Train the toxicity CNN on frozen GloVe embeddings and return the fitted model.

    Args:
        train_df: DataFrame providing TEXT_COLUMN and TOXICITY_COLUMN for training.
        validate_df: DataFrame with the same two columns; passed to Keras as
            validation data during fitting.
        tokenizer: fitted tokenizer; used by pad_text to build the input
            sequences and queried for its `word_index`.

    Returns:
        The fitted keras Model. Its output layer has two softmax units, one per
        one-hot label column produced by to_categorical.
    """
    # Prepare data.
    # num_classes is pinned so the one-hot labels are always two columns wide,
    # even if a split happens to contain only one class.
    train_text = pad_text(train_df[TEXT_COLUMN], tokenizer)
    train_labels = to_categorical(train_df[TOXICITY_COLUMN], num_classes=2)
    validate_text = pad_text(validate_df[TEXT_COLUMN], tokenizer)
    validate_labels = to_categorical(validate_df[TOXICITY_COLUMN], num_classes=2)

    # Load embeddings: each line is a word followed by its vector components.
    print('loading embeddings')
    embeddings_index = {}
    # Explicit encoding so reading does not depend on the platform default.
    with open(EMBEDDINGS_PATH, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    # One row per tokenizer index (+1 because row 0 is not assigned to a word here).
    # NOTE(review): the matrix spans the tokenizer's full word_index; if the
    # tokenizer was built with num_words, rows past that cap are presumably
    # never looked up — confirm before shrinking it.
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1,
                                 EMBEDDINGS_DIMENSION))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    # Create model layers.
    def get_convolutional_neural_net_layers():
        """Returns (input_layer, output_layer)"""
        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                    EMBEDDINGS_DIMENSION,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=False)
        x = embedding_layer(sequence_input)
        x = Conv1D(128, 2, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)
        x = Conv1D(128, 3, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)
        x = Conv1D(128, 4, activation='relu', padding='same')(x)
        x = MaxPooling1D(42, padding='same')(x)
        x = Flatten()(x)
        x = Dropout(DROPOUT_RATE)(x)
        x = Dense(128, activation='relu')(x)
        # softmax (not sigmoid): categorical_crossentropy on one-hot labels
        # expects the two outputs to form a probability distribution.
        preds = Dense(2, activation='softmax')(x)
        return sequence_input, preds

    # Compile model.
    print('compiling model')
    input_layer, output_layer = get_convolutional_neural_net_layers()
    model = Model(input_layer, output_layer)
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=LEARNING_RATE),
                  metrics=['acc'])

    # Train model.
    print('training model')
    model.fit(train_text,
              train_labels,
              batch_size=BATCH_SIZE,
              epochs=NUM_EPOCHS,
              validation_data=(validate_text, validate_labels),
              verbose=2)

    return model

# Fit the CNN on the training split, with the held-out split as Keras
# validation data; the returned model is reused by the prediction cells below.
model = train_model(train_df, validate_df, tokenizer)

## Generate model predictions on the validation set

In [None]:
MODEL_NAME = 'my_model'
# Score the validation comments; column 1 of the two-unit output is the score
# for label value 1 (True) as encoded by to_categorical.
validate_scores = model.predict(pad_text(validate_df[TEXT_COLUMN], tokenizer))[:, 1]
# validate_df is a slice produced by train_test_split, so writing a column into
# it in place can raise SettingWithCopyWarning. assign() returns a new frame
# with the score column added instead.
validate_df = validate_df.assign(**{MODEL_NAME: validate_scores})

In [None]:
# ROC AUC of the model's scores against the boolean toxicity labels.
true_labels, predicted_labels = validate_df[TOXICITY_COLUMN], validate_df[MODEL_NAME]
validation_auc = metrics.roc_auc_score(true_labels, predicted_labels)
print(validation_auc)

## Prediction on Kaggle Test data for submission to contest

In [None]:
# Competition test comments.
test = pd.read_csv(
    '../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')
# Submission template, indexed by comment id.
submission = pd.read_csv(
    '../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv',
    index_col='id')

In [None]:
# Cast the test comments to str first, mirroring the training data
# preparation, so a missing or non-string comment cannot break tokenization.
test_text = pad_text(test[TEXT_COLUMN].astype(str), tokenizer)
# NOTE(review): the scores are assigned positionally, which assumes test.csv
# and sample_submission.csv list the ids in the same order — confirm.
submission['prediction'] = model.predict(test_text)[:, 1]
submission.to_csv('submission.csv')