Fit a bidirectional LSTM to the data. This code was modified from https://www.kaggle.com/thousandvoices/simple-lstm. Salient features of the LSTM are:

1. Words are preprocessed to remove strange characters
2. Two word embeddings were used - GloVe and fastText (each has dimension=300)
3. The LSTM was a stacked bidirectional LSTM
4. Outputs from the LSTM stack are combined using GlobalMaxPooling1D and GlobalAveragePooling1D - these take LSTM outputs of shape (batch_size, steps, hidden_size) and reduce them to shape (batch_size, hidden_size).
5. Finally three dense layers enable classification
6. Weighted cross entropy is used as the loss function
7. Learning rate was decreased every epoch


In [1]:
import numpy as np 
import pandas as pd 
from keras.preprocessing import text, sequence
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.callbacks import LearningRateScheduler
from keras import backend as K
from keras.models import load_model
import pickle
import os

Using TensorFlow backend.


['floatallallcols', 'glove840b300dtxt', 'fasttext-crawl-300d-2m']


In [3]:
# Pre-trained word-vector files; one embedding matrix is built per file and
# the matrices are concatenated feature-wise.
EMBEDDING_FILES = [
    '../input/glove840b300dtxt/glove.840B.300d.txt',
    '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec',
]

# Training hyper-parameters.
BATCH_SIZE = 512
LSTM_UNITS = 128
# Max-pool + average-pool over a bidirectional LSTM yields 2 * 2 * LSTM_UNITS
# features, so the residual dense layers must have the same width.
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 2
# Every tokenised comment is padded/truncated to this many tokens.
MAX_LEN = 220

# Identity columns that are thresholded to booleans and used for sample weights.
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness',
]
TEXT_COLUMN = 'comment_text'
TARGET_COLUMN = 'target'

# Characters the tokenizer strips from the text. The backslash before the
# multiplication sign is written as an explicit `\\` (the original `\×` was
# an invalid escape sequence); the resulting string value is unchanged.
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—'

In [4]:
def get_coefs(word, *arr):
    """Return ``(word, vector)`` where ``vector`` is ``arr`` as a float32 array."""
    return word, np.asarray(arr, dtype='float32')


def _is_vec_header(fields):
    """Tell whether ``fields`` looks like a ``<count> <dim>`` .vec header line."""
    return len(fields) == 2 and all(field.isdigit() for field in fields)


def load_embeddings(path):
    """Load a text embedding file into a ``{word: float32 vector}`` dict.

    Each line is expected to be ``word v1 v2 ... vN`` separated by single
    spaces. Fields are split on ``' '`` only, as in the original code.

    Args:
        path: Path to the embedding text file.

    Returns:
        dict mapping each word to its numpy float32 vector. If a word occurs
        more than once, the last occurrence wins.
    """
    embeddings = {}
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding.
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            fields = line.strip().split(' ')
            # fastText .vec files start with a "<count> <dim>" header; parsed
            # as data it would become a bogus word with a 1-element vector
            # that broadcasts across a whole embedding row.
            if line_no == 0 and _is_vec_header(fields):
                continue
            # Skip blank or vector-less lines rather than storing empty arrays.
            if len(fields) < 2:
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings


def build_matrix(word_index, path, embedding_dim=300):
    """Build an embedding matrix aligned with a tokenizer's word index.

    Args:
        word_index: Mapping of word -> integer row index.
        path: Path to the embedding text file, read via ``load_embeddings``.
        embedding_dim: Length of each embedding vector. Defaults to 300,
            the value that was previously hard-coded.

    Returns:
        numpy array of shape ``(len(word_index) + 1, embedding_dim)``. Row
        ``i`` holds the vector for the word mapped to ``i``; rows for words
        missing from the embedding file stay all-zero.
    """
    embedding_index = load_embeddings(path)
    # One extra row because the matrix is indexed directly by the values in
    # word_index (presumably 1-based Keras Tokenizer indices, which would
    # leave row 0 for padding -- confirm against the tokenizer in use).
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        vector = embedding_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix
    

def build_model(embedding_matrix, spatial_dropout):
    """Assemble and compile the stacked bidirectional LSTM classifier.

    Args:
        embedding_matrix: 2-D array whose shape is passed to ``Embedding`` as
            (input_dim, output_dim) and whose values become the frozen
            embedding weights.
        spatial_dropout: Rate handed to ``SpatialDropout1D`` after embedding.

    Returns:
        A Keras ``Model`` taking variable-length token-id sequences and
        emitting one sigmoid output, compiled with binary cross-entropy
        and the Adam optimizer.
    """
    words = Input(shape=(None,))

    # Frozen pre-trained embeddings followed by spatial dropout.
    embedding_layer = Embedding(
        *embedding_matrix.shape,
        weights=[embedding_matrix],
        trainable=False,
    )
    sequence_features = SpatialDropout1D(spatial_dropout)(embedding_layer(words))

    # Two stacked bidirectional LSTMs, both returning full sequences.
    for _ in range(2):
        recurrent = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))
        sequence_features = recurrent(sequence_features)

    # Collapse the time axis with max- and average-pooling side by side.
    max_pooled = GlobalMaxPooling1D()(sequence_features)
    avg_pooled = GlobalAveragePooling1D()(sequence_features)
    hidden = concatenate([max_pooled, avg_pooled])

    # Two residual dense blocks: hidden <- hidden + relu(Dense(hidden)).
    for _ in range(2):
        dense_out = Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)
        hidden = add([hidden, dense_out])

    result = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=words, outputs=result)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [5]:
# Load the tab-separated training and dev splits.
train_df, dev_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        '../input/floatallallcols/all/all/train.tsv',
        '../input/floatallallcols/all/all/dev.tsv',
    )
)

In [7]:
# Raw comment text (as str) for training and dev; labels stay as the
# original float scores.
x_train = train_df[TEXT_COLUMN].astype(str)
y_train = train_df[TARGET_COLUMN].values
x_test = dev_df[TEXT_COLUMN].astype(str)

# 'target' keeps its float values; 'target_f' is a copy that, together with
# the identity columns, is thresholded at 0.5 into booleans below
# (missing values compare as False).
train_df['target_f'] = train_df['target']

for bool_column in [*IDENTITY_COLUMNS, 'target_f']:
    is_positive = train_df[bool_column] >= 0.5
    train_df[bool_column] = np.where(is_positive, True, False)

In [8]:
# Preview the first rows of the training frame (notebook cell output).
train_df.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,target_f
0,6229878,0.0,"Beyond the non-existent customer service, the ...",0.0,0.0,0.0,0.0,0.0,,,,False,,False,False,,,False,,False,,False,False,,,,,,,False,,False,2017-10-26 20:35:54.384494+00,22,6229846.0,392835,rejected,0,0,0,0,0,0.0,0,4,False
1,6116726,0.5,Lol. The Idiocracy pResidency continues...,0.0,0.0,0.0,0.3,0.0,,,,False,,False,False,,,False,,False,,False,False,,,,,,,False,,False,2017-10-10 10:49:38.942637+00,102,,387117,approved,0,0,0,0,0,0.0,0,10,True
2,6157047,0.0,"""someone had jammed a British Fantasy Series M...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.0,False,False,0.0,0.0,False,0.0,False,0.0,False,False,0.0,0.0,0.0,0.0,0.0,0.0,False,0.0,False,2017-10-16 17:11:57.729415+00,102,,389243,approved,0,0,0,0,0,0.0,4,6,False
3,719889,0.0,"""Social contract,"" ""will of the gods,"" ""divine...",0.0,0.0,0.0,0.0,0.0,,,,False,,False,False,,,False,,False,,False,False,,,,,,,False,,False,2016-12-16 20:53:59.435042+00,22,,156978,approved,0,0,0,2,2,0.0,0,4,False
4,1019477,0.166667,"Well, perhaps the money to fix this should com...",0.0,0.0,0.166667,0.0,0.0,,,,False,,False,False,,,False,,False,,False,False,,,,,,,False,,False,2017-02-20 15:11:07.041939+00,54,,169343,approved,1,0,0,4,4,0.0,0,6,False


In [9]:
# Fit one tokenizer on train + dev text, dropping the characters listed in
# CHARS_TO_REMOVE.
tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE)
tokenizer.fit_on_texts([*x_train, *x_test])

# Convert each split to integer sequences, then pad to MAX_LEN tokens.
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(train_sequences, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(test_sequences, maxlen=MAX_LEN)

In [10]:
# Persist the fitted tokenizer so the same vocabulary can be reloaded later.
serialized_tokenizer = pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL)
with open('tokenizer.pickle', 'wb') as pickle_file:
    pickle_file.write(serialized_tokenizer)

In [11]:
# Per-example sample weights for the weighted cross-entropy loss.
# The earlier version referenced `df` and `identity_cols`, which are not
# defined in this file, and negated the float 'target' column; it now uses
# `train_df`, `IDENTITY_COLUMNS` and the boolean 'target_f' column.
identity_flags = train_df[IDENTITY_COLUMNS]
is_toxic = train_df['target_f']
true_identity_count = identity_flags.sum(axis=1)

# Start every example at weight 1; kept as a Series so `.values` works later.
sample_weights = pd.Series(
    np.ones(len(train_df), dtype=np.float32), index=train_df.index)
# Increase the sample weight by the number of true identity columns
sample_weights += true_identity_count
# If the target is true increase the weight by the number of 'FALSE' identity columns
sample_weights += is_toxic * (~identity_flags).sum(axis=1)
# If the target is false, increase the weight by the number of 'TRUE' identity columns, multiplied by 5.
sample_weights += (~is_toxic) * true_identity_count * 5
# Normalise so the mean weight is 1
sample_weights /= sample_weights.mean()

In [14]:
# Build one matrix per embedding file and join them along the feature axis,
# so each word's row is its vectors from every file placed end to end.
per_file_matrices = []
for embedding_path in EMBEDDING_FILES:
    per_file_matrices.append(build_matrix(tokenizer.word_index, embedding_path))
embedding_matrix = np.concatenate(per_file_matrices, axis=-1)

In [15]:
def run_model(spatial_dropout):
    """Train the classifier for ``EPOCHS`` epochs and save it to disk.

    The model is built once and then fitted one epoch at a time, so the
    learning rate can be decayed per epoch (1e-3 * 0.55 ** epoch) while the
    weights carry over between epochs. Previously the model was rebuilt
    inside the loop, which discarded all earlier training.

    Reads the module-level ``embedding_matrix``, ``x_train``, ``y_train`` and
    ``sample_weights``. Writes the architecture to ``model.json`` and the
    weights to ``model.h5`` in the working directory.

    Args:
        spatial_dropout: Dropout rate forwarded to ``build_model``.
    """
    model = build_model(embedding_matrix, spatial_dropout)
    # summary() prints the table itself; wrapping it in print() only added
    # a stray "None" line.
    model.summary()

    # Accept either a pandas Series or a plain array of weights.
    weights = np.asarray(sample_weights)

    for global_epoch in range(EPOCHS):
        # The lambda is consumed inside this fit() call, so closing over the
        # loop variable is safe. Do not bind it as a default argument:
        # Keras may call the schedule with (epoch, lr) and would overwrite it
        # (confirm against the installed Keras version).
        lr_schedule = LearningRateScheduler(
            lambda _: 1e-3 * (0.55 ** global_epoch))
        model.fit(
            x_train,
            y_train,
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=1,
            sample_weight=weights,
            callbacks=[lr_schedule],
        )

    # Serialize the architecture to JSON.
    with open("model.json", "w") as json_file:
        json_file.write(model.to_json())
    # Serialize weights to HDF5.
    model.save_weights("model.h5")
    print("Saved model to disk")