In [None]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which datasets are mounted under the Kaggle input directory.
input_dir = "../input"
print(os.listdir(input_dir))

# Any results you write to the current directory are saved as output.

In [None]:
# Maximum number of training rows kept; the test set is always read in full.
trn_limit = 1400000

train_csv = '../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv'
test_csv = '../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv'

# Read the whole training file, then keep only the first `trn_limit` rows.
train_df = pd.read_csv(train_csv).iloc[:trn_limit]
test_df = pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the (row-limited) training frame.
train_df.head()

In [None]:
# Pre-trained word-vector files; one embedding matrix is built per file and
# the matrices are concatenated column-wise later in the notebook.
EMBEDDING_FILES = [
    '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec',
    '../input/glove840b300dtxt/glove.840B.300d.txt'
]
NUM_MODELS = 2          # number of independently built toxicity models
BATCH_SIZE = 512
LSTM_UNITS = 128
# The models add the pooled LSTM features to Dense(DENSE_HIDDEN_UNITS) outputs,
# so this must equal the pooled width (max-pool + avg-pool of a bidirectional
# LSTM output, i.e. 4 * LSTM_UNITS with the default concat merge).
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 4              # single-epoch fit() calls per toxicity model
MAX_LEN = 220           # token sequences are padded/truncated to this length
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]
AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
TEXT_COLUMN = 'comment_text'
TARGET_COLUMN = 'target'
# Characters stripped by the tokenizer. The backslash is listed once via `\\`;
# the former `\×` was an invalid escape sequence (warning on modern Python)
# that only contributed a duplicate backslash, so the filtered set is unchanged.
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€×™√²—'


In [None]:
def get_coefs(word, *arr):
    """Pair an embedding token with its coefficients as a float32 vector.

    :param word: the token (first field of an embedding-file line).
    :param arr: the remaining fields, convertible to float.
    :return: ``(word, vector)`` where ``vector`` is a 1-D float32 array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector


def load_embeddings(path):
    """Load a whitespace-separated word-vector text file into a dict.

    Each data line is ``<word> <v1> <v2> ...``. Lines carrying fewer than two
    coefficients are skipped: this drops the ``<count> <dim>`` header of
    fastText ``.vec`` files (and blank lines), which would otherwise be stored
    as a word with a 1-element vector and later broadcast across a whole
    embedding row if that token occurs in the vocabulary.

    :param path: path to the embedding text file (read as UTF-8).
    :return: dict mapping word -> 1-D float32 numpy array.
    """
    embeddings = {}
    # Explicit encoding: the vector files are UTF-8 and must not depend on
    # the platform's default locale.
    with open(path, encoding='utf-8') as f:
        for line in f:
            word, *coefs = line.strip().split(' ')
            if len(coefs) < 2:
                continue  # header or malformed line, not a real vector
            embeddings[word] = np.asarray(coefs, dtype='float32')
    return embeddings


def build_matrix(word_index, path):
    """Build an embedding matrix for a vocabulary from one word-vector file.

    :param word_index: mapping word -> integer row index.
    :param path: embedding file passed to ``load_embeddings``.
    :return: float array of shape ``(len(word_index) + 1, 300)``; row ``i``
        holds the vector of the word with index ``i``. Words missing from the
        file keep an all-zero row.
    """
    vectors = load_embeddings(path)
    # One row more than the vocabulary size (presumably because indices start
    # at 1 with 0 reserved for padding -- confirm against the tokenizer).
    matrix = np.zeros((len(word_index) + 1, 300))
    for token, row in word_index.items():
        vector = vectors.get(token)
        if vector is not None:
            matrix[row] = vector
    return matrix
    


In [None]:
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, GRU, Embedding, concatenate, Flatten, Lambda
from keras.models import Model


def category_embedding(cat_vars):
    """Build a model that embeds each categorical variable and concatenates the results.

    :param cat_vars: one ``(num_categories, emb_sz)`` pair per categorical
        variable, e.g. ``[(3, 50), (3, 50), (3, 50)]``.
        As a rule of thumb (fast.ai): ``emb_sz = min(50, (num_categories + 1) / 2)``.
        Example: days = 0..6, then ``emb_sz = min(50, 8 / 2) = 4``.
    :type cat_vars: sequence of (int, int)
    :return: model taking one input of per-sample shape ``(len(cat_vars), 1)``
        and returning the concatenated, flattened embeddings.
    :rtype: keras.models.Model
    """
    num_vars = len(cat_vars)

    # Inputs
    input_l = Input(shape=[num_vars, 1])

    # Category inputs, sliced with Lambda layers (slicing the tensor directly
    # would not produce a Keras layer output).
    # The index is bound as a default argument: a plain `lambda x: x[:, i]`
    # late-binds `i`, so every Lambda would slice the LAST variable whenever
    # the function is evaluated again after the comprehension has finished.
    category = [Lambda(lambda x, idx=i: x[:, idx])(input_l) for i in range(num_vars)]

    # One embedding layer per variable.
    emb_category = [
        Embedding(cat_vars[i][0], cat_vars[i][1])(category[i])
        for i in range(num_vars)
    ]

    # Each slice has per-sample shape (1,), so each embedding output is
    # (1, emb_sz); flatten away the extra length-1 axis.
    emb_outs = [Flatten()(emb) for emb in emb_category]

    # Concatenated layer. `concatenate` requires at least two inputs, so a
    # single variable is passed through unchanged.
    # TODO: try average pooling and a learnable (Dense) merge
    concat_l = emb_outs[0] if len(emb_outs) == 1 else concatenate(emb_outs)

    return Model(input_l, concat_l)


In [None]:


# Raw comment text plus target / auxiliary labels, read before the
# thresholding below.
# NOTE(review): re-running this cell re-reads the already thresholded target
# column, so run it once per freshly loaded `train_df`.
x_train = train_df[TEXT_COLUMN].astype(str)
y_train = train_df[TARGET_COLUMN].values
y_aux_train = train_df[AUX_COLUMNS].values

x_test = test_df[TEXT_COLUMN].astype(str)

# Threshold identity and target scores at 0.5.
# `NaN >= 0.5` is False, so a plain boolean write-back would silently turn
# missing identity labels into "identity not mentioned" and make the later
# isnull()/dropna() on the identity columns no-ops. The boolean view is
# therefore kept locally for the sample weights, while `train_df` receives
# 0.0 / 1.0 with NaN preserved for rows that have no identity label.
identity_raw = train_df[IDENTITY_COLUMNS]
identity_flags = identity_raw >= 0.5
target_flags = train_df[TARGET_COLUMN] >= 0.5

train_df[TARGET_COLUMN] = target_flags
train_df[IDENTITY_COLUMNS] = identity_flags.astype('float32').where(identity_raw.notna())

# Fit the vocabulary on train and test text together, then convert each
# comment into a fixed-length sequence of word indices.
tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE)
tokenizer.fit_on_texts(list(x_train) + list(x_test))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)


# Per-sample weights, starting from 1 for every comment:
#   + number of identities mentioned,
#   + for toxic comments, number of identities NOT mentioned,
#   + for non-toxic comments, 5 x number of identities mentioned,
# then rescaled to mean 1.
sample_weights = np.ones(len(x_train), dtype=np.float32)
sample_weights += identity_flags.sum(axis=1)
sample_weights += target_flags * (~identity_flags).sum(axis=1)
sample_weights += (~target_flags) * identity_flags.sum(axis=1) * 5
sample_weights /= sample_weights.mean()



In [None]:
# Number of distinct tokens the tokenizer has indexed.
len(tokenizer.word_index)

In [None]:
# Build one matrix per embedding file and join them along the last axis, so
# each word row holds the vectors from every file side by side
# (300 columns per file, as produced by build_matrix).
embedding_matrix = np.concatenate([build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)

# Identity model

## Data

In [None]:
# (rows, number of identity columns) of the identity-label sub-frame.
train_df[IDENTITY_COLUMNS].shape


In [None]:
# Count of missing values in each identity column.
train_df[IDENTITY_COLUMNS].isnull().sum()

In [None]:
# Keep only rows where every identity column and the comment text are present.
id_columns = IDENTITY_COLUMNS + [TEXT_COLUMN]
id_train_df = train_df[id_columns].dropna()
id_train_df.shape

In [None]:
# Targets: one column per identity.
y_id_train = id_train_df[IDENTITY_COLUMNS].values

# Inputs: comments encoded with the already-fitted tokenizer and
# padded/truncated to MAX_LEN tokens.
x_id_train = sequence.pad_sequences(
    tokenizer.texts_to_sequences(id_train_df[TEXT_COLUMN]),
    maxlen=MAX_LEN,
)

In [None]:
# Sanity check: inputs and targets of the identity model, one shape per line.
print(x_id_train.shape, y_id_train.shape, sep='\n')

In [None]:
def build_id_model(embedding_matrix, num_identity_targets):
    """Build and compile the identity-prediction network.

    Architecture: frozen pre-trained embedding -> spatial dropout -> two
    stacked bidirectional CuDNN LSTMs -> concatenated global max/average
    pooling -> two residual dense blocks -> sigmoid output.

    :param embedding_matrix: 2-D array ``(vocab_size, embedding_dim)`` used as
        fixed (non-trainable) embedding weights.
    :param num_identity_targets: number of sigmoid output units, one per
        identity column (multi-label: each unit is an independent probability).
    :return: compiled Keras model (binary cross-entropy, Adam, accuracy metric).
    """
    vocab_size, embedding_dim = embedding_matrix.shape

    token_ids = Input(shape=(None,))
    features = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(token_ids)
    features = SpatialDropout1D(0.2)(features)
    for _ in range(2):
        features = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(features)

    # Summarise the sequence with both its max and its mean over time.
    max_pooled = GlobalMaxPooling1D()(features)
    avg_pooled = GlobalAveragePooling1D()(features)
    pooled = concatenate([max_pooled, avg_pooled])

    # Two residual blocks; Dense width must match the pooled width for add().
    for _ in range(2):
        pooled = add([pooled, Dense(DENSE_HIDDEN_UNITS, activation='relu')(pooled)])

    # Multi-label head: one independent sigmoid per identity.
    identity_probs = Dense(num_identity_targets, activation='sigmoid')(pooled)

    id_net = Model(inputs=token_ids, outputs=identity_probs)
    id_net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return id_net

In [None]:
checkpoint_predictions = []
weights = []
from keras.callbacks import ModelCheckpoint, Callback
model_name = 'id_model'
# Save the model to ./id_model.h5 whenever the validation accuracy improves.
checkpoint = ModelCheckpoint(filepath='./'+model_name+'.h5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')

callbacks_lst = [checkpoint]

id_model = build_id_model(embedding_matrix, y_id_train.shape[-1])
# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" line).
id_model.summary()
# Train for two epochs, validating on the last 20% of the rows.
id_model.fit(
    x_id_train,
    y_id_train,
    batch_size=BATCH_SIZE,
    epochs=2,
    verbose=2,
    validation_split=0.2,
    callbacks=callbacks_lst
)



# Toxicity model

In [None]:
# Shape of the padded training sequences: (rows, MAX_LEN).
x_train.shape

## Prepare id features input

In [None]:
# Identity probabilities for every training comment, used as the second input
# of the toxicity model. An explicit large batch size (the same one used for
# test-time prediction in the training loop) avoids Keras' default of 32,
# which is needlessly slow over the full training set.
x_id_train = id_model.predict(x_train, batch_size=2048)
x_id_train.shape

In [None]:
# Identity probabilities for every test comment. An explicit large batch size
# (the same one used for test-time prediction in the training loop) avoids
# Keras' default of 32.
x_id_test = id_model.predict(x_test, batch_size=2048)
x_id_test.shape

## Model

In [None]:


def build_model(embedding_matrix, num_aux_targets, num_identity_targets):
    """Build and compile the two-input toxicity network.

    The text branch (frozen embedding -> spatial dropout -> two bidirectional
    CuDNN LSTMs -> max/avg pooling -> two residual dense blocks) is
    concatenated with a dense projection of the identity features, and two
    sigmoid heads are placed on top.

    :param embedding_matrix: 2-D array ``(vocab_size, embedding_dim)`` used as
        fixed (non-trainable) embedding weights.
    :param num_aux_targets: number of units of the auxiliary multi-label head.
    :param num_identity_targets: width of the identity-feature input vector.
    :return: compiled Keras model with inputs ``[words, ids]`` and outputs
        ``[result, aux_result]`` (binary cross-entropy, Adam, accuracy metric).
    """
    # Text input
    words = Input(shape=(None,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    # Residual dense blocks; Dense width must match the pooled width for add().
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    x_text = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])

    # Identity inputs: linear projection of the identity feature vector.
    ids = Input(shape=(num_identity_targets,))
    x_id = Dense(DENSE_HIDDEN_UNITS)(ids)

    hidden = concatenate([x_id, x_text])
    result = Dense(1, activation='sigmoid')(hidden)  # binary toxicity head
    aux_result = Dense(num_aux_targets, activation='sigmoid')(hidden)  # multi-label auxiliary head
    # The former third Dense head for identities was never wired into the
    # model outputs, so it has been removed.

    model = Model(inputs=[words, ids], outputs=[result, aux_result])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
checkpoint_predictions = []
weights = []

from keras.callbacks import ModelCheckpoint, Callback
model_name = 'toxity_model'
# The fit() calls below use no validation data, so no `val_*` metric is ever
# logged and a checkpoint monitoring `val_acc` would never save anything.
# Monitor the training loss instead so the callback actually fires.
# NOTE(review): the one callback instance is shared by every fit() call, so
# its "best" value presumably carries over across epochs and models -- confirm
# this is the intended behaviour.
checkpoint = ModelCheckpoint(filepath='./'+model_name+'.h5', monitor='loss', verbose=1, save_best_only=True, mode='min')

callbacks_lst = [checkpoint]

for model_idx in range(NUM_MODELS):
    model = build_model(embedding_matrix, y_aux_train.shape[-1], y_id_train.shape[-1])
    for global_epoch in range(EPOCHS):
        # One epoch per fit() call so the test set can be predicted after
        # every epoch and the learning rate decayed per global epoch.
        model.fit(
            [x_train, x_id_train],
            [y_train, y_aux_train],
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=2,
            # Custom weights for the main target, uniform for the auxiliary head.
            sample_weight=[sample_weights.values, np.ones_like(sample_weights)],
            callbacks=[
                LearningRateScheduler(lambda _: 1e-3 * (0.55 ** global_epoch)),
            ] + callbacks_lst
        )
        # Keep the main-head test predictions of this epoch; later epochs get
        # exponentially larger weight in the final average.
        checkpoint_predictions.append(model.predict([x_test, x_id_test], batch_size=2048)[0].flatten())
        weights.append(2 ** global_epoch)

# Weighted average over all epochs of all models.
predictions = np.average(checkpoint_predictions, weights=weights, axis=0)


# Submission

In [None]:
# Two-column submission file: test comment id and averaged toxicity score.
submission = pd.DataFrame({'id': test_df['id'], 'prediction': predictions})
submission.to_csv('submission.csv', index=False)