In [1]:
import warnings
warnings.filterwarnings("ignore")
import os
import itertools

import numpy as np
import random as rn
import pandas as pd
import tensorflow as tf

import pickle

import random

from sklearn.externals import joblib
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

from gensim.models import KeyedVectors
from keras.layers import Dense, GRU, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, MaxPooling1D, AveragePooling1D
from keras.models import Model, Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
# setting seed to take care of reproducibility issues (unfortunately only partial solution)
# ref: https://keras.io/getting-started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development
def set_seed(s):
    """Seed the random number generators used by this notebook.

    NumPy's global RNG, Python's ``random`` module (imported as ``rn``) and
    TensorFlow's graph-level seed are all set to `s`; ``PYTHONHASHSEED`` is
    written to ``os.environ`` with the fixed value ``'0'``.
    """
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # assigning it here presumably does not change hashing in the running
    # kernel — confirm, and export it before launching Jupyter if it matters.
    os.environ['PYTHONHASHSEED'] = '0'
    for seed_fn in (np.random.seed, rn.seed, tf.set_random_seed):
        seed_fn(s)

# helper function to get the name of the train column in the preprocessed dataset 
# ref: preprocessing.ipynb
def get_col_name(maxlen=150, max_features=30000, cut=False, lower=False):
    """Build the name suffix that identifies one preprocessing variant.

    The result is ``'<maxlen>_<max_features>_<case>'`` where ``<case>`` is
    ``'lower'`` when `lower` is truthy and ``'upper'`` otherwise, with
    ``'_cut'`` appended when `cut` is truthy.
    """
    case = 'lower' if lower else 'upper'
    postfix = (case + '_cut') if cut else case
    return '{}_{}_{}'.format(maxlen, max_features, postfix)

# helper function to read word embeddings
def get_entry(word, *arr):
    """Turn the fields of one embeddings-file line into a ``(word, vector)`` pair.

    `arr` holds every field after the word and is converted to a float32
    NumPy array. If that conversion raises, the placeholder pair
    ``('None', 'None')`` is returned instead of propagating the error.
    """
    try:
        vector = np.asarray(arr, dtype='float32')
    except Exception:
        # Unparseable line: hand back a marker instead of raising.
        return 'None', 'None'
    return word, vector

# helper function to restore preprocessed values
def parse_to_ints(line):
    """Parse a stringified sequence such as ``'[1 2 3]'`` into a list of ints.

    The first and last characters of `line` are dropped and the remainder is
    split on whitespace before each token is converted with ``int``.
    """
    return [int(token) for token in line[1:-1].split()]

# reading embeddings from a file and transforming them into dict - {word:vector}
def get_embeddings_index(filename):
    """Read a text embeddings file into a ``{word: vector}`` dict.

    The file ``embeddings_path + filename`` is read line by line; each line is
    split on whitespace and passed to ``get_entry``. Only entries whose vector
    has exactly ``dim`` components are kept, which also drops the
    ``('None', 'None')`` placeholder that ``get_entry`` returns for lines it
    cannot parse.

    ``embeddings_path`` and ``dim`` are globally defined variables.

    Parameters
    ----------
    filename : str
        Name of the embeddings file inside ``embeddings_path``.

    Returns
    -------
    dict
        Mapping from word to its float32 NumPy vector of length ``dim``.
    """
    # The context manager closes the file even if parsing fails part-way.
    # The file is decoded as UTF-8 rather than with the platform default.
    with open(embeddings_path + filename, encoding='utf-8') as handle:
        fields_per_line = (line.split() for line in handle)
        # Blank lines yield no fields and would make get_entry() fail on its
        # required `word` argument, so they are skipped.
        index = dict(get_entry(*fields) for fields in fields_per_line if fields)
    return {word: vector for word, vector in index.items() if len(vector) == dim}

# transforming embedding index into embedding matrix with shape (max_features, dim) 
# that will serve as an initialization to the Embedding layer of LSTM or GRU neural network 
# dim is a globally defined variable
def get_embedding_matrix(embeddings_index, tokenizer):
    """Build the initial weight matrix for the Embedding layer.

    Every row is first drawn from a normal distribution whose mean and
    standard deviation are those of all pretrained vectors; rows whose word
    has a pretrained vector are then overwritten with it. Words whose
    tokenizer index is ``>= max_features`` are ignored.

    ``max_features`` and ``dim`` are globally defined variables.

    Parameters
    ----------
    embeddings_index : dict
        Mapping word -> vector of length ``dim``.
    tokenizer :
        Object exposing ``word_index``, a mapping word -> integer row index.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_features, dim)``, the shape that
        ``Embedding(max_features, dim, weights=[...])`` is given in the
        model builders.

    Raises
    ------
    ValueError
        If `embeddings_index` is empty.
    """
    if not embeddings_index:
        raise ValueError('embeddings_index is empty; no vectors to build the matrix from')

    # np.stack needs a real sequence; a dict view is rejected by newer NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    # Always allocate max_features rows. The previous
    # min(max_features, len(word_index)) sizing could produce fewer rows than
    # the largest index let through by the `i >= max_features` guard below
    # (IndexError) and fewer rows than the Embedding layer is declared with.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, dim))

    for word, i in tokenizer.word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

# compiling LSTM model
# dim is a globally defined variable
def _build_recurrent_model(cell, embedding_matrix, n, dropout, recurrent_dropout, pool):
    """Assemble and compile Embedding -> Bidirectional(cell) -> pooling -> Dense(6).

    `cell` is the recurrent layer class to wrap (``LSTM`` or ``GRU``).
    ``maxlen``, ``max_features`` and ``dim`` are globally defined variables.

    Raises ValueError when `pool` is not one of the supported pooling names;
    previously an unknown value silently skipped pooling, so the Dense layer
    was applied to the un-pooled sequence output.
    """
    pooling_layers = {
        'global_max': GlobalMaxPool1D,
        'global_average': GlobalAveragePooling1D,
    }
    if pool not in pooling_layers:
        raise ValueError(
            "pool must be one of {}, got {!r}".format(sorted(pooling_layers), pool))

    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, dim, weights=[embedding_matrix])(inp)
    x = Bidirectional(cell(n, return_sequences=True, dropout=dropout,
                           recurrent_dropout=recurrent_dropout))(x)
    x = pooling_layers[pool]()(x)
    # one sigmoid output per label
    x = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# compiling LSTM model
# dim is a globally defined variable
def get_model_lstm(embedding_matrix, n=200, dropout=0.2, recurrent_dropout=0.1, pool='global_max'):
    """Return a compiled bidirectional-LSTM classifier.

    Parameters
    ----------
    embedding_matrix : array
        Initial weights for the Embedding layer, shape ``(max_features, dim)``.
    n : int
        Number of units of the LSTM layer.
    dropout, recurrent_dropout : float
        Passed through to the LSTM layer.
    pool : {'global_max', 'global_average'}
        Pooling applied over the sequence output.

    Raises
    ------
    ValueError
        If `pool` is not a supported pooling name.
    """
    return _build_recurrent_model(LSTM, embedding_matrix, n, dropout, recurrent_dropout, pool)


# compiling GRU model
# dim is a globally defined variable
def get_model_gru(embedding_matrix, n=200, dropout=0.2, recurrent_dropout=0.1, pool='global_max'):
    """Return a compiled bidirectional-GRU classifier.

    Same parameters and errors as ``get_model_lstm``; only the recurrent
    layer class differs.
    """
    return _build_recurrent_model(GRU, embedding_matrix, n, dropout, recurrent_dropout, pool)

# helper function to get performance evaluation
# result_df is a DataFrame with predefined columns
# prefix is a list of any values that should be written before performance stats:
# example: prefix = [n,recurrent_dropout,dropout,pooling]
def print_performance_stats(y_test, y_pred, result_df, prefix, verbose=0):
    """Score predictions per class and add one result row to `result_df`.

    For every class in the global ``list_classes`` the ROC AUC is computed on
    the raw predictions, and precision, recall and F1 on the rounded
    predictions. The row is ``prefix`` followed by
    ``[auc, precision, recall, f1]`` for each class and finally the mean AUC;
    it is labelled by zipping with the global ``columns``.

    Parameters
    ----------
    y_test, y_pred :
        Objects indexable by class name (e.g. DataFrames) holding the true
        labels and the predicted scores.
    result_df : pandas.DataFrame
        Table of earlier results; it is not modified.
    prefix : list
        Values written before the metrics (e.g. the explored parameters).
        The list is copied, so the caller's list is left untouched.
    verbose : int
        When truthy, per-class metrics are printed as well.

    Returns
    -------
    pandas.DataFrame
        A new frame: `result_df` plus the new row.
    """
    # Work on a copy: extending `prefix` in place would grow the caller's list.
    row = list(prefix)
    scores = []

    for c in list_classes:
        y_true = y_test[c]
        y_score = y_pred[c]
        y_label = y_score.round()  # rounded once, shared by the three metrics

        precision = precision_score(y_true, y_label)
        recall = recall_score(y_true, y_label)
        f1 = f1_score(y_true, y_label)
        score = roc_auc_score(y_true, y_score)
        scores.append(score)

        row += [score, precision, recall, f1]

        if verbose:
            print('roc_auc_score score on {0}: {1}'.format(c, score))
            print('Precision/Recall on {0}: {1}/{2}'.format(c, precision,recall))
            print('F1 score on {0}: {1}'.format(c, f1))
            print('')

    # Average over the classes actually scored instead of a hard-coded 6.
    final_score = sum(scores) / len(scores) if scores else float('nan')
    row.append(final_score)

    res = dict(zip(columns, row))
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    new_row = pd.DataFrame([res], columns=list(res))
    result_df = pd.concat([result_df, new_row], ignore_index=True)

    print('Final roc_auc_score score: {0}'.format(final_score))

    return result_df
    
# helper function to get train-test split based on (1) predefined in preprocessing stage mask
# and parameters like augmentation, lowercase etc.
def get_train_test(augmented, cname):
    """Load the preprocessed data and split it with the stored ``split`` column.

    Reads ``TRAIN_DATA_FILE``; when `augmented` is falsy only rows whose
    ``lang`` is ``'en'`` are kept. Column `cname` is parsed from its string
    form with ``parse_to_ints``. Rows where ``split`` is true go to the
    training set, the remaining rows to the test set.

    Returns ``(X_train, X_test, y_train, y_test)``: the X values and
    ``y_train`` are NumPy arrays, while ``y_test`` is a DataFrame with the
    ``list_classes`` columns.
    """
    frame = pd.read_csv(TRAIN_DATA_FILE)
    if not augmented:
        frame = frame[frame['lang'] == 'en']

    print('Size of the dataset: {}'.format(len(frame)))

    frame[cname] = frame[cname].apply(parse_to_ints)

    # presumably a boolean column written by the preprocessing notebook — verify
    in_train = frame['split']
    sequences = frame[cname]
    labels = frame[list_classes]

    X_train = np.array(list(sequences[in_train]))
    X_test = np.array(list(sequences[~in_train]))

    y_train = labels.values[in_train]
    y_test = labels[~in_train]
    return X_train, X_test, y_train, y_test

**BASIC SETUP** 

In [3]:
# preprocessed data file with transformed and padded text
# (read by get_train_test(), which expects the padded_* columns plus 'split', 'lang' and the label columns)
TRAIN_DATA_FILE = 'data/train_pre.csv'

In [4]:
# Sources of the pretrained embeddings referenced below:
#word2vec - https://code.google.com/archive/p/word2vec/
#glove - https://nlp.stanford.edu/projects/glove/
#fastText - https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
model_path = 'models/'
# prefix that get_embeddings_index() prepends to an embeddings file name
# NOTE(review): 'emdeddings/' looks like a misspelling of 'embeddings/' — confirm against the directory on disk before renaming
embeddings_path = 'emdeddings/'
# NOTE(review): presumably the binary word2vec file; get_embeddings_index() parses whitespace-separated text lines — confirm before passing it there
google_news_300_w2v = 'GoogleNews-vectors-negative300.bin'
wiki_300_ft = 'wiki.en.vec'
twitter_100_glove = 'glove.twitter.27B.100d.txt'
twitter_200_glove = 'glove.twitter.27B.200d.txt'
wiki_100_glove = 'glove.6B.100d.txt'
wiki_200_glove = 'glove.6B.200d.txt'
wiki_300_glove = 'glove.6B.300d.txt'
crawl_300_glove = 'glove.840B.300d.txt'

In [5]:
# label columns of the dataset; also used as the column names of the prediction frames,
# and iterated in this order by print_performance_stats() when it builds a result row
list_classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [12]:
# Metric columns written for every experiment: auc / precision / recall / f1
# for each label, followed by the overall 'final_auc'.
basic_columns = ['{}_{}'.format(label, metric)
                 for label in ('toxic', 'severe', 'obscene', 'threat', 'insult', 'identity')
                 for metric in ('auc', 'precision', 'recall', 'f1')] + ['final_auc']

**RUNNING EXPERIMENTS WITH VARIOUS PREPROCESSING PARAMETERS (MAXLEN, MAX_FEATURES etc)** 

In [7]:
# number of epochs passed to model.fit() in the experiment loop of this section
n_epochs = 2

In [8]:
# defining word embeddings
# `dim` is read as a global by get_embeddings_index(), get_embedding_matrix() and the model builders;
# get_embeddings_index() keeps only vectors of exactly this length
dim = 100
name = wiki_100_glove
# label used in the file name of the results CSV written by the experiment loop
name_for_print = 'wiki_100_glove'
embeddings_index = get_embeddings_index(name)

In [9]:
# result table for this section: the explored preprocessing parameters followed by the metric columns
# `columns` is also read as a global by print_performance_stats() to label each result row
columns = ['maxlen','max_features','augmented','cut','lower'] + basic_columns
results = pd.DataFrame(columns=columns)

In [10]:
# parameters to be explored (the trailing comments list the wider grids)
params = {
    'maxlen': [200],          # [150,200,250,300]
    'max_features': [50000],  # [30000,40000,50000,60000]
    'augmented': [True],      # [True, False]
    'is_lower': [True],       # [True, False]
    'is_cut': [False],        # [True, False]
}

# order of the values inside every combination tuple
param_names = ['maxlen','max_features','augmented','is_lower','is_cut']
combinations = list(itertools.product(*[params[key] for key in param_names]))

In [11]:
# before running this cell ensure that for every combination there is a preprocessed column in the TRAIN_DATA_FILE
# and specific tokenizer in the pickle folder
for i, comb in enumerate(combinations):

    print('\n\nRunning experiment {}/{}'.format(i+1, len(combinations)))
    print('Combination {}'.format(comb))
    print()

    # tuple order follows param_names: maxlen, max_features, augmented, is_lower, is_cut
    # (maxlen and max_features are read as globals by the helper functions)
    maxlen, max_features, augmented, lower, cut = comb

    name = get_col_name(maxlen, max_features, cut, lower)
    cname = 'padded_{}'.format(name)
    print('Column name: {}'.format(cname))
    TOKENIZER_PICKLE = 'pickles/tokenizer_{}.pickle'.format(name)

    # NOTE: unpickling runs code stored in the file; only load tokenizers you created yourself
    with open(TOKENIZER_PICKLE, 'rb') as handle:
        tokenizer = pickle.load(handle)

    X_train, X_test, y_train, y_test = get_train_test(augmented, cname)

    # the stored artefacts must match the combination being evaluated
    assert tokenizer.num_words == max_features
    assert X_train.shape[1] == maxlen

    set_seed(29)
    embedding_matrix = get_embedding_matrix(embeddings_index, tokenizer)
    model = get_model_lstm(embedding_matrix, n=100, dropout=0.3, recurrent_dropout=0.1, pool='global_max')
    model.fit(X_train, y_train, batch_size=64, epochs=n_epochs, validation_split=0.1)

    y_pred = pd.DataFrame(model.predict([X_test], batch_size=1024, verbose=1),
                          columns=list_classes)

    # same order as the leading entries of `columns`
    prefix = [maxlen, max_features, augmented, cut, lower]

    results = print_performance_stats(y_test, y_pred, result_df=results, prefix=prefix, verbose=0)
    # the CSV is rewritten after every experiment
    results.to_csv('results/results_{}.csv'.format(name_for_print), index=False)



Running experiment 1/1
Combination (200, 50000, True, True, False)

Column name: padded_200_50000_lower
Size of the dataset: 160914
Train on 123165 samples, validate on 13686 samples
Epoch 1/2
Epoch 2/2
Final roc_auc_score score: 0.9815273291281138


**RUNNING EXPERIMENTS WITH VARIOUS EMBEDDINGS** 

In [12]:
# number of epochs passed to model.fit() in the embeddings experiment loop
n_epochs = 2

In [13]:
# result table for this section: the embeddings file name followed by the metric columns
# (`columns` is read as a global by print_performance_stats())
columns = ['embeddings'] + basic_columns
results = pd.DataFrame(columns=columns)

In [14]:
# (embeddings file name, vector dimension) pairs to evaluate; the commented list below is the wider set
embeddings = [(twitter_200_glove, 200)] 
#[(wiki_100_glove, 100), (wiki_200_glove, 200), (wiki_300_glove, 300), (wiki_300_ft, 300), 
#(twitter_100_glove, 100), (twitter_200_glove, 200), (crawl_300_glove, 300)]

In [28]:
# using the best params defined in the previous step
# (maxlen and max_features are read as globals by the helper functions)
maxlen, max_features = 200, 50000
augmented, lower, cut = True, True, False

name = get_col_name(maxlen, max_features, cut, lower)
cname = 'padded_{}'.format(name)
print('Column name: {}'.format(cname))
TOKENIZER_PICKLE = 'pickles/tokenizer_{}.pickle'.format(name)

# NOTE: unpickling runs code stored in the file; only load tokenizers you created yourself
with open(TOKENIZER_PICKLE, 'rb') as handle:
    tokenizer = pickle.load(handle)

X_train, X_test, y_train, y_test = get_train_test(augmented, cname)

# the stored artefacts must match the chosen parameters
assert tokenizer.num_words == max_features
assert X_train.shape[1] == maxlen

Column name: padded_200_50000_lower
Size of the dataset: 160914


In [16]:
for i, embed in enumerate(embeddings):

    print('\n\nRunning experiment {}/{}'.format(i+1, len(embeddings)))
    print('Embeddings: {}'.format(embed))
    print()

    # each entry is (file name, vector dimension); `dim` is read as a global by the helpers
    name, dim = embed
    embeddings_index = get_embeddings_index(name)

    set_seed(29)
    embedding_matrix = get_embedding_matrix(embeddings_index, tokenizer)

    model = get_model_lstm(embedding_matrix, n=100, dropout=0.3, recurrent_dropout=0.1, pool='global_max')
    model.fit(X_train, y_train, batch_size=64, epochs=n_epochs, validation_split=0.1)

    y_pred = pd.DataFrame(model.predict([X_test], batch_size=1024, verbose=1),
                          columns=list_classes)

    # the only prefix column of this section is the embeddings file name
    prefix = [name]

    results = print_performance_stats(y_test, y_pred, result_df=results, prefix=prefix, verbose=0)
    # the CSV is rewritten after every experiment
    results.to_csv('results/embeddings.csv', index=False)



Running experiment 1/1
Embeddings: ('glove.twitter.27B.200d.txt', 200)

Train on 123165 samples, validate on 13686 samples
Epoch 1/2
Epoch 2/2
Final roc_auc_score score: 0.9861440821621373


**EXPERIMENTING WITH ARCHITECTURE** 

In [17]:
# number of epochs passed to model.fit() in the architecture experiment loop
n_epochs = 2

In [18]:
# result table for this section: the explored architecture parameters followed by the metric columns
# (`columns` is read as a global by print_performance_stats())
columns = ['n','recurrent_dropout','dropout','pooling'] + basic_columns
results = pd.DataFrame(columns=columns)

In [19]:
# using the best params defined in the previous steps 
# `dim` is read as a global by get_embeddings_index(), get_embedding_matrix() and the model builders
name = twitter_200_glove
dim = 200
embeddings_index = get_embeddings_index(name)
# get_embedding_matrix() fills rows of words without a pretrained vector from np.random,
# so seed first (as the embeddings experiment does) to make this matrix reproducible
set_seed(29)
embedding_matrix = get_embedding_matrix(embeddings_index, tokenizer)

In [20]:
# architecture parameters to be explored (the trailing comments list the wider grids)
params = {
    'n': [200],                  # [50, 75, 100, 125, 150, 175, 200, 225, 250]
    'recurrent_dropout': [0.1],  # [0.1, 0.2, 0.3, 0.4]
    'dropout': [0.2],            # [0.1, 0.2, 0.3, 0.4]
    'pooling': ['global_max'],   # ['global_max', 'global_average']
}

# order of the values inside every combination tuple
param_names = ['n','recurrent_dropout','dropout', 'pooling']
combinations = list(itertools.product(*[params[key] for key in param_names]))

In [21]:
for i, comb in enumerate(combinations):

    print('\n\nRunning experiment {}/{}'.format(i+1, len(combinations)))
    print('Combination: {}'.format(comb))
    print()

    # tuple order follows param_names: n, recurrent_dropout, dropout, pooling
    n, recurrent_dropout, dropout, pool = comb

    set_seed(29)
    model = get_model_lstm(embedding_matrix, n=n, dropout=dropout, recurrent_dropout=recurrent_dropout, pool=pool)
    model.fit(X_train, y_train, batch_size=64, epochs=n_epochs, validation_split=0.1)

    y_pred = model.predict([X_test], batch_size=1024, verbose=1)
    y_pred = pd.DataFrame(y_pred, columns=list_classes)

    # same order as the leading entries of `columns`
    prefix = [n, recurrent_dropout, dropout, pool]

    results = print_performance_stats(y_test, y_pred, result_df=results, prefix=prefix, verbose=0)
    # The path has no placeholder, so the former .format(name_for_print) call
    # was a no-op (and passed a label left over from an earlier section).
    # The CSV is rewritten after every experiment.
    results.to_csv('results/results_arch.csv', index=False)



Running experiment 1/1
Combination: (200, 0.1, 0.2, 'global_max')

Train on 123165 samples, validate on 13686 samples
Epoch 1/2
Epoch 2/2
Final roc_auc_score score: 0.9865930692161983


**TRAINING AND SAVING THE FINAL MODEL** 

In [13]:
# the final evaluation calls print_performance_stats() with an empty prefix,
# so the result row consists of the metric columns only
columns = basic_columns

In [7]:
# redefining the optimal parameters
# preprocessing variant (maxlen and max_features are read as globals by the helper functions)
maxlen = 200
max_features = 50000
augmented = True
lower = True
cut = False 

# architecture: recurrent units, dropout rates and pooling type passed to get_model_lstm()
n = 200
recurrent_dropout = 0.1
dropout = 0.2
pool = 'global_max'

# embeddings file and its vector dimension (`dim` is read as a global by the helper functions)
embed = twitter_200_glove
dim = 200

In [8]:
# resolve the preprocessed column and the matching tokenizer for the chosen parameters
name = get_col_name(maxlen, max_features, cut, lower)
cname = 'padded_{}'.format(name)
print('Column name: {}'.format(cname))
TOKENIZER_PICKLE = 'pickles/tokenizer_{}.pickle'.format(name)

# NOTE: unpickling runs code stored in the file; only load tokenizers you created yourself
with open(TOKENIZER_PICKLE, 'rb') as handle:
    tokenizer = pickle.load(handle)

X_train, X_test, y_train, y_test = get_train_test(augmented, cname)

# the stored artefacts must match the chosen parameters
assert tokenizer.num_words == max_features
assert X_train.shape[1] == maxlen

Column name: padded_200_50000_lower
Size of the dataset: 160914


In [24]:
# Seed before anything random happens: get_embedding_matrix() draws the rows of
# words without a pretrained vector from np.random, and the experiment loops
# likewise seed before building the matrix and the model.
set_seed(29)
# Reuse the helper instead of repeating its parsing/filtering inline
# (the inline copy also left the embeddings file open).
embeddings_index = get_embeddings_index(embed)
embedding_matrix = get_embedding_matrix(embeddings_index, tokenizer)
model = get_model_lstm(embedding_matrix, n=n, dropout=dropout, recurrent_dropout=recurrent_dropout, pool=pool)

In [25]:
# http://parneetk.github.io/blog/neural-networks-in-keras/
# seed the RNGs right before training; validation_split=0.1 holds out part of the training data for validation
set_seed(29)
# early stopping is currently disabled (kept for reference together with the callbacks argument below)
#earlystop = EarlyStopping(monitor='val_acc', min_delta=0.0001, patience=5, verbose=1, mode='auto')
#callbacks_list = [earlystop]
model.fit(X_train, y_train, batch_size=64, epochs=2, validation_split=0.1) # callbacks=callbacks_list

Train on 123165 samples, validate on 13686 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x11706e278>

In [26]:
# score the final model on the test split; no prefix columns are recorded here,
# so an empty frame and an empty prefix are passed
y_pred = pd.DataFrame(model.predict([X_test], batch_size=1024, verbose=1),
                      columns=list_classes)
results = print_performance_stats(y_test, y_pred, result_df=pd.DataFrame(), prefix=[], verbose=0)

Final roc_auc_score score: 0.9860371756179931


In [27]:
# write the trained final model to 'models/final_model.h5'
model.save('models/final_model.h5')