In [None]:
%load_ext autotime
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
# Both variables are set before keras/tensorflow are imported further down.
# NOTE(review): an empty CUDA_VISIBLE_DEVICES presumably hides every GPU so
# that training runs on CPU — confirm this is intended.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = ''

import numpy as np
import pandas as pd
from collections import defaultdict
import re, multiprocess, gc

import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNGRU, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras.initializers import he_uniform
import tensorflow as tf
from keras import backend as K

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score

# Seed NumPy's global RNG. No TensorFlow seed is set in the visible code, so
# model training may still vary from run to run.
np.random.seed(7)

In [None]:
# Directory holding the competition archives. The default is the original
# location; set MOVIE_GENRES_DATA_DIR to run the notebook on another machine
# without editing this cell.
DATA_DIR = os.environ.get(
    'MOVIE_GENRES_DATA_DIR',
    '/home/bananamachine/.kaggle/competitions/movie-genres-by-dialogue',
)

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv.zip'))
test = pd.read_csv(os.path.join(DATA_DIR, 'X_test.csv.zip'))

def clean_labels(text):
    """Return ``text`` with every ``-`` character removed.

    Example: ``'sci-fi drama'`` becomes ``'scifi drama'``.
    """
    return text.replace('-', '')

# Strip hyphens from the genre strings, then let CountVectorizer turn them
# into a dense count matrix with one column per genre token.
train.genres = train.genres.map(clean_labels)
label_vectorizer = CountVectorizer()
train_y = label_vectorizer.fit_transform(train.genres).todense()

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# when available and fall back to the old method on older installations.
if hasattr(label_vectorizer, 'get_feature_names_out'):
    label_cols = list(label_vectorizer.get_feature_names_out())
else:
    label_cols = label_vectorizer.get_feature_names()

In [None]:
def preprocess(text):
    """Surround punctuation with spaces so ``str.split`` yields it as tokens.

    An apostrophe that is followed by a space gets a leading space as well,
    and each of ``, . : ; " ? !`` is replaced by itself padded with one space
    on both sides. Text without any of these characters is returned unchanged
    apart from the apostrophe rule.
    """
    padded = text.replace("' ", " ' ")
    for mark in ',.:;"?!':
        # replace() leaves the string untouched when the mark is absent, and
        # padding one mark never creates or removes another, so the order in
        # which marks are processed does not affect the result.
        padded = padded.replace(mark, ' {} '.format(mark))
    return padded

In [None]:
def create_docs(df, n_gram_max=2):
    """Build one space-joined token string per row of ``df.dialogue``.

    Each dialogue is run through ``preprocess`` and split on whitespace. For
    every ``n`` from 2 to ``n_gram_max`` the consecutive ``n``-token windows
    are joined with ``'--'`` and appended after the unigrams.

    Returns a list of strings, in the same order as ``df.dialogue``.
    """
    def with_ngrams(tokens):
        # Unigrams first, then the joined n-grams grouped by increasing n.
        joined = [
            '--'.join(tokens[start:start + size])
            for size in range(2, n_gram_max + 1)
            for start in range(len(tokens) - size + 1)
        ]
        return tokens + joined

    return [
        ' '.join(with_ngrams(preprocess(dialogue).split()))
        for dialogue in df.dialogue
    ]

In [None]:
min_count = 2

docs = create_docs(train)
docs_test = create_docs(test)

tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(docs)

# Number of tokens that occur at least `min_count` times in the training docs.
num_words = sum(1 for count in tokenizer.word_counts.values() if count >= min_count)

# Previously `num_words` was computed but never handed to the tokenizer, so
# `min_count` had no effect and every token was kept. The tokenizer ranks
# tokens by frequency starting at index 1 and texts_to_sequences drops every
# index >= tokenizer.num_words, so `num_words + 1` keeps exactly the tokens
# counted above.
tokenizer.num_words = num_words + 1

docs = tokenizer.texts_to_sequences(docs)
docs_test = tokenizer.texts_to_sequences(docs_test)

# Pad/truncate every sequence to a fixed length so they stack into one array.
maxlen = 256
docs = pad_sequences(sequences=docs, maxlen=maxlen)
docs_test = pad_sequences(sequences=docs_test, maxlen=maxlen)

In [None]:
# Width of each learned token vector.
embedding_dims = 20
# Embedding table size: the largest token id in the padded training
# sequences plus one, so that id is still a valid row index.
input_dim = docs.max() + 1

In [None]:
class RocAucEvaluation(Callback):
    """Keras callback that computes validation ROC-AUC and stores it in ``logs['val_auc']``.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair used for scoring. The empty default cannot be
        unpacked, so a pair must be supplied in practice.
    interval : int
        Score every ``interval`` epochs. On the epochs in between,
        ``logs['val_auc']`` is set to ``-inf``.

    NOTE(review): callbacks that monitor ``val_auc`` presumably need to be
    listed after this one so the value is already in ``logs`` — confirm
    against the Keras callback execution order.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) would start the lookup *after* Callback and
        # therefore skip Callback.__init__ altogether.
        super(RocAucEvaluation, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_train_begin(self, logs=None):
        # Register the metric under the same key that on_epoch_end writes.
        # 'metrics' may be missing from params, so look it up defensively.
        metrics = (getattr(self, 'params', None) or {}).get('metrics')
        if metrics is not None and 'val_auc' not in metrics:
            metrics.append('val_auc')

    def on_epoch_end(self, epoch, logs=None):
        # Avoid a shared mutable default: this method writes into `logs`.
        if logs is None:
            logs = {}
        logs['val_auc'] = float('-inf')
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            logs['val_auc'] = score
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

def get_model():
    """Build and compile the binary classifier used for a single label.

    Architecture: token embedding -> average over the sequence axis -> one
    sigmoid unit. The embedding size comes from the notebook-level
    ``input_dim`` and ``embedding_dims`` values.
    """
    layers = [
        Embedding(input_dim=input_dim, output_dim=embedding_dims),
        GlobalAveragePooling1D(),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def val_run_model(train_X, train_y, val_X, val_y, test_X, params):
    """Train one model in a private TF graph/session and predict val and test.

    The model is fitted for up to 40 epochs with batch size ``params['bs']``.
    The checkpoint with the best ``val_auc`` is reloaded before predicting.

    Returns ``(pred_val_y, pred_test_y)``.
    """
    # Checkpoint/log file names are keyed by the first entry of the current
    # process identity; fall back to 0 when the identity is empty.
    try:
        worker_id = multiprocess.current_process()._identity[0]
    except IndexError:
        worker_id = 0

    checkpoint_path = 'ft_auc_checkpoint_{}.hdf5'.format(str(worker_id))
    csv_log_path = 'ft_csv_log_{}.csv'.format(str(worker_id))

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    with tf.Graph().as_default():
        with tf.Session(config=session_config) as sess:
            K.set_session(sess)

            model = get_model()

            # rocauc is listed first: it is the callback that writes
            # 'val_auc' into the epoch logs, which early_stopper and
            # checkpoint monitor.
            # lr_reducer is created without a `monitor` argument, so it uses
            # the library default rather than 'val_auc'.
            rocauc = RocAucEvaluation(validation_data=(val_X, val_y), interval=1)
            lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), cooldown=0, patience=3, min_lr=0.5e-6)
            early_stopper = EarlyStopping(monitor='val_auc', mode='max', patience=6)
            csv_logger = CSVLogger(csv_log_path)
            checkpoint = ModelCheckpoint(
                checkpoint_path,
                monitor='val_auc',
                verbose=1,
                save_best_only=True,
                mode='max',
            )

            model.fit(
                train_X,
                train_y,
                batch_size=params['bs'],
                epochs=40,
                validation_data=(val_X, val_y),
                verbose=2,
                callbacks=[rocauc, lr_reducer, early_stopper, csv_logger, checkpoint],
            )

            # Restore the best-scoring weights before predicting.
            model.load_weights(checkpoint_path)
            pred_val_y = model.predict(val_X, batch_size=512)
            pred_test_y = model.predict(test_X, batch_size=512)

            # Release session and model objects eagerly.
            sess.close()
            del sess, model, lr_reducer, early_stopper, checkpoint, csv_logger
            gc.collect()
    return pred_val_y, pred_test_y
  
def val_predict_fold(i, dev_index, val_index, params):
    """Train and evaluate label column ``i`` on a single CV fold.

    Reads the notebook-level ``docs``, ``docs_test`` and ``train_y``.

    Returns ``(auc, pred_train, pred_test_y)``:
      * ``auc`` is the ROC-AUC on the validation rows.
      * ``pred_train`` has one entry per training row, zero everywhere except
        at ``val_index``, where it holds the validation predictions.
      * ``pred_test_y`` is the test prediction returned by ``val_run_model``.
    """
    label_column = train_y[:, i]
    # np.array + squeeze flattens the selected labels to a plain 1-D vector.
    dev_y = np.squeeze(np.array(label_column[dev_index]))
    val_y = np.squeeze(np.array(label_column[val_index]))

    pred_val_y, pred_test_y = val_run_model(
        docs[dev_index], dev_y, docs[val_index], val_y, docs_test, params
    )

    pred_train = np.zeros(len(docs))
    pred_train[val_index] = np.squeeze(np.array(pred_val_y))

    return roc_auc_score(val_y, pred_val_y), pred_train, pred_test_y

In [None]:
def val_predict_oof(i, n, params):
    """Run ``n``-fold stratified CV for label column ``i``, one process per fold.

    Returns ``(r, mean_auc, oof_pred, test_pred)``:
      * ``r`` is the list of per-fold ``val_predict_fold`` results.
      * ``mean_auc`` is the mean of the per-fold ROC-AUC scores.
      * ``oof_pred`` is the sum of the per-fold train prediction vectors;
        each fold only fills its own validation rows.
      * ``test_pred`` is the test prediction averaged over the ``n`` folds.
    """
    # The former `if __name__ == '__main__':` guard left `r` unbound (and
    # raised UnboundLocalError at the return) whenever this function was
    # called from anywhere but the main module, so it has been removed.

    # Flatten the label column so StratifiedKFold receives a plain 1-D
    # vector regardless of how train_y is stored.
    labels = np.asarray(train_y[:, i]).ravel()
    kf = StratifiedKFold(n_splits=n, shuffle=True, random_state=0)

    with multiprocess.Pool(n) as p:
        r = list(p.imap(
            lambda x: val_predict_fold(i, x[0], x[1], params),
            kf.split(labels, labels),
            chunksize=1,
        ))

    mean_auc = np.mean([x[0] for x in r])
    oof_pred = np.sum([x[1] for x in r], axis=0)
    test_pred = np.sum([x[2] for x in r], axis=0) / float(n)
    return r, mean_auc, oof_pred, test_pred

In [None]:
# Training hyper-parameters handed down to val_run_model ('bs' = batch size).
params = {'bs': 256}

# One prediction column per label, for both the test set and the
# out-of-fold training predictions.
n_labels = len(label_cols)
preds_test = np.zeros((docs_test.shape[0], n_labels))
preds_train = np.zeros((docs.shape[0], n_labels))
cv_scores = []
rs = []

for i, j in enumerate(label_cols):
    print('cv', j)
    # 5-fold CV for this label.
    r, scores, pred_train, pred_test = val_predict_oof(i, 5, params)
    preds_train[:, i] = pred_train
    preds_test[:, i] = pred_test.flatten()
    label_score = np.mean(scores)
    cv_scores.append(label_score)
    print(label_score)

# Mean CV score across all labels.
print(np.mean(cv_scores))

In [None]:
# One prediction column per label. The count is taken from label_cols rather
# than a hard-coded 20, so the cell still works if the label set changes.
pred_cols = ['pred_ft0_{}'.format(str(i)) for i in range(len(label_cols))]
for i, col in enumerate(pred_cols):
    train[col] = preds_train[:, i]
    test[col] = preds_test[:, i]

# Select the columns by name instead of by trailing position, and make sure
# the output directory exists before writing.
os.makedirs('predictions', exist_ok=True)
train[pred_cols].to_csv('predictions/pred_train_ft0.csv', index=False)
test[pred_cols].to_csv('predictions/pred_test_ft0.csv', index=False)