In [None]:
import os
# Both variables are set before TensorFlow/Keras are imported further down.
# NOTE(review): an empty CUDA_VISIBLE_DEVICES is presumably meant to hide all
# GPUs so the run stays on CPU — confirm that this is still intended.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = ''

import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np

import multiprocess, pickle, warnings, re, gc
from operator import itemgetter
from typing import List, Dict
warnings.filterwarnings('ignore')
%load_ext autotime

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.metrics import log_loss, roc_auc_score, f1_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression

from scipy import sparse

def on_field(f: str, *vec) -> Pipeline:
    """Build a pipeline that extracts item ``f`` from its input and feeds it to ``vec``.

    Parameters
    ----------
    f : str
        Key passed to ``itemgetter``; the pipeline input is indexed with it.
    *vec
        Estimators/transformers appended, in order, after the selection step.

    Returns
    -------
    Pipeline
        ``FunctionTransformer(itemgetter(f))`` followed by every item of ``vec``.
    """
    select_field = FunctionTransformer(itemgetter(f), validate=False)
    steps = (select_field,) + vec
    return make_pipeline(*steps)

In [None]:
# Directory holding the competition files. The default is the original
# location; set MOVIE_GENRES_DATA_DIR to run the notebook on another machine
# without editing the code.
DATA_DIR = os.environ.get(
    'MOVIE_GENRES_DATA_DIR',
    '/home/bananamachine/.kaggle/competitions/movie-genres-by-dialogue',
)

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv.zip'))
test = pd.read_csv(os.path.join(DATA_DIR, 'X_test.csv.zip'))

def clean_labels(text):
    """Return ``text`` with every hyphen removed."""
    return re.sub('-', '', text)

# Strip hyphens from the genre strings before vectorising them
# (presumably so hyphenated genres stay a single token — verify against the data).
train.genres = train.genres.map(clean_labels)

# Turn the genre strings into a (n_samples, n_labels) count matrix.
label_vectorizer = CountVectorizer()
train_y = label_vectorizer.fit_transform(train.genres).todense()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back to the old method on older installations.
if hasattr(label_vectorizer, 'get_feature_names_out'):
    label_cols = list(label_vectorizer.get_feature_names_out())
else:
    label_cols = label_vectorizer.get_feature_names()

In [None]:
# TF-IDF features over the 'dialogue' column. The token pattern is written as
# a raw string: '\w' in a normal string literal is an invalid escape sequence
# and triggers a DeprecationWarning/SyntaxWarning. The resulting pattern is
# unchanged.
vectorizer = make_union(
    on_field('dialogue', TfidfVectorizer(sublinear_tf=True, token_pattern=r'\w+')),
)

# Fit on the training frame only, then apply the fitted transform to the test frame.
train_X = vectorizer.fit_transform(train)
test_X = vectorizer.transform(test)
print(train_X.shape, test_X.shape)

In [None]:
import tensorflow as tf
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras import backend as K
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras import regularizers

In [None]:
class RocAucEvaluation(Callback):
    """Keras callback that scores the model with ROC-AUC on a held-out set.

    At the end of every epoch ``logs['val_auc']`` is written: the ROC-AUC of
    the model's predictions on ``X_val`` when ``epoch % interval == 0``, and
    ``-inf`` otherwise. Callbacks that monitor ``'val_auc'`` must therefore be
    listed after this one in the callback list.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair. It is unpacked into exactly two values, so
        the empty default raises ``ValueError``.
    interval : int
        Evaluate on epochs whose zero-based index is a multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # The original called super(Callback, self).__init__(), which starts
        # the MRO lookup *after* Callback and so skipped Callback.__init__.
        super(RocAucEvaluation, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_train_begin(self, logs=None):
        """Register ``'val_auc'`` in the trainer's metric list when one exists."""
        # Not every Keras version puts a 'metrics' entry in params; tolerate
        # its absence instead of raising KeyError.
        metrics = (self.params or {}).get('metrics')
        # Register the same key that on_epoch_end writes (the original checked
        # for 'val_auc' but appended 'roc_auc_val').
        if metrics is not None and 'val_auc' not in metrics:
            metrics.append('val_auc')

    def on_epoch_end(self, epoch, logs=None):
        """Compute ROC-AUC on the validation data and store it in ``logs``."""
        # Avoid a shared mutable default: this method writes into ``logs``.
        if logs is None:
            logs = {}
        # Default for skipped epochs, so monitors always find the key.
        logs['val_auc'] = float('-inf')
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            logs['val_auc'] = score
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

def get_model(params):
    """Build the two-hidden-layer MLP used for a single binary label.

    Parameters
    ----------
    params : dict
        Reads ``'d0'``/``'d1'`` (units of the two hidden Dense layers) and
        ``'dr0'``/``'dr1'`` (rates of the Dropout layer following each).

    Returns
    -------
    Model
        Uncompiled Keras model mapping a sparse float32 input, whose width is
        taken from the module-level ``train_X``, to one sigmoid output.
    """
    features = Input(shape=(train_X.shape[1],), dtype='float32', sparse=True)

    hidden = features
    for units_key, rate_key in (('d0', 'dr0'), ('d1', 'dr1')):
        hidden = Dense(params[units_key], activation='relu')(hidden)
        hidden = Dropout(params[rate_key])(hidden)

    prob = Dense(1, activation='sigmoid')(hidden)

    # Only the Model(...) construction happens inside the device scope; the
    # layers above are created outside it.
    with tf.device('/cpu:0'):
        return Model(features, prob)

def val_run_model(train_X, train_y, val_X, val_y, test_X, params): 
    """Train one model on a dev/validation split and predict validation and test data.

    A fresh TensorFlow graph and session are created for the call. The model
    from ``get_model(params)`` is trained for up to 40 epochs with early
    stopping on ``'val_auc'``; the checkpoint with the best ``'val_auc'`` is
    reloaded before predicting.

    Parameters
    ----------
    train_X, train_y : training features and binary targets passed to ``fit``.
    val_X, val_y : validation features/targets, used for ``validation_data``
        and for the ROC-AUC callback.
    test_X : features to predict after training.
    params : dict; ``'bs'`` is read here as the batch size, the remaining keys
        are read by ``get_model``.

    Returns
    -------
    (pred_val_y, pred_test_y) : model predictions for ``val_X`` and ``test_X``.

    Side effects: writes ``mlpword_auc_checkpoint_<pp>.hdf5`` and
    ``mlpword_csv_log_<pp>.csv`` in the working directory.
    """

    # pp identifies the current worker process and keeps per-process file names
    # apart; it falls back to 0 when _identity is empty (presumably when this
    # runs outside a pool worker — confirm).
    try: pp = multiprocess.current_process()._identity[0]
    except IndexError: pp = 0

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Fresh graph + session per call, registered as the Keras backend session.
    with tf.Graph().as_default():
        with tf.Session(config=config) as sess:
            K.set_session(sess)
            
            # NOTE: get_model sizes its input layer from the module-level
            # train_X, not from this function's train_X argument.
            model = get_model(params)
            model.compile(loss='binary_crossentropy', optimizer='adam')
            
            file_path, log_file_path = 'mlpword_auc_checkpoint_{}.hdf5'.format(str(pp)), 'mlpword_csv_log_{}.csv'.format(str(pp))
            # rocauc writes logs['val_auc'], which early_stopper and checkpoint
            # monitor, so it is listed before them.
            rocauc = RocAucEvaluation(validation_data=(val_X, val_y), interval=1)
            # No monitor is passed here, so the callback's own default applies.
            lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), cooldown=0, patience=3, min_lr=0.5e-6)
            early_stopper = EarlyStopping(monitor='val_auc', mode='max', patience=6)
            csv_logger = CSVLogger(log_file_path)
            checkpoint = ModelCheckpoint(file_path, monitor='val_auc', verbose=0, save_best_only=True, mode='max')           
            callbacks_list = [rocauc, lr_reducer, early_stopper, csv_logger, checkpoint]
            
            model.fit(train_X, train_y, batch_size=params['bs'], epochs=40, validation_data=(val_X, val_y), verbose=0, callbacks=callbacks_list)
            # Restore the best-val_auc weights saved by the checkpoint callback.
            model.load_weights(file_path)
            pred_val_y = model.predict(val_X, batch_size=512)
            pred_test_y = model.predict(test_X, batch_size=512)

            # Explicit close/del/gc in addition to the enclosing `with` block.
            sess.close()
            del sess, model, lr_reducer, early_stopper, checkpoint, csv_logger; gc.collect();
    return pred_val_y, pred_test_y                  
  
def val_predict_fold(i, dev_index, val_index, params): 
    """Train and evaluate label column ``i`` on one cross-validation fold.

    Reads the module-level ``train_X``, ``train_y`` and ``test_X``.

    Parameters
    ----------
    i : column index into ``train_y`` selecting the label to model.
    dev_index, val_index : row indices of the development and validation parts.
    params : dict forwarded to ``val_run_model``.

    Returns
    -------
    (auc, oof, test_pred) :
        ROC-AUC on the validation rows; a vector as long as ``train_X`` that
        is zero everywhere except at ``val_index``, where it holds the
        validation predictions; and the predictions for ``test_X``.
    """
    label_column = train_y[:, i]

    dev_X = train_X[dev_index]
    val_X = train_X[val_index]
    dev_y = np.squeeze(np.array(label_column[dev_index]))
    val_y = np.squeeze(np.array(label_column[val_index]))

    pred_val_y, pred_test_y = val_run_model(dev_X, dev_y, val_X, val_y, test_X, params)

    # Only the validation rows of this fold are filled in; the rest stay zero.
    oof = np.zeros(train_X.shape[0])
    oof[val_index] = np.squeeze(np.array(pred_val_y))

    fold_auc = roc_auc_score(val_y, pred_val_y)
    return fold_auc, oof, pred_test_y

In [None]:
def val_predict_oof(i, n, params):  
    """Run ``n``-fold stratified CV for label column ``i``, one process per fold.

    Reads the module-level ``train_y``; each fold is handled by
    ``val_predict_fold`` in a ``multiprocess.Pool`` of ``n`` workers.

    Parameters
    ----------
    i : column index into ``train_y`` selecting the label to model.
    n : number of folds, also used as the number of worker processes.
    params : dict forwarded to ``val_predict_fold``.

    Returns
    -------
    (r, mean_auc, oof_pred, test_pred) :
        ``r`` is the list of per-fold ``(auc, oof_vector, test_pred)`` tuples;
        ``mean_auc`` is the mean of the fold AUCs; ``oof_pred`` is the sum of
        the per-fold out-of-fold vectors (each is zero outside its own
        validation rows); ``test_pred`` is the sum of the fold test
        predictions divided by ``n``.
    """
    # The original wrapped the pool in `if __name__ == '__main__':`. Inside a
    # function that guard protects nothing, and when it was false `r` was
    # never assigned, so the return statement raised UnboundLocalError.
    kf = StratifiedKFold(n_splits=n, shuffle=True, random_state=0)
    with multiprocess.Pool(n) as p:
        # Results are consumed inside the `with` block, before the pool exits.
        r = list(p.imap(lambda x: val_predict_fold(i, x[0], x[1], params), kf.split(train_y[:, i], train_y[:, i]), chunksize=1))

    mean_auc = np.mean([x[0] for x in r])
    oof_pred = np.sum([x[1] for x in r], axis=0)
    test_pred = np.sum([x[2] for x in r], axis=0) / float(n)
    return r, mean_auc, oof_pred, test_pred

In [None]:
# Hyper-parameters: d0/d1 are the hidden layer sizes and dr0/dr1 the dropout
# rates read by get_model; bs is the batch size read by val_run_model.
params = dict(d0=512, dr0=.6, d1=256, dr1=.5, bs=512)

# One prediction column per label, for the test set and for the
# out-of-fold predictions on the training set.
n_labels = len(label_cols)
preds_test = np.zeros((test_X.shape[0], n_labels))
preds_train = np.zeros((train_X.shape[0], n_labels))
cv_scores = []

# Fit an independent 5-fold model per label and collect its predictions.
for label_idx, label_name in enumerate(label_cols):
    print('cv', label_name)
    fold_results, mean_auc, oof_pred, test_pred = val_predict_oof(label_idx, 5, params)
    preds_train[:, label_idx] = oof_pred
    preds_test[:, label_idx] = test_pred.flatten()
    label_score = np.mean(mean_auc)
    cv_scores.append(label_score)
    print(label_score)

# Mean cross-validated ROC-AUC over all labels.
print(np.mean(cv_scores))

In [None]:
# Take the number of prediction columns from the prediction matrix instead of
# the hard-coded 20, which raised IndexError with fewer labels and silently
# dropped columns with more.
pred_cols = ['pred_mlp0_{}'.format(i) for i in range(preds_train.shape[1])]
for i, col in enumerate(pred_cols):
    train[col] = preds_train[:, i]
    test[col] = preds_test[:, i]

# Select the columns by name rather than by "last 20" position, and make sure
# the output directory exists before writing.
os.makedirs('predictions', exist_ok=True)
train[pred_cols].to_csv('predictions/pred_train_mlp0.csv', index=False)
test[pred_cols].to_csv('predictions/pred_test_mlp0.csv', index=False)