In [None]:
%load_ext autotime
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = ''

import numpy as np
np.random.seed(42)
import pandas as pd
import re, string, multiprocess, pickle, tqdm, gc, warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import ParameterSampler
from sklearn.feature_extraction.text import CountVectorizer

from nltk import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, CuDNNGRU, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from keras.initializers import he_uniform
import tensorflow as tf
from keras import backend as K

def clean_text(text):
    """Lower-case ``text`` and collapse every whitespace run to one space.

    Args:
        text: Input string.

    Returns:
        The normalised string, without leading or trailing spaces.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about when compiling the cell.
    collapsed = re.sub(r'\s+', ' ', text.lower())
    return collapsed.strip(' ')

def lemmatize_all(sentence):
    """Lazily yield the tokens of ``sentence``, lemmatized by part of speech.

    The sentence is tokenized and POS-tagged; each token whose tag starts
    with ``NN``, ``VB``, ``JJ`` or ``R`` is lemmatized as a noun, verb,
    adjective or adverb respectively. Any other token is yielded unchanged.

    Args:
        sentence: Text to tokenize and lemmatize.

    Yields:
        One (possibly lemmatized) token at a time, in sentence order.
    """
    # Ordered (tag prefix, WordNet pos) pairs; the first matching prefix wins.
    prefix_to_pos = (("NN", 'n'), ('VB', 'v'), ('JJ', 'a'), ('R', 'r'))
    lemmatizer = WordNetLemmatizer()
    for token, pos_label in pos_tag(word_tokenize(sentence)):
        wordnet_pos = None
        for prefix, candidate in prefix_to_pos:
            if pos_label.startswith(prefix):
                wordnet_pos = candidate
                break
        if wordnet_pos is None:
            yield token
        else:
            yield lemmatizer.lemmatize(token, pos=wordnet_pos)

def lemmatize(data):
    """Clean and lemmatize every item of ``data`` in a pool of worker processes.

    Each item is converted with ``str``, passed through ``clean_text`` and
    ``lemmatize_all``, and the resulting tokens are joined with single
    spaces. Progress is shown with a notebook progress bar.

    Args:
        data: Sized iterable of items to process.

    Returns:
        A list with one processed string per item, or ``None`` when the
        module is not running as ``__main__``.
    """
    # Worker processes are only started from the top-level module.
    if __name__ != "__main__":
        return None

    def _process(item):
        return " ".join(lemmatize_all(clean_text(str(item))))

    with multiprocess.Pool() as pool:
        lazy_results = pool.imap(_process, iter(data), chunksize=100)
        result = list(tqdm.tqdm_notebook(lazy_results, total=len(data)))
    return result

In [None]:
# Path of the pretrained word-vector file. The commented-out assignments are
# alternative embedding files that can be swapped in.
# EMBEDDING_FILE = '../../../Embeddings/EN/BPE/embeddings/en.wiki.bpe.op3000.d300.w2v.txt'
EMBEDDING_FILE = '../../../Embeddings/crawl-300d-2M.vec'
# EMBEDDING_FILE = '../../../Embeddings/glove.840B.300d.txt'

# Competition train / test tables.
# NOTE(review): these are absolute, machine-specific paths - adjust them
# before running the notebook on another machine.
train = pd.read_csv('/home/bananamachine/.kaggle/competitions/movie-genres-by-dialogue/train.csv.zip')
test = pd.read_csv('/home/bananamachine/.kaggle/competitions/movie-genres-by-dialogue/X_test.csv.zip')

def clean_labels(text):
    """Return ``text`` with every hyphen removed."""
    return text.replace('-', '')

# Remove hyphens from the genre strings before vectorising them
# (presumably so that hyphenated genre names stay a single token - confirm).
train.genres = train.genres.map(clean_labels)

# Count-encode the genre strings: one column per distinct label token.
label_vectorizer = CountVectorizer()
train_y = label_vectorizer.fit_transform(train.genres).todense()

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older releases. The result is converted to a list so
# that `label_cols` keeps the type it had before.
if hasattr(label_vectorizer, 'get_feature_names_out'):
    label_cols = list(label_vectorizer.get_feature_names_out())
else:
    label_cols = label_vectorizer.get_feature_names()

In [None]:
# Fill missing dialogues BEFORE lemmatizing: lemmatize() passes every item
# through str(), so a NaN would otherwise turn into the literal token 'nan'
# and the placeholder substitution below would never apply to it.
train.dialogue = lemmatize(train.dialogue.fillna('fillna'))
test.dialogue = lemmatize(test.dialogue.fillna('fillna'))

# The second fillna is kept as a safety net for any value that is still missing.
X_train = train['dialogue'].fillna('fillna').values
X_test = test['dialogue'].fillna('fillna').values

In [None]:
# Vocabulary cap, padded sequence length and embedding dimensionality.
max_features, maxlen, embed_size = 100000, 200, 300

# The vocabulary is fitted on the train and test texts together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# Texts -> integer sequences -> arrays padded/truncated to `maxlen`.
X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
X_test = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

In [None]:
def build_embeddings_index(embeddings_path, embeddings_size):
    """Load a whitespace-separated word-vector text file into a dict.

    Each useful line has the form ``word v1 v2 ... vN``. Lines that do not
    consist of exactly one token followed by ``embeddings_size`` numeric
    values (for example a ``count dim`` header line, or entries whose token
    itself contains whitespace) are skipped.

    Args:
        embeddings_path: Path of the embeddings text file.
        embeddings_size: Expected number of vector components per word.

    Returns:
        Dict mapping each word (``str``) to a float32 ``numpy`` vector of
        length ``embeddings_size``.
    """
    index = {}
    # Text mode is essential: reading in binary mode produced ``bytes`` keys,
    # so a later lookup with a ``str`` word could never find its vector.
    with open(embeddings_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            values = line.split()
            if len(values) != embeddings_size + 1:
                continue
            try:
                vector = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Non-numeric component: treat the line as malformed.
                continue
            index[values[0]] = vector
    return index

# Load the pretrained vectors, keeping only entries whose vector length equals embed_size.
embeddings_index = build_embeddings_index(EMBEDDING_FILE, embed_size)

In [None]:
# Disabled sanity check: print every word whose stored vector length differs from embed_size.
# for k, x in zip(embeddings_index.keys(), embeddings_index.values()):
#     if x.shape[0] != embed_size:
#         print(k)

In [None]:
word_index = tokenizer.word_index
# Tokenizer word indices start at 1, so when the vocabulary is smaller than
# `max_features` the matrix needs len(word_index) + 1 rows; otherwise the
# highest index falls outside the matrix.
nb_words = min(max_features, len(word_index) + 1)

# np.stack needs a real sequence; current NumPy rejects a dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
print(emb_mean, emb_std)

# Rows start as random draws matching the pretrained mean/std; words that
# have no pretrained vector keep their random row.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape[0])

In [None]:
class RocAucEvaluation(Callback):
    """Keras callback that scores the model with ROC-AUC on held-out data.

    At the end of every epoch the score is stored in ``logs['val_auc']``.
    On epochs that are skipped because of ``interval`` the stored value is
    ``-inf``.

    Args:
        validation_data: ``(X_val, y_val)`` pair used for scoring.
        interval: Score only on epochs where ``epoch % interval == 0``.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class rather than Callback: super(Callback, self) starts
        # the lookup *after* Callback and therefore skipped Callback.__init__.
        super(RocAucEvaluation, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_train_begin(self, logs=None):
        """Register ``val_auc`` in the metric list, when one is available."""
        # Register the same key that on_epoch_end writes; the previous code
        # checked for 'val_auc' but appended 'roc_auc_val'. A missing
        # 'metrics' entry in params is tolerated.
        metrics = (getattr(self, 'params', None) or {}).get('metrics')
        if metrics is not None and 'val_auc' not in metrics:
            metrics.append('val_auc')

    def on_epoch_end(self, epoch, logs=None):
        """Compute ROC-AUC on the validation data and store it in ``logs``."""
        # Avoid a shared mutable default: `logs` is written to below.
        if logs is None:
            logs = {}
        logs['val_auc'] = float('-inf')
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            logs['val_auc'] = score
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [None]:
def get_model(params):
    """Build the uncompiled bidirectional-GRU classifier.

    Layer stack: embedding initialised from ``embedding_matrix`` -> spatial
    dropout -> bidirectional CuDNNGRU returning sequences -> dropout ->
    concatenated global average and max pooling -> one sigmoid unit.

    Args:
        params: Mapping with the keys ``'sd0'`` (spatial dropout rate),
            ``'runits'`` (GRU units) and ``'d0'`` (dropout rate).

    Returns:
        The uncompiled keras ``Model``.
    """
    # NOTE(review): CuDNNGRU is used here while the first cell sets
    # CUDA_VISIBLE_DEVICES to '' - confirm a GPU is visible at run time.
    recurrent_init = he_uniform(seed=0)

    tokens_in = Input(shape=(maxlen, ))
    embedded = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(tokens_in)
    embedded = SpatialDropout1D(params['sd0'])(embedded)

    gru = CuDNNGRU(params['runits'], recurrent_initializer=recurrent_init, return_sequences=True)
    encoded = Bidirectional(gru)(embedded)
    encoded = Dropout(params['d0'])(encoded)

    avg_pooled = GlobalAveragePooling1D()(encoded)
    max_pooled = GlobalMaxPooling1D()(encoded)
    pooled = concatenate([avg_pooled, max_pooled])
    prediction = Dense(1, activation='sigmoid')(pooled)

    with tf.device('/cpu:1'):
        model = Model(inputs=tokens_in, outputs=prediction)
    return model

def val_run_model(train_X, train_y, val_X, val_y, test_X, params):
    """Train one model on a dev/validation split and predict with its best weights.

    A fresh TensorFlow graph and session are created for the run. Training
    lasts up to 40 epochs with early stopping and checkpointing on
    ``val_auc``; the checkpointed weights are reloaded before predicting.

    Args:
        train_X, train_y: Training inputs and binary targets.
        val_X, val_y: Validation inputs and targets (used for ROC-AUC).
        test_X: Inputs to predict after training.
        params: Model hyper-parameters; ``params['bs']`` is the batch size,
            the remaining keys are consumed by ``get_model``.

    Returns:
        ``(pred_val_y, pred_test_y)``: predictions for ``val_X`` and ``test_X``.
    """
    # Index of the current pool worker; falls back to 0 when the process has
    # no identity entry. It keeps the files of parallel runs apart.
    try:
        worker_id = multiprocess.current_process()._identity[0]
    except IndexError:
        worker_id = 0

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    with tf.Graph().as_default():
        with tf.Session(config=session_config) as sess:
            K.set_session(sess)

            model = get_model(params)
            model.compile(loss='binary_crossentropy', optimizer='adam')

            file_path = 'rnna_auc_checkpoint_{}.hdf5'.format(str(worker_id))
            log_file_path = 'rnna_csv_log_{}.csv'.format(str(worker_id))

            # rocauc is listed first, presumably so that 'val_auc' is already
            # in the logs when the callbacks monitoring it run.
            rocauc = RocAucEvaluation(validation_data=(val_X, val_y), interval=1)
            lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), cooldown=0, patience=3, min_lr=0.5e-6)
            early_stopper = EarlyStopping(monitor='val_auc', mode='max', patience=6)
            csv_logger = CSVLogger(log_file_path)
            checkpoint = ModelCheckpoint(file_path, monitor='val_auc', verbose=1, save_best_only=True, mode='max')
            callbacks_list = [rocauc, lr_reducer, early_stopper, csv_logger, checkpoint]

            model.fit(
                train_X, train_y,
                batch_size=params['bs'],
                epochs=40,
                validation_data=(val_X, val_y),
                verbose=2,
                callbacks=callbacks_list,
            )

            # Predict with the checkpointed weights rather than the last epoch's.
            model.load_weights(file_path)
            pred_val_y = model.predict(val_X, batch_size=512)
            pred_test_y = model.predict(test_X, batch_size=512)

            # Release the session and the large objects before returning.
            sess.close()
            del sess, model, lr_reducer, early_stopper, checkpoint, csv_logger
            gc.collect()
    return pred_val_y, pred_test_y
  
def val_predict_fold(i, dev_index, val_index, params):
    """Train and evaluate the model for label column ``i`` on one CV fold.

    Args:
        i: Column index into ``train_y`` of the label to model.
        dev_index: Row indices used for training in this fold.
        val_index: Row indices used for validation in this fold.
        params: Hyper-parameters forwarded to ``val_run_model``.

    Returns:
        ``(cv_scores, pred_train, pred_test_y)``: the fold's validation
        ROC-AUC; a vector of length ``len(X_train)`` holding the validation
        predictions at ``val_index`` and zeros elsewhere; and the
        predictions for the test set.
    """
    pred_train = np.zeros(len(X_train))
    dev_X, val_X = X_train[dev_index], X_train[val_index]

    label_column = train_y[:, i]
    dev_y = np.squeeze(np.array(label_column[dev_index]))
    val_y = np.squeeze(np.array(label_column[val_index]))

    # The padded test matrix is the module-level `X_test`; the lower-case
    # `x_test` used before is not defined anywhere and raised NameError.
    pred_val_y, pred_test_y = val_run_model(dev_X, dev_y, val_X, val_y, X_test, params)

    pred_train[val_index] = np.squeeze(np.array(pred_val_y))
    cv_scores = roc_auc_score(val_y, pred_val_y)
    return cv_scores, pred_train, pred_test_y

In [None]:
def val_predict_oof(i, n, params):
    """Run ``n``-fold stratified CV for label column ``i``, one process per fold.

    Args:
        i: Column index into ``train_y`` of the label to model.
        n: Number of folds; also the size of the process pool.
        params: Hyper-parameters forwarded to ``val_predict_fold``.

    Returns:
        ``(r, mean_score, oof_pred, test_pred)``: the raw per-fold results;
        the mean of the fold scores; the sum of the per-fold train-length
        prediction vectors; and the test predictions averaged over folds.

    Raises:
        RuntimeError: If the module is not running as ``__main__``.
    """
    # Previously a false guard left `r` unbound, so the return statement
    # failed with UnboundLocalError; fail early with a clear message instead.
    if __name__ != '__main__':
        raise RuntimeError('val_predict_oof must be called from the __main__ module')

    labels = train_y[:, i]
    kf = StratifiedKFold(n_splits=n, shuffle=True, random_state=0)
    with multiprocess.Pool(n) as p:
        r = list(p.imap(lambda x: val_predict_fold(i, x[0], x[1], params),
                        kf.split(labels, labels), chunksize=1))

    mean_score = np.mean([x[0] for x in r])
    oof_pred = np.sum([x[1] for x in r], axis=0)
    test_pred = np.sum([x[2] for x in r], axis=0) / float(n)
    return r, mean_score, oof_pred, test_pred

In [None]:
# Hyper-parameters shared by every per-label model.
params = {'sd0': 0.2, 'runits': 60, 'd0': 0.2, 'bs': 256}

# One column of predictions per label, for the test and train rows.
n_labels = len(label_cols)
preds_test = np.zeros((X_test.shape[0], n_labels))
preds_train = np.zeros((X_train.shape[0], n_labels))
cv_scores = []

# Train one binary model per label with 5-fold cross-validation.
for i, j in enumerate(label_cols):
    print('cv', j)
    r, scores, pred_train, pred_test = val_predict_oof(i, 5, params)
    preds_train[:, i] = pred_train
    preds_test[:, i] = pred_test.flatten()
    label_score = np.mean(scores)
    cv_scores.append(label_score)
    print(label_score)

# Mean CV score over all labels.
print(np.mean(cv_scores))

In [None]:
# One prediction column per label. The count comes from the prediction
# matrix instead of a hard-coded 20, so the cell keeps working when the
# number of labels differs.
pred_cols = ['pred_rnnft0_{}'.format(str(i)) for i in range(preds_train.shape[1])]
for i, col in enumerate(pred_cols):
    train[col] = preds_train[:, i]
    test[col] = preds_test[:, i]

# Create the output directory if it is missing, then write exactly the
# prediction columns, selected by name rather than by position.
os.makedirs('predictions', exist_ok=True)
train[pred_cols].to_csv('predictions/pred_train_rnnft0.csv', index=False)
test[pred_cols].to_csv('predictions/pred_test_rnnft0.csv', index=False)