In [None]:
%load_ext autotime
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np

import multiprocess, pickle, warnings, re
from operator import itemgetter
from typing import List, Dict
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression

from scipy import sparse

def on_field(f: str, *vec) -> Pipeline:
    """Build a pipeline that picks field ``f`` from its input and feeds it to ``vec``.

    The first step applies ``itemgetter(f)`` to whatever is passed in (no input
    validation), so the remaining steps only ever see that single field.
    """
    select_field = FunctionTransformer(itemgetter(f), validate=False)
    return make_pipeline(select_field, *vec)

class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Logistic regression trained on features scaled by a log-count ratio.

    ``fit`` computes, for every feature, the log of the ratio between its
    smoothed average value in the rows labelled ``1`` and in the rows labelled
    ``0``. Every feature column is multiplied by that ratio and a
    ``LogisticRegression`` is fitted on the scaled matrix. The same scaling is
    applied to the input of ``predict`` / ``predict_proba``.

    Labels are compared against the literals ``1`` and ``0``, so ``y`` must be
    a binary 0/1 vector.

    Parameters
    ----------
    C : float, default=1.0
        Forwarded to ``LogisticRegression(C=...)``.
    dual : bool, default=False
        Forwarded to ``LogisticRegression(dual=...)``.
    n_jobs : int, default=1
        Forwarded to ``LogisticRegression(n_jobs=...)``.
    """

    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def _scale(self, x):
        """Return ``x`` as CSR with every column multiplied by the fitted ratio.

        Converting first lets dense arrays and non-CSR sparse formats be used;
        a plain ndarray has no ``.multiply`` method.
        """
        return sparse.csr_matrix(x).multiply(self._r)

    def predict(self, x):
        """Predict class labels for ``x`` using the fitted logistic regression."""
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(self._scale(x))

    def predict_proba(self, x):
        """Return class probabilities for ``x`` from the fitted logistic regression."""
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(self._scale(x))

    def fit(self, x, y):
        """Compute the log-count ratio from ``(x, y)`` and fit the classifier.

        Returns ``self``.
        """
        x, y = check_X_y(x, y, accept_sparse=True)
        # check_X_y lets dense arrays and any sparse format through; the
        # boolean row indexing and ``.multiply`` below need a CSR matrix.
        x = sparse.csr_matrix(x)

        def pr(x, y_i, y):
            # Per-feature sum over the rows of class ``y_i`` divided by the
            # number of such rows, with +1 added to numerator and denominator.
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        # Stored as a 1 x n_features sparse row so it broadcasts over the rows
        # of the feature matrix in ``multiply``.
        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = self._scale(x)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self

In [None]:
# Directory holding the competition archives. The location is machine-specific,
# so it is kept in a single place: change DATA_DIR to run the notebook elsewhere.
DATA_DIR = '/home/bananamachine/.kaggle/competitions/movie-genres-by-dialogue'

train = pd.read_csv(DATA_DIR + '/train.csv.zip')
test = pd.read_csv(DATA_DIR + '/X_test.csv.zip')

def clean_labels(text):
    """Return ``text`` with every hyphen removed (e.g. ``'sci-fi'`` -> ``'scifi'``)."""
    return re.sub('-', '', text)

# Remove hyphens from the genre strings before they are tokenised below.
train['genres'] = train['genres'].map(clean_labels)

# One column per distinct genre token; each cell is that token's count in the row.
label_vectorizer = CountVectorizer()
train_y = label_vectorizer.fit_transform(train['genres']).todense()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(label_vectorizer, 'get_feature_names_out'):
    label_cols = list(label_vectorizer.get_feature_names_out())
else:
    label_cols = label_vectorizer.get_feature_names()

In [None]:
# Two TF-IDF views of the 'dialogue' field, concatenated side by side:
#   1. word-level tokens matched by \w+
#   2. character n-grams of length 2 to 5
# The token pattern is a raw string: '\w' in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
vectorizer = make_union(
    on_field('dialogue', TfidfVectorizer(sublinear_tf=True, token_pattern=r'\w+')),
    on_field('dialogue', TfidfVectorizer(sublinear_tf=True, analyzer='char', ngram_range=(2,5))),
)

# Vocabulary and IDF weights are learned on train only, then applied to test.
train_X = vectorizer.fit_transform(train)
test_X = vectorizer.transform(test)
print(train_X.shape, test_X.shape)

In [None]:
def val_predict_fold(i, dev_index, val_index, params):
    """Train and evaluate one CV fold of logistic regression for label column ``i``.

    Reads the notebook globals ``train_X``, ``train_y`` and ``test_X``.

    Returns a tuple ``(score, pred_train_y, pred_test_y)``:
      * ``score`` - ROC AUC of the fold's validation predictions,
      * ``pred_train_y`` - vector with one entry per training row, zero
        everywhere except at ``val_index`` where it holds the validation
        probabilities,
      * ``pred_test_y`` - positive-class probabilities for every test row.
    """
    target = train_y[:, i]
    # train_y column slices are 2-D; flatten them to plain 1-D label vectors.
    dev_y = np.squeeze(np.array(target[dev_index]))
    val_y = np.squeeze(np.array(target[val_index]))
    dev_X = train_X[dev_index]
    val_X = train_X[val_index]

    model = LogisticRegression(**params)
    model.fit(dev_X, dev_y)

    # Column 1 of predict_proba is the probability of the positive class.
    pred_val_y = model.predict_proba(val_X)[:, 1]
    pred_test_y = model.predict_proba(test_X)[:, 1]

    pred_train_y = np.zeros(train_X.shape[0])
    pred_train_y[val_index] = pred_val_y

    return roc_auc_score(val_y, pred_val_y), pred_train_y, pred_test_y

def val_predict_oof(i, n, params):
    """Run stratified ``n``-fold CV for label column ``i``, one worker process per fold.

    Each fold is handled by ``val_predict_fold``; folds come from a shuffled
    ``StratifiedKFold`` with ``random_state=0``.

    Returns a tuple ``(r, mean_score, oof_pred, test_pred)``:
      * ``r`` - list of the raw per-fold results,
      * ``mean_score`` - mean of the per-fold ROC AUC scores,
      * ``oof_pred`` - out-of-fold predictions for every training row (sum of
        the per-fold vectors, each of which is zero outside its own fold),
      * ``test_pred`` - test predictions averaged over the ``n`` folds.
    """
    # No ``__name__ == '__main__'`` check here: a check inside the function
    # would only skip the work and leave ``r`` unbound at the return below.
    target = train_y[:, i]
    kf = StratifiedKFold(n_splits=n, shuffle=True, random_state=0)
    with multiprocess.Pool(n) as p:
        r = list(p.imap(
            lambda x: val_predict_fold(i, x[0], x[1], params),
            kf.split(target, target),
            chunksize=1,
        ))

    mean_score = np.mean([x[0] for x in r])
    oof_pred = np.sum([x[1] for x in r], axis=0)
    test_pred = np.sum([x[2] for x in r], axis=0) / float(n)
    return r, mean_score, oof_pred, test_pred

In [None]:
# 5-fold cross-validation of the logistic-regression model, one binary problem
# per label column. Collects out-of-fold train predictions, fold-averaged test
# predictions and the mean AUC of every label.
params = {'C': 1.}
preds_train, preds_test = (
    np.zeros((features.shape[0], len(label_cols))) for features in (train_X, test_X)
)
cv_scores = []

for i, j in enumerate(label_cols):
    print('cv', j)
    r, scores, pred_train, pred_test = val_predict_oof(i, 5, params)
    preds_train[:, i], preds_test[:, i] = pred_train, pred_test
    cv_scores.append(np.mean(scores))
    print(cv_scores[-1])

# Mean AUC across all labels.
print(np.mean(cv_scores))

In [None]:
# One output column per label. The count comes from the prediction matrix
# rather than a hard-coded 20, and the columns are selected by name: a
# positional ``iloc[:, -20:]`` picks up the wrong columns when this cell is
# re-run after other prediction columns have been appended to the frames.
pred_cols = ['pred_lr0_{}'.format(i) for i in range(preds_train.shape[1])]
for i, col in enumerate(pred_cols):
    train[col] = preds_train[:, i]
    test[col] = preds_test[:, i]

train[pred_cols].to_csv('predictions/pred_train_lr0.csv', index=False)
test[pred_cols].to_csv('predictions/pred_test_lr0.csv', index=False)

In [None]:
def val_predict_fold_nb(i, dev_index, val_index, params):
    """Train and evaluate one CV fold of ``NbSvmClassifier`` for label column ``i``.

    Reads the notebook globals ``train_X``, ``train_y`` and ``test_X``.

    Returns ``(score, pred_train_y, pred_test_y)``: the validation ROC AUC, a
    train-length vector that is zero except at ``val_index`` (where it holds
    the validation probabilities), and the test-set probabilities.
    """
    def labels(rows):
        # Flatten the 2-D column slice of train_y into a 1-D label vector.
        return np.squeeze(np.array(train_y[:, i][rows]))

    dev_X = train_X[dev_index]
    val_X = train_X[val_index]
    dev_y = labels(dev_index)
    val_y = labels(val_index)

    model = NbSvmClassifier(**params)
    model.fit(dev_X, dev_y)

    # Column 1 of predict_proba is the probability of the positive class.
    pred_val_y = model.predict_proba(val_X)[:, 1]
    pred_test_y = model.predict_proba(test_X)[:, 1]

    pred_train_y = np.zeros(train_X.shape[0])
    pred_train_y[val_index] = pred_val_y

    return roc_auc_score(val_y, pred_val_y), pred_train_y, pred_test_y

def val_predict_oof_nb(i, n, params):
    """Run stratified ``n``-fold CV of the NB model for label column ``i``.

    Each fold is handled by ``val_predict_fold_nb`` in its own worker process;
    folds come from a shuffled ``StratifiedKFold`` with ``random_state=0``.

    Returns a tuple ``(r, mean_score, oof_pred, test_pred)``:
      * ``r`` - list of the raw per-fold results,
      * ``mean_score`` - mean of the per-fold ROC AUC scores,
      * ``oof_pred`` - out-of-fold predictions for every training row (sum of
        the per-fold vectors, each of which is zero outside its own fold),
      * ``test_pred`` - test predictions averaged over the ``n`` folds.
    """
    # No ``__name__ == '__main__'`` check here: a check inside the function
    # would only skip the work and leave ``r`` unbound at the return below.
    target = train_y[:, i]
    kf = StratifiedKFold(n_splits=n, shuffle=True, random_state=0)
    with multiprocess.Pool(n) as p:
        r = list(p.imap(
            lambda x: val_predict_fold_nb(i, x[0], x[1], params),
            kf.split(target, target),
            chunksize=1,
        ))

    mean_score = np.mean([x[0] for x in r])
    oof_pred = np.sum([x[1] for x in r], axis=0)
    test_pred = np.sum([x[2] for x in r], axis=0) / float(n)
    return r, mean_score, oof_pred, test_pred

In [None]:
# 5-fold cross-validation of the NbSvmClassifier model, one binary problem per
# label column. Collects out-of-fold train predictions, fold-averaged test
# predictions and the mean AUC of every label.
params = {'C': 1.}
preds_train, preds_test = (
    np.zeros((features.shape[0], len(label_cols))) for features in (train_X, test_X)
)
cv_scores = []

for i, j in enumerate(label_cols):
    print('cv', j)
    r, scores, pred_train, pred_test = val_predict_oof_nb(i, 5, params)
    preds_train[:, i], preds_test[:, i] = pred_train, pred_test
    cv_scores.append(np.mean(scores))
    print(cv_scores[-1])

# Mean AUC across all labels.
print(np.mean(cv_scores))

In [None]:
# One output column per label. The count comes from the prediction matrix
# rather than a hard-coded 20, and the columns are selected by name: a
# positional ``iloc[:, -20:]`` picks up the wrong columns when this cell is
# re-run after other prediction columns have been appended to the frames.
pred_cols = ['pred_nb0_{}'.format(i) for i in range(preds_train.shape[1])]
for i, col in enumerate(pred_cols):
    train[col] = preds_train[:, i]
    test[col] = preds_test[:, i]

train[pred_cols].to_csv('predictions/pred_train_nb0.csv', index=False)
test[pred_cols].to_csv('predictions/pred_test_nb0.csv', index=False)

>### 10-fold cross-validation (logistic regression, then NB-SVM)

In [None]:
# 10-fold cross-validation of the logistic-regression model, one binary problem
# per label column. Collects out-of-fold train predictions, fold-averaged test
# predictions and the mean AUC of every label.
params = {'C': 1.}
preds_train, preds_test = (
    np.zeros((features.shape[0], len(label_cols))) for features in (train_X, test_X)
)
cv_scores = []

for i, j in enumerate(label_cols):
    print('cv', j)
    r, scores, pred_train, pred_test = val_predict_oof(i, 10, params)
    preds_train[:, i], preds_test[:, i] = pred_train, pred_test
    cv_scores.append(np.mean(scores))
    print(cv_scores[-1])

# Mean AUC across all labels.
print(np.mean(cv_scores))

In [None]:
# One output column per label. The count comes from the prediction matrix
# rather than a hard-coded 20, and the columns are selected by name: a
# positional ``iloc[:, -20:]`` picks up the wrong columns when this cell is
# re-run after other prediction columns have been appended to the frames.
pred_cols = ['pred_lr1_{}'.format(i) for i in range(preds_train.shape[1])]
for i, col in enumerate(pred_cols):
    train[col] = preds_train[:, i]
    test[col] = preds_test[:, i]

train[pred_cols].to_csv('predictions/pred_train_lr1.csv', index=False)
test[pred_cols].to_csv('predictions/pred_test_lr1.csv', index=False)

In [None]:
# 10-fold cross-validation of the NbSvmClassifier model, one binary problem per
# label column. Collects out-of-fold train predictions, fold-averaged test
# predictions and the mean AUC of every label.
params = {'C': 1.}
preds_train, preds_test = (
    np.zeros((features.shape[0], len(label_cols))) for features in (train_X, test_X)
)
cv_scores = []

for i, j in enumerate(label_cols):
    print('cv', j)
    r, scores, pred_train, pred_test = val_predict_oof_nb(i, 10, params)
    preds_train[:, i], preds_test[:, i] = pred_train, pred_test
    cv_scores.append(np.mean(scores))
    print(cv_scores[-1])

# Mean AUC across all labels.
print(np.mean(cv_scores))

In [None]:
# One output column per label. The count comes from the prediction matrix
# rather than a hard-coded 20, and the columns are selected by name: a
# positional ``iloc[:, -20:]`` picks up the wrong columns when this cell is
# re-run after other prediction columns have been appended to the frames.
pred_cols = ['pred_nb1_{}'.format(i) for i in range(preds_train.shape[1])]
for i, col in enumerate(pred_cols):
    train[col] = preds_train[:, i]
    test[col] = preds_test[:, i]

train[pred_cols].to_csv('predictions/pred_train_nb1.csv', index=False)
test[pred_cols].to_csv('predictions/pred_test_nb1.csv', index=False)