In [3]:
import numpy as np
import pandas as pd
from itertools import product
from sklearn.externals.joblib import load
from sklearn.metrics import roc_auc_score
from common import TARGETS

In [4]:
# --- Scalars ---------------------------------------------------------------
# Cleaned training set; its TARGETS columns serve as ground truth further down.
TRAIN = "data/train_clean.csv"

# Number of folds for which validation predictions are loaded (fold ids 0..K_FOLDS-1).
K_FOLDS = 10

# Destination of the blended submission.
SUBMISSION_OUT = "submissions/submission_weighted_avg_all.csv.gz"

# --- Per-model inputs ------------------------------------------------------
# Submission file of each individual model, keyed by model name.
SUBMISSION_IN = dict(
    gru_glove="submissions/submission_gru_glove.csv.gz",
    gru_fasttext="submissions/submission_gru_fasttext.csv.gz",
    cnn_glove="submissions/submission_cnn_glove.csv.gz",
    cnn_fasttext="submissions/submission_cnn_fasttext.csv.gz",
    lstmcnn_glove="submissions/submission_lstm_cnn_glove.csv.gz",
    lstmcnn_fasttext="submissions/submission_lstm_cnn_fasttext.csv.gz",
    logreg="submissions/submission_logistic_regression.csv.gz",
    xgb="submissions/submission_xgb.csv.gz",
)

# Path templates of the pickled per-fold validation predictions, keyed by the
# same model names; the "%s" placeholder is filled with the fold index.
VALIDATION_PRED_FILE = dict(
    gru_glove="cache/gru_glove_validation_pred_fold_%s.pkl",
    gru_fasttext="cache/gru_fasttext_validation_pred_fold_%s.pkl",
    cnn_glove="cache/cnn_glove_validation_pred_fold_%s.pkl",
    cnn_fasttext="cache/cnn_fasttext_validation_pred_fold_%s.pkl",
    lstmcnn_glove="cache/lstm_cnn_glove_validation_pred_fold_%s.pkl",
    lstmcnn_fasttext="cache/lstm_cnn_fasttext_validation_pred_fold_%s.pkl",
    logreg="cache/logreg_validation_pred_fold_%s.pkl",
    xgb="cache/xgb_validation_pred_fold_%s.pkl",
)

In [5]:
# Read the training frame and the sample submission (used below as the output
# template) with identical CSV settings.
train, submission = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (TRAIN, "data/sample_submission.csv")
)

In [6]:
# One DataFrame per model, keyed by model name, read from that model's
# submission file.
subs = {
    name: pd.read_csv(path, encoding="utf-8")
    for name, path in SUBMISSION_IN.items()
}

In [7]:
# Build one validation-prediction array per model by stacking its per-fold
# pickles along axis 0, in ascending fold order.
#
# NOTE(review): the weight search scores these rows against train[TARGETS] row
# by row, which presumes the folds were contiguous, unshuffled slices of the
# training set -- confirm against the code that wrote the cache files.
#
# joblib's `load` unpickles its input, so only point it at cache files that
# were produced locally.
Y = {}

for model, path_template in VALIDATION_PRED_FILE.items():
    fold_predictions = [load(path_template % fold) for fold in range(K_FOLDS)]

    # A single concatenate over all folds; growing the array with np.append
    # inside the loop re-copied every previously loaded row on each fold.
    Y[model] = np.concatenate(fold_predictions, axis=0)

    # Fail here with a readable message instead of deep inside the scorer.
    if len(Y[model]) != len(train):
        raise ValueError(
            "Model '%s': %d stacked validation rows, but the training set has %d rows"
            % (model, len(Y[model]), len(train))
        )
    print("Loaded model '%s'" % model)

Loaded model 'gru_glove'
Loaded model 'gru_fasttext'
Loaded model 'cnn_glove'
Loaded model 'cnn_fasttext'
Loaded model 'lstmcnn_glove'
Loaded model 'lstmcnn_fasttext'
Loaded model 'logreg'
Loaded model 'xgb'


In [8]:
# Ground-truth labels: the TARGETS columns of the training frame as a plain
# array, used as the reference when scoring blended validation predictions.
Y_truth = train.loc[:, TARGETS].values

In [10]:
%%time
# Exhaustive grid search for the blend weights that maximise ROC AUC of the
# weighted validation predictions against Y_truth.
#
# Weights are enumerated as integer tenths (0..4 per model, i.e. 0.0-0.4) and
# turned into floats only after the combination is known to sum to exactly one.
# Testing a float sum with `!= 1.0` is unreliable on this grid:
# np.arange(0.0, 0.5, 0.1) produces 0.30000000000000004 for its fourth element,
# so a valid set such as (..., 0.3, 0.3, 0.3, 0.1) adds up to
# 1.0000000000000002 and would be silently skipped.
#
# best_weights is positional: entry i belongs to model_order[i]. The blending
# cell indexes best_weights in this same order.
model_order = (
    "gru_glove",
    "gru_fasttext",
    "cnn_glove",
    "cnn_fasttext",
    "lstmcnn_glove",
    "lstmcnn_fasttext",
    "logreg",
    "xgb",
)
WEIGHT_STEPS = 10        # grid resolution: every weight is a multiple of 1 / WEIGHT_STEPS
MAX_STEPS_PER_MODEL = 4  # no single model may receive more than 0.4

best_weights = (0, 0, 0, 0, 0, 0, 0, 0)
best_score = 0
for steps in product(range(MAX_STEPS_PER_MODEL + 1), repeat=len(model_order)):
    # Exact integer test: keep only combinations whose weights sum to one.
    if sum(steps) != WEIGHT_STEPS:
        continue

    # float() keeps this a true division under Python 2 as well.
    w_set = tuple(step / float(WEIGHT_STEPS) for step in steps)

    # ROC AUC depends only on the ordering of the scores, so the weighted sum
    # needs no further rescaling before it is scored.
    blended = sum(Y[name] * weight for name, weight in zip(model_order, w_set))
    score = roc_auc_score(Y_truth, blended)

    if score > best_score:
        best_score = score
        best_weights = w_set
        print("Improving: weights %s - score %0.6f" % (", ".join("%0.2f" % w for w in w_set), score))

Improving: weights 0.00, 0.00, 0.00, 0.00, 0.00, 0.20, 0.40, 0.40 - score 0.989671
Improving: weights 0.00, 0.00, 0.00, 0.00, 0.00, 0.30, 0.30, 0.40 - score 0.989892
Improving: weights 0.00, 0.00, 0.00, 0.00, 0.00, 0.30, 0.40, 0.30 - score 0.989985
Improving: weights 0.00, 0.00, 0.00, 0.00, 0.00, 0.40, 0.30, 0.30 - score 0.990070
Improving: weights 0.00, 0.00, 0.00, 0.00, 0.00, 0.40, 0.40, 0.20 - score 0.990088
Improving: weights 0.00, 0.00, 0.00, 0.00, 0.10, 0.20, 0.30, 0.40 - score 0.990195
Improving: weights 0.00, 0.00, 0.00, 0.00, 0.10, 0.20, 0.40, 0.30 - score 0.990285
Improving: weights 0.00, 0.00, 0.00, 0.00, 0.10, 0.30, 0.30, 0.30 - score 0.990386
Improving: weights 0.00, 0.00, 0.00, 0.00, 0.10, 0.30, 0.40, 0.20 - score 0.990407
Improving: weights 0.00, 0.00, 0.00, 0.00, 0.10, 0.40, 0.30, 0.20 - score 0.990417
Improving: weights 0.00, 0.00, 0.00, 0.00, 0.20, 0.30, 0.30, 0.20 - score 0.990471
Improving: weights 0.00, 0.00, 0.00, 0.10, 0.10, 0.20, 0.30, 0.30 - score 0.990501
Impr

In [11]:
# Blend the per-model submission columns with the tuned weights, one target
# column at a time. best_weights is positional; blend_order spells out which
# model each position refers to.
blend_order = (
    "gru_glove",
    "gru_fasttext",
    "cnn_glove",
    "cnn_fasttext",
    "lstmcnn_glove",
    "lstmcnn_fasttext",
    "logreg",
    "xgb",
)

for target in TARGETS:
    weighted_columns = [
        subs[name][target] * weight
        for name, weight in zip(blend_order, best_weights)
    ]
    # Seed the sum with the first term so the additions happen in the same
    # left-to-right order as a hand-written a + b + c + ... chain.
    submission[target] = sum(weighted_columns[1:], weighted_columns[0])

In [12]:
# Write the blended predictions as a gzip-compressed CSV without the index column.
# NOTE(review): "0.9866" was recorded next to this call, presumably the score
# this submission obtained -- confirm.
submission.to_csv(
    SUBMISSION_OUT,
    index=False,
    encoding="utf-8",
    compression="gzip",
)