In [1]:
%matplotlib inline
import data
import numpy as np
import matplotlib.pyplot as plt
from classifiers import GRL_classifier, build_sequential_model
import stats
from keras.utils import np_utils

Using Theano backend.


In [2]:
# Directory holding the competition CSV files. File names are appended by plain
# string concatenation, so the value must end with a path separator.
data_folder = "./"

# data.load returns a 5-tuple; judging by the call sites below the slots are
# consumed as (features, labels, weights, mass, ids) and unused slots are
# discarded with `_`.
# NOTE(review): the shuffle=True loads run before np.random.seed(42) is called
# in the later configuration cell; if data.load shuffles through NumPy's global
# RNG the row order is not reproducible across kernel restarts - confirm.
X, y, _, _, _ = data.load(data_folder + "training.csv", shuffle=True)
# Agreement sample in file order, with per-row weights, used for metric evaluation.
Xa, ya, wa, _, _ = data.load(data_folder + "check_agreement.csv", shuffle=False, weight=True)
# Same agreement file loaded a second time, shuffled, for use as training input.
Xa_train, ya_train, _, _, _ = data.load(data_folder + "check_agreement.csv", shuffle=True)
# Correlation sample: features plus the mass column, no labels.
Xc, _, _, mc, _ = data.load(data_folder + "check_correlation.csv", shuffle=False, mass=True, test=True)

# The scaler returned for the training features is passed back in for every
# other dataset, so all of them go through the same transformation.
X, scaler = data.preprocess_data(X)
Xa, _ = data.preprocess_data(Xa, scaler)
Xa_train, _ = data.preprocess_data(Xa_train, scaler)
Xc, _ = data.preprocess_data(Xc, scaler)

# One-hot encode the targets for the softmax heads.
y_categorical = np_utils.to_categorical(y)
ya_categorical = np_utils.to_categorical(ya_train)

# Fix: the test set is now read from data_folder like every other file, so
# changing data_folder relocates all inputs consistently.
X_test, _, _, _, ids = data.load(data_folder + "test.csv", test=True, ids=True)
X_test, _ = data.preprocess_data(X_test, scaler)

In [3]:
from keras.layers import PReLU, Dropout, Dense

def feature_extractor(input_size, output_size):
    """Assemble the shared feature-extraction network.

    The network is four Dense -> PReLU -> Dropout stages (150/140/120/110
    units with dropout rates 0.5/0.45/0.42/0.4) followed by a final
    ``Dense(output_size)`` that is given no activation argument.

    Args:
        input_size: width of the input feature vector, passed as
            ``input_dim`` to the first Dense layer only.
        output_size: number of units in the final Dense layer.

    Returns:
        Whatever ``build_sequential_model`` returns for this layer list,
        built under the name ``"feature_extractor"``.
    """
    # (units, dropout rate) for each hidden stage, in network order.
    stages = ((150, 0.5), (140, 0.45), (120, 0.42), (110, 0.4))

    layers = []
    for position, (units, drop_rate) in enumerate(stages):
        # Only the very first Dense layer is told the input width.
        if position == 0:
            dense = Dense(units, input_dim=input_size)
        else:
            dense = Dense(units)
        layers.extend([dense, PReLU(), Dropout(drop_rate)])
    layers.append(Dense(output_size))

    return build_sequential_model(layers, name="feature_extractor")

def label_classifier(input_size, name="label_classifier"):
    """Assemble a two-class softmax classification head.

    The head is three Dense -> PReLU -> Dropout stages (90/80/70 units with
    dropout rates 0.4/0.35/0.3) followed by ``Dense(2, activation='softmax')``.
    The notebook uses it for both the label head and, under a different
    ``name``, the domain head.

    Args:
        input_size: width of the incoming feature vector, passed as
            ``input_dim`` to the first Dense layer only.
        name: name forwarded to ``build_sequential_model``.

    Returns:
        Whatever ``build_sequential_model`` returns for this layer list.
    """
    # (units, dropout rate) for each hidden stage, in network order.
    stages = ((90, 0.4), (80, 0.35), (70, 0.3))

    layers = []
    for position, (units, drop_rate) in enumerate(stages):
        # Only the very first Dense layer is told the input width.
        if position == 0:
            dense = Dense(units, input_dim=input_size)
        else:
            dense = Dense(units)
        layers.extend([dense, PReLU(), Dropout(drop_rate)])
    layers.append(Dense(2, activation='softmax'))

    return build_sequential_model(layers, name=name)

In [8]:
# --- Experiment configuration ---------------------------------------------
np.random.seed(42)  # seed NumPy's global RNG

# Epoch budget: the pre-training cell uses (1 - transfering_ratio) * n_epochs
# epochs, and the transfer cell spreads transfering_ratio * n_epochs over `steps`.
n_epochs = 130
transfering_ratio = 0.5
steps = 50

# Width of the feature extractor's output / the classifier heads' input.
n_extracted_features = 120

# End points of the lambda schedule swept during the transfer stage.
lambda_low, lambda_high = 0.1, 0.5

# Metric histories accumulated across training stages. `plots` holds the very
# same list objects, in (KS, CvM, AUC) order, so extending an element of
# `plots` extends the named list too.
plots = [[], [], []]
ks_plot, cvm_plot, auc_plot = plots

In [9]:
# Stage 1: learning on the labelled training set only.
f = feature_extractor(X.shape[1], n_extracted_features)
l = label_classifier(n_extracted_features)
d = label_classifier(n_extracted_features, name="domain_classifier")
# The trailing 0 is presumably the initial gradient-reversal weight (the
# transfer stage later overwrites model.lam) - confirm against GRL_classifier.
model = GRL_classifier(f, l, d, 0)

# Callback that reports KS / CvM / AUC during fitting and keeps their history.
metrics_callback = stats.ShowMetrics(model, Xa, ya, wa, Xc, mc, X, y_categorical, verbose=True)

# This stage gets the non-transfer share of the epoch budget.
pretrain_epochs = int((1 - transfering_ratio) * n_epochs)

# Both target slots receive the training labels in this stage.
model.fit(
    X, y_categorical, y_categorical,
    epoch_count=pretrain_epochs,
    batch_size=128,
    validation_split=0.05,
    verbose=2,
    callbacks=[metrics_callback],
)

# Append this stage's metric histories to the running (KS, CvM, AUC) lists.
for accumulated, fresh in zip(plots, metrics_callback.get_history()):
    accumulated.extend(fresh)

Train on 64175 samples, validate on 3378 samples
Epoch 1/65
KS: 0.1345381708292842 : 0.09 / CvM: 0.0010403976904824895 : 0.002 / AUC: 0.9896980205443424
37s - loss: 0.8414 - label_classifier_acc: 0.8157 - domain_classifier_acc: 0.8111 - val_loss: 0.6694 - val_label_classifier_acc: 0.8777 - val_domain_classifier_acc: 0.8763
Epoch 2/65
KS: 0.1505369911964719 : 0.09 / CvM: 0.0011039469842898128 : 0.002 / AUC: 0.9913072574083023
32s - loss: 0.6615 - label_classifier_acc: 0.8711 - domain_classifier_acc: 0.8706 - val_loss: 0.6809 - val_label_classifier_acc: 0.8828 - val_domain_classifier_acc: 0.8825
Epoch 3/65
KS: 0.17392787792453773 : 0.09 / CvM: 0.0011031902847038948 : 0.002 / AUC: 0.9918889827150537
31s - loss: 0.6351 - label_classifier_acc: 0.8753 - domain_classifier_acc: 0.8753 - val_loss: 0.6368 - val_label_classifier_acc: 0.8804 - val_domain_classifier_acc: 0.8810
Epoch 4/65
KS: 0.1821359711133116 : 0.09 / CvM: 0.001088637091324756 : 0.002 / AUC: 0.9923215642877179
31s - loss: 0.6178 

In [None]:
# Transferring to check_agreement
# The model's own predictions on the agreement sample are taken once, before
# any transfer step, and reused as that sample's targets in the second fit slot.
ya_output = model.predict(np.array(Xa_train))

# Everything below is loop-invariant, so it is built once instead of on every
# one of the `steps` iterations.
lambdas = np.linspace(lambda_low, lambda_high, steps)
# int() truncates: with the configuration cell's current values
# 0.5 * 130 / 50 = 1.3, i.e. one epoch per step.
epochs_per_step = int(transfering_ratio * n_epochs / steps)
# Agreement rows first, training rows second, in all three stacked arrays.
# Presumably the second fit argument feeds the label head and the third the
# domain head - confirm against GRL_classifier.fit.
stacked_inputs = np.vstack((Xa_train, X))
stacked_label_targets = np.vstack((ya_output, y_categorical))
stacked_domain_targets = np.vstack((ya_categorical, y_categorical))

for step, lam in enumerate(lambdas):
    print('lambda = ', lam)
    model.lam = lam
    # Fresh callback per step so get_history() only holds this step's metrics.
    metrics_callback = stats.ShowMetrics(model, Xa, ya, wa, Xc, mc, X, y_categorical, verbose=True)
    model.fit(stacked_inputs, stacked_label_targets,
              stacked_domain_targets,
              epoch_count=epochs_per_step,
              batch_size=512, validation_split=0.25, verbose=2,
              callbacks=[metrics_callback])
    for old_plot, new_plot in zip(plots, metrics_callback.get_history()):
        old_plot += new_plot
    # Snapshot a submission file for every lambda value visited.
    p = model.predict_probs(np.array(X_test))
    data.save_submission(ids, p, "grl_prediction_{}__.csv".format(lam))

In [None]:
# One figure per tracked metric, in the original CvM, KS, AUC order.
# The original passed label= but never called plt.legend(), so the three
# figures were indistinguishable; each one now carries a legend, a title and
# axis labels.
for series, metric_name in ((cvm_plot, 'CvM'), (ks_plot, 'KS'), (auc_plot, 'AUC')):
    plt.plot(series, label=metric_name)
    plt.title(metric_name + ' history')
    # The x axis is simply the position in the accumulated history list.
    plt.xlabel('history index')
    plt.ylabel(metric_name)
    plt.legend()
    plt.show()

In [None]:
probs = model.predict_probs(np.array(X_test))
data.save_submission(ids, probs, "grl_prediction.csv")
!zip -9 -r grl_prediciton.csv.zip grl_prediction.csv