In [1]:
%matplotlib inline
import data
import numpy as np
import matplotlib.pyplot as plt
from classifiers import GRL_classifier, build_sequential_model
import stats

Using Theano backend.


In [2]:
# Folder that holds every competition CSV used below; all loads go through it
# so that changing the location only requires editing this one line.
data_folder = "./"

# data.load is unpacked as a 5-tuple everywhere in this cell. Judging by which
# slot is kept with which flag, the slots look like
# (features, labels, weights, mass, ids) -- TODO confirm against data.load.
# NOTE(review): shuffle=True is used here before np.random.seed(42) is called
# in a later cell; if data.load shuffles through NumPy's global RNG the row
# order is not reproducible across kernel restarts -- confirm.
X, y, _, _, _ = data.load(data_folder + "training.csv",
                          shuffle=True)

# check_agreement.csv is read twice: once unshuffled with weights (kept for
# the metrics callback) and once shuffled (used as extra training data).
Xa, ya, wa, _, _ = data.load(data_folder + "check_agreement.csv",
                             shuffle=False, weight=True)
Xa_train, ya_train, _, _, _ = data.load(data_folder + "check_agreement.csv",
                                        shuffle=True)
Xc, _, _, mc, _ = data.load(data_folder + "check_correlation.csv",
                            shuffle=False, mass=True, test=True)

# The scaler returned by the first preprocess_data call is passed back in for
# every other dataset, so all of them go through the same transformation.
X, scaler = data.preprocess_data(X)
Xa, _ = data.preprocess_data(Xa, scaler)
Xa_train, _ = data.preprocess_data(Xa_train, scaler)
Xc, _ = data.preprocess_data(Xc, scaler)

# Fixed: test.csv previously bypassed data_folder, unlike every other file.
X_test, _, _, _, ids = data.load(data_folder + "test.csv", test=True,
                                 ids=True)
X_test, _ = data.preprocess_data(X_test, scaler)

In [None]:
from keras.layers import PReLU, Dropout, Dense

def feature_extractor(input_size, output_size):
    """Assemble the shared feature-extractor network.

    The stack is four Dense -> PReLU -> Dropout stages (150, 140, 120 and 110
    units, with dropout rates 0.5, 0.45, 0.42 and 0.4) followed by a final
    Dense layer of ``output_size`` units created without an activation
    argument.

    Parameters
    ----------
    input_size : int
        Passed as ``input_dim`` to the first Dense layer.
    output_size : int
        Number of units in the last Dense layer.

    Returns
    -------
    Whatever ``build_sequential_model`` returns for this layer list, built
    with ``name="feature_extractor"``.
    """
    # (units, dropout rate) for each hidden stage, in stacking order.
    hidden_stages = [(150, 0.5), (140, 0.45), (120, 0.42), (110, 0.4)]

    layers = []
    for position, (units, drop_rate) in enumerate(hidden_stages):
        # Only the very first layer declares the input dimensionality.
        if position == 0:
            layers.append(Dense(units, input_dim=input_size))
        else:
            layers.append(Dense(units))
        layers.append(PReLU())
        layers.append(Dropout(drop_rate))
    layers.append(Dense(output_size))

    return build_sequential_model(layers, name="feature_extractor")


def label_classifier(input_size, name="label_classifier"):
    """Assemble a two-way softmax classifier head.

    The stack is three Dense -> PReLU -> Dropout stages (90, 80 and 70 units,
    with dropout rates 0.4, 0.35 and 0.3) followed by a 2-unit Dense layer
    with softmax activation.

    Parameters
    ----------
    input_size : int
        Passed as ``input_dim`` to the first Dense layer.
    name : str
        Forwarded to ``build_sequential_model`` as the model name, so the same
        architecture can be instantiated under different names.

    Returns
    -------
    Whatever ``build_sequential_model`` returns for this layer list.
    """
    # (units, dropout rate) for each hidden stage, in stacking order.
    hidden_stages = ((90, 0.4), (80, 0.35), (70, 0.3))

    layers = []
    for units, drop_rate in hidden_stages:
        # The list is still empty only for the first stage, which is the one
        # that must declare the input dimensionality.
        dense_kwargs = {} if layers else {"input_dim": input_size}
        layers += [Dense(units, **dense_kwargs), PReLU(), Dropout(drop_rate)]
    layers.append(Dense(2, activation='softmax'))

    return build_sequential_model(layers, name=name)

In [None]:
# Total epoch budget, split between the two training phases below by
# transfering_ratio.
n_epochs = 300
np.random.seed(42)  # repeatability

from keras.utils import np_utils
# Convert the class-label vectors into the categorical matrices that the
# 2-unit softmax heads are trained against.
y_categorical = np_utils.to_categorical(y)
ya_categorical = np_utils.to_categorical(ya_train)

# Build the three sub-networks: a shared feature extractor and two heads with
# the same architecture but different names.
n_extracted_features = 100
f = feature_extractor(X.shape[1], n_extracted_features)
l = label_classifier(n_extracted_features)
d = label_classifier(n_extracted_features, name="domain_classifier")

# Fraction of n_epochs spent in the second (transfer) phase; the remainder is
# spent in the first phase on the training set alone.
transfering_ratio = 0.70

# Metric histories accumulated over both phases, one entry per value the
# callback recorded.
ks_plot, cvm_plot, auc_plot = [], [], []

# ---- Phase 1: learning on train ---------------------------------------
# The fourth GRL_classifier argument is 0 here and model.lam is raised later,
# so it is presumably the initial lambda -- TODO confirm against
# classifiers.GRL_classifier.
model = GRL_classifier(f, l, d, 0)
metrics_callback = stats.ShowMetrics(model, Xa, ya, wa, Xc, mc, X, y_categorical)
# NOTE(review): the second and third positional arguments look like the
# targets for the label head and the domain head respectively -- confirm
# against GRL_classifier.fit.
model.fit(X, y_categorical, y_categorical,
          epoch_count=int((1 - transfering_ratio) * n_epochs),
          batch_size=128, validation_split=0.05, verbose=2,
          callbacks=[metrics_callback])
ks_plot += metrics_callback.history_ks
cvm_plot += metrics_callback.history_cvm
auc_plot += metrics_callback.history_auc

# ---- Phase 2: transferring to check_agreement --------------------------
# The phase-1 model's own predictions on the agreement data are used as that
# data's label-head targets for the whole of phase 2.
ya_output = model.predict(np.array(Xa_train))
steps = 90

# Everything below is loop-invariant, so it is built once instead of once per
# step. The originals of these arrays were already reused across fit calls,
# so this relies on the same assumption the notebook already made: fit does
# not modify its inputs in place.
lam_schedule = np.linspace(0, 0.3, steps)
transfer_inputs = np.vstack((Xa_train, X))
transfer_label_targets = np.vstack((ya_output, y_categorical))
transfer_domain_targets = np.vstack((ya_categorical, y_categorical))
epochs_per_step = int(transfering_ratio * n_epochs / steps)
X_test_array = np.array(X_test)

for lam in lam_schedule:
    print('lambda = ', lam)
    model.lam = lam
    # A fresh callback per step, so its history holds only this step's values.
    metrics_callback = stats.ShowMetrics(model, Xa, ya, wa, Xc, mc, X, y_categorical)
    model.fit(transfer_inputs, transfer_label_targets,
              transfer_domain_targets,
              epoch_count=epochs_per_step,
              batch_size=512, validation_split=0.25, verbose=2,
              callbacks=[metrics_callback])
    cvm_plot += metrics_callback.history_cvm
    ks_plot += metrics_callback.history_ks
    auc_plot += metrics_callback.history_auc
    # Save a submission file for every lambda value in the schedule.
    p = model.predict_probs(X_test_array)
    data.save_submission(ids, p, "grl_prediction_{}__.csv".format(lam))

# ---- Metric curves ------------------------------------------------------
# One figure per metric. plt.legend() is required for the label passed to
# plt.plot to actually be drawn; without it the figures were unlabelled.
for history, curve_label in ((cvm_plot, 'CvM'), (ks_plot, 'KS'), (auc_plot, 'AUC')):
    plt.plot(history, label=curve_label)
    plt.legend()
    plt.show()

# Output
probs = model.predict_probs(X_test_array)

Train on 64175 samples, validate on 3378 samples
Epoch 1/90
33s - loss: 0.8411 - label_classifier_acc: 0.8144 - domain_classifier_acc: 0.8110 - val_loss: 0.6139 - val_label_classifier_acc: 0.8801 - val_domain_classifier_acc: 0.8807
Epoch 2/90
31s - loss: 0.6655 - label_classifier_acc: 0.8712 - domain_classifier_acc: 0.8711 - val_loss: 0.6516 - val_label_classifier_acc: 0.8840 - val_domain_classifier_acc: 0.8845


In [None]:
data.save_submission(ids, probs, "grl_prediction.csv")
!zip -9 -r grl_prediciton.csv.zip grl_prediction.csv