In [49]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier

# Performance criterion
def compute_pred_score(y_true, y_pred):
    """Return the average penalty of predictions against the true labels.

    The element-wise product ``y_true * y_pred`` is inspected: every entry
    equal to -1 costs 10, every entry equal to 0 costs 1, anything else is
    free.  The total cost is divided by the number of rows of the product.

    Parameters
    ----------
    y_true : array-like supporting ``*`` with ``y_pred``
        Reference labels (presumably -1 / 1 -- not checked here).
    y_pred : array-like
        Predictions; only the values -1, 0 and 1 are accepted.

    Returns
    -------
    float
        The mean penalty (lower is better).

    Raises
    ------
    ValueError
        If ``y_pred`` contains a value other than -1, 0 or 1.
    """
    allowed = (-1, 1, 0)
    if any(label not in allowed for label in np.unique(y_pred)):
        raise ValueError('The predictions can contain only -1, 1, or 0!')

    y_comp = y_true * y_pred
    n_minus_one = np.sum(y_comp == -1)  # product -1: signs disagree
    n_zero = np.sum(y_comp == 0)        # product 0: one side is 0
    return float(10 * n_minus_one + n_zero) / y_comp.shape[0]

# Input files, resolved relative to the notebook's working directory.
X_train_fname = 'training_templates.csv'
y_train_fname = 'training_labels.txt'
X_test_fname = 'testing_templates.csv'

# Header-less, comma-separated feature matrices loaded as plain NumPy arrays.
X_train = pd.read_csv(X_train_fname, sep=',', header=None).values
X_test = pd.read_csv(X_test_fname, sep=',', header=None).values

# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use the builtin directly.
y_train = np.loadtxt(y_train_fname, dtype=int)


def uncerAjust(y_pred, y_pred_pro, threshold=0.9):
    """Replace low-confidence predictions with 0.

    A prediction is considered uncertain when the absolute difference
    between the second and first columns of its probability row is strictly
    below ``threshold``; such predictions are set to 0, the others are kept.

    Unlike the previous implementation, the input ``y_pred`` is left
    untouched: a new array is returned, so calling this function repeatedly
    with different thresholds on the same predictions is safe.

    Parameters
    ----------
    y_pred : array-like, one entry per sample
        Hard predictions.
    y_pred_pro : array-like with at least two columns
        Per-sample scores; only columns 0 and 1 are read.
    threshold : float, default 0.9
        Minimum ``|p[1] - p[0]|`` required to keep a prediction.

    Returns
    -------
    numpy.ndarray
        Copy of ``y_pred`` with the uncertain entries set to 0.
    """
    adjusted = np.array(y_pred)  # np.array copies, so the caller's data is safe
    n_samples = adjusted.shape[0]
    if n_samples == 0:
        # Nothing to adjust; also avoids column-indexing an empty probability array.
        return adjusted

    # Only the first len(y_pred) rows are consulted, as in the original loop.
    proba = np.asarray(y_pred_pro)[:n_samples]
    uncertain = np.abs(proba[:, 1] - proba[:, 0]) < threshold
    adjusted[uncertain] = 0
    return adjusted


def scorer(estimator, X, y):
    """Score a fitted estimator as ``1 - compute_pred_score``.

    The estimator's hard predictions are passed through ``uncerAjust`` with
    its default threshold (using the estimator's ``predict_proba`` output)
    before the penalty is computed against ``y``.  The ``(estimator, X, y)``
    signature is the one scikit-learn expects for a callable ``scoring``
    argument; higher return values are better.
    """
    hard_labels = estimator.predict(X)
    class_proba = estimator.predict_proba(X)
    adjusted_labels = uncerAjust(hard_labels, class_proba)
    penalty = compute_pred_score(y, adjusted_labels)
    return 1 - penalty

# Hold out 2% of the labelled data as a validation split (fixed seed).
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_train, y_train, train_size=0.98, test_size=0.02, random_state=42)

# Project onto 128 whitened principal components.  The randomized solver is
# stochastic, so it is seeded to make the projection reproducible.
pca = PCA(svd_solver='randomized', n_components=128, whiten=True,
          random_state=42)
X_train_pca = pca.fit_transform(X_train_1)
X_test_pca = pca.transform(X_test_1)
# Progress marker; the call form works on both Python 2 and Python 3.
print(1)

# Bagged ensemble of MLPs; each member sees 50% of the samples and 70% of the
# PCA features.  Seeding the ensemble also seeds every base estimator.
clf = BaggingClassifier(MLPClassifier(hidden_layer_sizes=(200, 200, 200)),
                        n_estimators=30, max_features=0.7, max_samples=0.5,
                        n_jobs=-1, random_state=42)
clf.fit(X_train_pca, y_train_1)

1


BaggingClassifier(base_estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(150, 150, 150, 150), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=0.7,
         max_samples=0.5, n_estimators=20, n_jobs=-1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [50]:
threshold = 0

# Reuse the PCA that was fitted before `clf` was trained.  Re-fitting it here
# (as the cell used to do) can yield a different basis with the randomized
# solver, so the classifier would be fed features it was not trained on.
X_test_pca = pca.transform(X_test_1)
y_pred = clf.predict(X_test_pca)
y_pred_pro = clf.predict_proba(X_test_pca)

# Sweep the confidence threshold on the held-out split and keep the value
# with the lowest penalty (first one wins on ties).
score = 1
for thre in np.arange(0.2, 0.8, 0.01):
    # Work on a copy so every threshold is evaluated on the raw predictions.
    y_pred_111 = uncerAjust(y_pred.copy(), y_pred_pro, thre)
    # Argument order is (y_true, y_pred): the {-1, 0, 1} validation must run
    # on the predictions, not on the labels.
    temp = compute_pred_score(y_test_1, y_pred_111)
    if score > temp:
        threshold = thre
        score = temp
        print(temp)
print('the threshold is: %f' % threshold)

# Predict the unlabeled test set with the selected threshold and write the
# result, one integer per line.
X_test = pd.read_csv(X_test_fname,  sep=',', header=None).values
X_test_pca1 = pca.transform(X_test)
y_pred_test = clf.predict(X_test_pca1)
y_pred_test_pro = clf.predict_proba(X_test_pca1)
print(y_pred_test_pro)
y_pred_test = uncerAjust(y_pred_test, y_pred_test_pro, threshold)
np.savetxt('y_pred_bagging.txt', y_pred_test, fmt='%d')

0.125
0.121685606061
0.113636363636
0.105587121212
0.101325757576
0.0984848484848
0.0970643939394
the threshold is: 0.360000
[[  4.13273213e-02   9.58672679e-01]
 [  9.99997373e-01   2.62732895e-06]
 [  4.75954028e-03   9.95240460e-01]
 ..., 
 [  4.09508130e-02   9.59049187e-01]
 [  7.51859942e-03   9.92481401e-01]
 [  5.13714623e-02   9.48628538e-01]]
