In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
import xgboost as xgb

from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.regularizers import *
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from keras import optimizers
import keras

import preprocessing

import theano.tensor as T

Using TensorFlow backend.


In [2]:
# Build the train/test frames through the project's `preprocessing` module.
# NOTE(review): the following cell reassigns train_set/test_set from the raw
# CSV files, so on a top-to-bottom run this result is discarded -- confirm
# which of the two loaders is the intended one.
train_set, test_set = preprocessing.main(
    dropID=False,
    threshold=0.5,
    standardize=True,
)

Removing duplicate columns
Replacing missing var3 country value with mean
Adding a feature for the sum of zeros
Log transforming var38 and splitting var38 into two features
Removing features with variance less than 0.500
the features removed were ['ind_var1_0', 'ind_var1', 'ind_var2_0', 'ind_var5_0', 'ind_var5', 'ind_var6_0', 'ind_var6', 'ind_var8_0', 'ind_var8', 'ind_var12_0', 'ind_var12', 'ind_var13_0', 'ind_var13_corto_0', 'ind_var13_corto', 'ind_var13_largo_0', 'ind_var13_largo', 'ind_var13_medio_0', 'ind_var13', 'ind_var14_0', 'ind_var14', 'ind_var17_0', 'ind_var17', 'ind_var18_0', 'ind_var19', 'ind_var20_0', 'ind_var20', 'ind_var24_0', 'ind_var24', 'ind_var25_cte', 'ind_var26_0', 'ind_var26_cte', 'ind_var25_0', 'ind_var30_0', 'ind_var30', 'ind_var31_0', 'ind_var31', 'ind_var32_cte', 'ind_var32_0', 'ind_var33_0', 'ind_var33', 'ind_var34_0', 'ind_var37_cte', 'ind_var37_0', 'ind_var39_0', 'ind_var40_0', 'ind_var40', 'ind_var41_0', 'ind_var44_0', 'ind_var44', 'num_var1_0', 'num_var1'

In [3]:
# Load the raw CSV files from the working directory.
# NOTE(review): this replaces the train_set/test_set returned by
# preprocessing.main(...) in the previous cell -- confirm that is intended.
train_set, test_set = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [4]:
# Split the training frame into the label column and the remaining predictors.
y_train = train_set['TARGET']
X_train = train_set.drop('TARGET', axis=1)

# The test frame is used unchanged (same object, not a copy).
X_test = test_set

In [5]:
# Shared XGBoost classifier. The helper functions defined below read this
# module-level object and update its parameters in place.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    seed=5,
    # Tree params
    max_depth=5,
    min_child_weight=9,
    gamma=0,
    # Sampling params
    subsample=0.8,
    colsample_bytree=0.8,
    # Class balance weight
    scale_pos_weight=1,
)

def xgbParamSearch(X_train, y_train, X_test, scoring='accuracy'):
    """Grid-search XGBoost hyper-parameters and predict with the best estimator.

    Runs a 2-fold ``GridSearchCV`` over the module-level ``model``, prints the
    ROC AUC of the refitted best estimator on the training data, and copies
    the winning parameter combination back onto ``model``.

    Parameters
    ----------
    X_train, y_train :
        Training features and labels passed to ``GridSearchCV.fit``.
    X_test :
        Features to score with the best estimator.
    scoring : str, optional
        Metric used to rank grid candidates. Defaults to ``'accuracy'`` (the
        original behaviour); pass ``'roc_auc'`` to select by the same metric
        that this function prints.

    Returns
    -------
    Column 1 of ``predict_proba`` for ``X_test``.

    Notes
    -----
    The printed AUC is computed on the same rows the model was fitted on, so
    it is an optimistic estimate rather than a validation score.
    """
    cv_params = {
        'max_depth': [3, 4],
        'learning_rate': [0.1],
        'min_child_weight': [5, 7],
        'subsample': [0.8, 0.9],
        'colsample_bytree': [0.8, 0.9],
    }
    GBM = GridSearchCV(model, cv_params, scoring=scoring, cv=2, verbose=True)
    GBM.fit(X_train, y_train)

    y_pred = GBM.predict_proba(X_test)[:, 1]
    y_pred_train = GBM.predict_proba(X_train)[:, 1]

    fpr, tpr, _ = metrics.roc_curve(y_train, y_pred_train)
    # Format the value into the string; passing it as a second print argument
    # left a literal "%d" in the output.
    print('xgb: %f' % metrics.auc(fpr, tpr))

    # Copy every tuned parameter onto the shared model in one call. The earlier
    # per-parameter calls wrote min_child_weight into max_depth, subsample into
    # learning_rate, and never propagated colsample_bytree.
    model.set_params(**GBM.best_params_)

    return y_pred
    

def xgbEarlyStopping(X_train, y_train, X_test):
    """Tune ``n_estimators`` with early-stopped CV, refit, and predict.

    Steps performed on the module-level ``model``:

    1. Run 5-fold ``xgb.cv`` with the model's current booster parameters,
       tracking AUC with ``early_stopping_rounds=50``.
    2. Set ``n_estimators`` to the number of rows in the CV result and refit
       on the full training data.
    3. Print the feature F-scores and the training-set ROC AUC, and write the
       F-scores to ``scores.csv``.

    Parameters
    ----------
    X_train, y_train :
        Training features and labels; ``.values`` is read from both, so they
        are expected to be pandas objects.
    X_test :
        Features to score with the refitted model.

    Returns
    -------
    Column 1 of ``predict_proba`` for ``X_test``.

    Notes
    -----
    The printed AUC is computed on the training rows, so it is an optimistic
    estimate rather than a validation score.
    """
    xgb_param = model.get_xgb_params()
    dtrain = xgb.DMatrix(X_train.values, label=y_train.values, missing=np.nan)
    cv_result = xgb.cv(
        xgb_param,
        dtrain,
        # The model's current n_estimators is the upper bound on rounds tried.
        num_boost_round=model.get_params()['n_estimators'],
        nfold=5,
        metrics=['auc'],
        early_stopping_rounds=50,
        verbose_eval=10)
    # The row count of the CV result is taken as the tuned number of rounds.
    best_n_estimators = cv_result.shape[0]
    model.set_params(n_estimators=best_n_estimators)

    model.fit(X_train, y_train, eval_metric='auc')

    # Compute the F-score ranking once and reuse it for printing and export.
    features = pd.Series(model.booster().get_fscore()).sort_values(ascending=False)
    index = features.index
    print((index, features))

    y_pred = model.predict_proba(X_test)[:, 1]
    y_pred_train = model.predict_proba(X_train)[:, 1]

    fpr, tpr, _ = metrics.roc_curve(y_train, y_pred_train)
    # Format the value into the string; passing it as a second print argument
    # left a literal "%d" in the output.
    print('xgb: %f' % metrics.auc(fpr, tpr))

    # Persist the feature scores (this is not a competition submission file).
    scores = pd.DataFrame({"index": index, "features": features})
    scores.to_csv("scores.csv", index=False)

    return y_pred
    


In [6]:
from keras.optimizers import SGD
from sklearn.cross_validation import train_test_split

def keras_NN(Xtrain, ytrain, Xtest):
    """Train a single-hidden-layer Keras network and score the test matrix.

    The features are standardised, 10% of the training rows are held out,
    and a 400-unit tanh layer followed by a 1-unit sigmoid output is trained
    for 10 epochs with Adagrad on binary cross-entropy.

    Parameters
    ----------
    Xtrain, ytrain :
        Training feature matrix and labels.
    Xtest :
        Feature matrix to score; must have the same columns as ``Xtrain``.

    Returns
    -------
    list
        Flattened ``predict_proba`` output for ``Xtest``, one value per row.
    """
    # Fit the scaler on the training data only and reuse it for the test data
    # so both matrices are transformed with the same statistics. Fitting a
    # second scaler on the test set would put the two on different scales.
    scaler = StandardScaler().fit(Xtrain)
    Xtrain = scaler.transform(Xtrain)
    Xtest = scaler.transform(Xtest)

    train_X, test_X, train_y, test_y = train_test_split(
        Xtrain, ytrain, train_size=0.9, random_state=0)

    # Named `net` so it is not confused with the module-level XGBoost `model`.
    net = Sequential()
    net.add(Dense(input_dim=Xtrain.shape[1],
                  output_dim=400,
                  init='uniform',
                  activation='tanh'))
    net.add(Dense(input_dim=400,
                  output_dim=1,
                  init='uniform',
                  activation='sigmoid'))

    opt = optimizers.Adagrad(lr=0.0125)
    net.compile(loss='binary_crossentropy', optimizer=opt)

    # NOTE(review): the class weights presumably mirror the class proportions
    # of TARGET -- confirm against the training data.
    net.fit(train_X, train_y, nb_epoch=10, class_weight={0: 0.0396, 1: 0.9604})

    # Report the hold-out loss instead of discarding evaluate()'s return value.
    holdout_loss = net.evaluate(test_X, test_y)
    print('keras_NN hold-out loss: %s' % (holdout_loss,))

    y_pred = net.predict_proba(Xtest)
    # Flatten the per-row output into one flat list.
    return [prob for row in y_pred for prob in row]

In [7]:
from sklearn.ensemble import BaggingClassifier

# Wrap the shared `model` estimator in a BaggingClassifier with 10 estimators,
# fit it on the training data, and keep column 1 of predict_proba for the
# test set (presumably the positive-class probability -- confirm class order).
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, and
# y_pred is reassigned by a later cell, so this result is not what gets
# written to the submission file.
classifier = BaggingClassifier(model, n_estimators=10)
classifier.fit(X_train, y_train)
y_pred = classifier.predict_proba(X_test)[:, 1]

KeyboardInterrupt: 

In [9]:
# Produce test-set predictions with the grid-searched XGBoost model.
y_pred = xgbParamSearch(X_train=X_train, y_train=y_train, X_test=X_test)

# Alternative predictors (enable one of these instead of the call above):
#y_pred = xgbEarlyStopping(X_train, y_train, X_test)
#y_pred = keras_NN(X_train.as_matrix(), y_train.as_matrix(), X_test.as_matrix())

print(y_pred)

Fitting 2 folds for each of 16 candidates, totalling 32 fits


[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:  3.7min finished


xgb: %d 0.854269686231
[ 0.06043969  0.0559074   0.00210035 ...,  0.00370104  0.06048683
  0.0014752 ]


In [10]:
# Re-read the test file (this rebinds X_test) so each ID can be paired with
# its predicted value, then write the two-column submission file.
X_test = pd.read_csv('test.csv')

submission = pd.DataFrame({
    "ID": X_test['ID'],
    "TARGET": y_pred,
})
submission.to_csv(path_or_buf="submission.csv", index=False)

In [None]:
# Display summary statistics of the test frame's columns.
X_test.describe()

In [None]:
# Blend the predictions of three models with a second-stage classifier.
# NOTE(review): xgb_train/xgb_es_train/rf_train, their *_test counterparts and
# logistic_regression are not defined in the cells shown here -- confirm where
# they come from before running this cell on a fresh kernel.

# Stack the per-model predictions as rows (presumably one 1-D vector each).
blended_train_set = np.vstack([xgb_train, xgb_es_train, rf_train])
blended_test_set = np.vstack([xgb_test, xgb_es_test, rf_test])

# Transpose so that every model becomes one named column.
blended_train_df = pd.DataFrame(np.transpose(blended_train_set),
                                columns=['xgb-grid', 'xgb-es', 'rand-f'])
blended_test_df = pd.DataFrame(np.transpose(blended_test_set),
                               columns=['xgb-grid', 'xgb-es', 'rand-f'])

y_pred_train, y_pred_test = logistic_regression(blended_train_df, y_train, blended_test_df)