In [1]:
import pandas as pd
import numpy as np

def FeatureEngineering(trainpath,testpath):
    #### Load Data
    train = pd.read_csv(trainpath)
    test = pd.read_csv(testpath)

    ### 
    y = train['target'].values
    testid= test['id'].values
    
    ### Drop calc
    unwanted = train.columns[train.columns.str.startswith('ps_calc_')]
    train = train.drop(unwanted, axis=1)  
    test = test.drop(unwanted, axis=1)

    train.drop(['id','target'],axis=1,inplace=True)
    test.drop(['id'],axis=1,inplace=True)


    
    trainX = np.array(train)
    testX = np.array(test)
    trainy = np.array(y)
    
    return trainX, trainy, testX

In [14]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler


class Clf4Stack_NNMLP(object):
    def __init__(self, model, max_iter=500, n_splits=5):
        self.n_splits = n_splits
        self.model = model
        self.max_iter = max_iter
        

    def fit_predict(self, trainX, trainy, testX):

        self.train4stackNNMLP = np.zeros(len(trainX))
        self.test4stackNNMLP = np.zeros(len(testX))

        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=0)

        for i, (train_index,test_index) in enumerate(skf.split(trainX, trainy)):
            print("=====Round {0}/{1}=====".format(i+1,self.n_splits))
            X_train, X_test = trainX[train_index], trainX[test_index]
            y_train, y_test = trainy[train_index], trainy[test_index]
	    
            scaler = StandardScaler()
            # Fit only to the training data
            scaler.fit(X_train)

            # Now apply the transformations to the data:
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

            x1, x2, y1, y2 = train_test_split(X_train, y_train, test_size=0.25, random_state=99)
            
            self.model.fit(x1, y1)            
            
            y_pred = self.model.predict_proba(X_test)[:,1]
            self.train4stackNNMLP[test_index] = y_pred
            self.test4stackNNMLP += self.model.predict_proba(testX)[:,1]
        
        self.test4stackNNMLP /= self.n_splits
            
    def output(self,train_file_name='train4stackNNMLP.csv',
                    test_file_name='test4stackNNMLP.csv',
                    col_name='F4stack'):

        pd.DataFrame({col_name:self.train4stackNNMLP}).to_csv(train_file_name,index=False) 
        pd.DataFrame({col_name:self.test4stackNNMLP}).to_csv(test_file_name,index=False)


In [15]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(30,30,30),early_stopping=True)

In [6]:
trainpath = "train.csv"
testpath = "test.csv"

trainX, trainy, testX = FeatureEngineering(trainpath,testpath)

In [16]:
C4S = Clf4Stack_NNMLP(mlp, max_iter=500, n_splits=5)

C4S.fit_predict(trainX, trainy, testX)

C4S.output()

=====Round 1/5=====
=====Round 2/5=====
=====Round 3/5=====
=====Round 4/5=====
=====Round 5/5=====


In [17]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
 
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

In [21]:
gini_normalized(C4S.train4stackNNMLP,trainy)


0.024868745303800479

In [None]:
#gini_normalized(C4S.test4stackNNMLP,y_test)