In [1]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import lightgbm as lgb

In [2]:
# Load the training CSV; its first column is used as the row index.
train_data = pd.read_csv('../data/full_size/atec_anti_fraud_train.csv', index_col=0)
# Keep only rows whose label is not -1.
labelled_rows = train_data['label'] != -1
train_data = train_data.loc[labelled_rows]
# The first remaining column is taken as the target (presumably 'label' -- confirm
# against the CSV layout); every column after it is treated as a feature.
Y = train_data.iloc[:, 0]
X = train_data.iloc[:, 1:]

In [3]:
# load models
# Score the whole training matrix X with every saved base model and collect
# one score column per model.
model_root='../models/'
model_names=['xgb_300n_5d_0.1l.pkl','lgb_default_early_stop.txt']
model_scores = {}
for model_name in model_names:
    model_path = model_root+model_name
    print('loading model {}'.format(model_name))
    if model_name.endswith('pkl'):
        # NOTE: joblib.load unpickles the file, so only trusted model files
        # should ever be placed under model_root.
        clf = joblib.load(model_path)
        # predict over whole train dataset
        print('{} is predicting....'.format(model_name))
        y_predict_proba = clf.predict_proba(X)
        print("appending {}'s results...".format(model_name))
        # keep only the second predict_proba column (positive class)
        model_scores[model_name] = y_predict_proba[:,1]
    elif model_name.endswith('txt'):
        clf = lgb.Booster(model_file=model_path)
        # presumably Booster.predict already returns one positive-class score
        # per row for this model -- confirm against how it was trained
        y_predict_proba = clf.predict(X)
        print("appending {}'s results...".format(model_name))
        model_scores[model_name] = y_predict_proba
    else:
        # Fail loudly instead of silently dropping a model: a missing column
        # would shift the positional slices used by the later cells.
        raise ValueError('unsupported model file type: {}'.format(model_name))

# build proba_matrix dataframe
# Model score columns come first; the ground-truth label is appended last.
proba_matrix = pd.DataFrame(model_scores,index = X.index)
proba_matrix['truth'] = Y

loading model xgb_300n_5d_0.1l.pkl
xgb_300n_5d_0.1l.pkl is predicting....
appending xgb_300n_5d_0.1l.pkl's results...
loading model lgb_default_early_stop.txt
appending lgb_default_early_stop.txt's results...


In [4]:
# Persist the base-model scores and the truth column; the row index is not written.
ensemble_csv_path = 'ensemble.csv'
proba_matrix.to_csv(ensemble_csv_path, index=False)

In [71]:
# build neural network graph
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.constraints import non_neg
from keras import regularizers
from keras import initializers

def ensembleNet(input_dim):
    """Build and compile a one-unit dense network that blends base-model scores.

    The single Dense unit has no activation argument, so its output is a plain
    weighted sum of the inputs plus a bias. Weights and bias are constrained to
    be non-negative, the weights start near 0.5 and carry an L2 penalty of 0.01.

    Args:
        input_dim: number of input features (one per base model).

    Returns:
        A compiled Keras ``Sequential`` model using binary cross-entropy loss
        and the Adam optimizer.
    """
    weight_init = initializers.RandomNormal(mean=0.5, stddev=0.05, seed=None)
    blend_layer = Dense(
        1,
        input_dim=input_dim,
        kernel_initializer=weight_init,
        kernel_constraint=non_neg(),
        bias_constraint=non_neg(),
        kernel_regularizer=regularizers.l2(0.01),
    )
    model = Sequential()
    model.add(blend_layer)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [72]:
# training ensemble model
# The first 80% of rows (in file order) are used for fitting; the remaining
# rows are left for evaluation.
train_ratio = 0.8
train_num = int(proba_matrix.shape[0]*train_ratio)
# The leading len(model_names) columns of proba_matrix hold the base-model
# scores and 'truth' holds the label. Deriving the slice width from
# model_names keeps it in step with the network's input_dim when a model is
# added, instead of a hard-coded 2 that would then pick a score as the label.
n_models = len(model_names)
ensemble_net = ensembleNet(n_models)
ensemble_net.fit(x=proba_matrix.iloc[:train_num,:n_models].values,
                 y=proba_matrix['truth'].iloc[:train_num].values,
                 epochs=1)

Epoch 1/1


<keras.callbacks.History at 0x16c1c333b38>

In [90]:
from sklearn import metrics
# Score the held-out rows (everything from position train_num onward).
# The slice width follows model_names rather than a hard-coded 2.
n_models = len(model_names)
y_predict_proba = ensemble_net.predict(proba_matrix.iloc[train_num:,:n_models].values)
length= y_predict_proba.shape[0]
# reshape output
y_predict_proba = y_predict_proba.reshape(length,)
# Scores strictly above the threshold are labelled 1, everything else 0.
decision_threshold = 0.1
y_predict = (y_predict_proba > decision_threshold).astype(int)
# .iloc makes the slice explicitly positional, matching the iloc-based slice
# of proba_matrix above regardless of what kind of index Y carries.
y_holdout = Y.iloc[train_num:]
print("precision:{},recall:{}".format(metrics.precision_score(y_holdout,y_predict),metrics.recall_score(y_holdout,y_predict)))

precision:0.4961315280464217,recall:0.84375


In [96]:
# Write the hold-out scores next to their true labels, keyed by the original
# row index. NOTE: this relies on y_predict_proba still holding the 1-D
# hold-out scores from the evaluation cell; re-run that cell first if the
# test-set cell has been executed in between.
holdout_truth = Y.iloc[train_num:]  # explicitly positional slice
holdout_scores = pd.DataFrame({'score':y_predict_proba,'truth':holdout_truth},index=holdout_truth.index)
holdout_scores.to_csv('ensem.csv')
# save ensemble model
ensemble_net.save('../models/ensemble_xgb300n5d0.1l+lgb_default_early_stop.h5')

In [95]:
# generate final output for official test data
# Test-specific names are used throughout so this cell does not overwrite the
# training-time X / proba_matrix / y_predict_proba that the evaluation and
# saving cells read; the cell can then be run in any order relative to them.
test_data = pd.read_csv('../data/full_size/atec_anti_fraud_test_a.csv',index_col = 0)

# load model
model_root='../models/'
model_names=['xgb_300n_5d_0.1l.pkl','lgb_default_early_stop.txt']
test_model_scores = {}
for model_name in model_names:
    model_path = model_root+model_name
    print('loading model {}'.format(model_name))
    if model_name.endswith('pkl'):
        # NOTE: joblib.load unpickles the file, so only trusted model files
        # should ever be placed under model_root.
        clf = joblib.load(model_path)
        # predict over the whole test dataset
        print('{} is predicting....'.format(model_name))
        test_predict_proba = clf.predict_proba(test_data)
        print("appending {}'s results...".format(model_name))
        # keep only the second predict_proba column (positive class)
        test_model_scores[model_name] = test_predict_proba[:,1]
    elif model_name.endswith('txt'):
        clf = lgb.Booster(model_file=model_path)
        test_predict_proba = clf.predict(test_data)
        print("appending {}'s results...".format(model_name))
        test_model_scores[model_name] = test_predict_proba
    else:
        # Fail loudly instead of silently feeding the ensemble fewer columns
        # than it was trained on.
        raise ValueError('unsupported model file type: {}'.format(model_name))

# generate result
test_proba_matrix = pd.DataFrame(test_model_scores,index = test_data.index)
# predict returns one row per sample; flatten to a 1-D score vector
scores = ensemble_net.predict(test_proba_matrix.values).reshape(-1)
result = pd.DataFrame({'score':scores},index=test_data.index)
result.to_csv('../submission/ensemble_xgb300n5d0.1l+lgb_default_early_stop.csv')