In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
%matplotlib inline
import warnings;warnings.filterwarnings('ignore')
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
from sklearn.preprocessing import Imputer, LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.externals import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



In [2]:
# Column names shared by the cells below.
uid = "申请编号"   # "application number": key used to merge features with labels
target = "标签"    # "label": column that is split off as y

def calc_auc(y_test, y_proba):
    """Return the ROC AUC of ``y_proba`` against ``y_test``, rounded to 3 decimals.

    Thin wrapper around ``metrics.roc_auc_score``.
    """
    return round(metrics.roc_auc_score(y_test, y_proba), 3)

def ks_score(y_test, y_proba, scale=4):
    """Return the KS score: the largest gap between TPR and FPR on the ROC curve.

    Parameters
    ----------
    y_test : array-like
        True binary labels; class ``1`` is treated as the positive class.
    y_proba : array-like
        Scores or probabilities for the positive class.
    scale : int, default 4
        Number of decimals the result is rounded to.

    Returns
    -------
    float
        ``max(tpr - fpr)`` over all thresholds returned by
        ``metrics.roc_curve``, rounded to ``scale`` decimals.
    """
    fpr, tpr, _ = metrics.roc_curve(y_test, y_proba, pos_label=1)
    # tpr and fpr are ndarrays: take the maximum in numpy instead of
    # materialising a Python list first.
    return round((tpr - fpr).max(), scale)

In [3]:
############## Load

# Number of CV folds; also picked up as the default ``n_folds`` of StackingModels.
n_folds = 5

# Debug tip: append ``.head(10000)`` to both ``read_csv`` calls below to work
# on a small subset while iterating.

''' Load '''
X = pd.read_csv('./tmp/train_d1234.csv', index_col=0, header=0)
X.shape

y = pd.read_csv('./data/train_label.csv', index_col=0, header=0)
y.shape

''' Merge '''
# Inner-join features and labels on the id, then discard the id itself.
xy = pd.merge(X, y, how='inner', on=uid).drop(uid, axis=1)
xy.shape

''' Split '''
# The label column becomes y; every remaining column is a feature.
y = xy[target]
X = xy.drop(target, axis=1)
X.shape
y.shape

# An earlier variant of this cell loaded './tmp/train_d12_d3_dum.csv' and
# './tmp/1_y.csv' (column '0') as plain ``.values`` arrays instead.

' Load '

(140000, 405)

(140000, 1)

' Merge '

(140000, 405)

' Split '

(140000, 404)

(140000,)

In [4]:
# Fixed (non-tuned) constructor arguments per estimator. When the models are
# built these are merged over the tuned parameters, so they win on key clashes.

# RF
param_fixed_rf = dict(
    n_jobs=-1,
    oob_score=True,
    random_state=123,
    verbose=0,
)

# XGB
# NOTE(review): `verbose_eval` is usually an argument of the native training
# API rather than of the estimator constructor - confirm it has an effect here.
param_fixed_xgb = dict(
    n_jobs=-1,
    eval_metric='auc',
    seed=123,
    silent=1,
    verbose_eval=0,
)

# LGB
param_fixed_lgb = dict(
    n_jobs=-1,
    metric='auc',
    random_state=123,
    bagging_seed=123,
    feature_fraction_seed=123,
    verbose_eval=0,
)

# LR
param_fixed_lr = dict(
    n_jobs=-1,
    random_state=123,
    verbose=0,
)

In [None]:
class StackingModels (BaseEstimator, ClassifierMixin, TransformerMixin):
    """Two-level stacking ensemble for binary classification.

    Every base model is trained with K-fold cross-validation on its own column
    subset. The out-of-fold positive-class probabilities of all base models
    form the training matrix of the meta model.

    Parameters
    ----------
    base_models : list of estimators
        Unfitted prototypes exposing ``fit`` and ``predict_proba``; they are
        cloned for every fold and never fitted themselves.
    base_feats : sequence of column lists
        ``base_feats[i]`` selects the columns of the input frame that are fed
        to ``base_models[i]``.
    meta_model : estimator
        Prototype of the second-level model; a clone is fitted in ``fit``.
    n_folds : int
        Number of K-fold splits (defaults to the module-level ``n_folds``).

    Attributes
    ----------
    base_models_ : list of lists
        ``base_models_[i]`` holds the fitted per-fold clones of ``base_models[i]``.
    meta_model_ : estimator
        Fitted clone of ``meta_model``.
    meta_X : ndarray of shape (n_samples, n_base_models)
        Out-of-fold predictions from the last ``fit`` call (``[]`` before fitting).
    """

    def __init__(self, base_models, base_feats, meta_model, n_folds = n_folds):
        self.base_models = base_models
        self.base_feats = base_feats
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.meta_X = []

    def fit(self, X_ttl, y):
        """Fit per-fold base models, then the meta model on their out-of-fold predictions.

        ``X_ttl`` must support column selection by label and ``.iloc``
        (a DataFrame); ``y`` must support ``.iloc`` (a Series).

        Raises
        ------
        ValueError
            If there are fewer feature lists than base models.
        """
        # Fail before any training instead of hitting an index error after
        # the first models have already been fitted.
        if len(self.base_feats) < len(self.base_models):
            raise ValueError(
                f'Got {len(self.base_models)} base models but only '
                f'{len(self.base_feats)} feature lists'
            )

        self.base_models_ = [[] for _ in self.base_models]
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=123)

        # Column i receives the out-of-fold predictions of base model i.
        out_of_fold_predictions = np.zeros((X_ttl.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            feat_cols = self.base_feats[i]
            print(f'*** Base Model: {model.__class__}, Features: {len(feat_cols)} ***')
            X = X_ttl[feat_cols]
            # KFold.split yields positional indices, hence .iloc below.
            for j, (train_index, valid_index) in enumerate(kfold.split(X, y)):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X.iloc[train_index],  y.iloc[train_index])
                y_pred = instance.predict_proba(X.iloc[valid_index])[:,1]
                ks = ks_score(y.iloc[valid_index], y_pred)
                print(f'* KS({j}):{ks} *')
                out_of_fold_predictions[valid_index, i] = y_pred

        # Meta model: trained on the out-of-fold predictions only.
        self.meta_X = out_of_fold_predictions
        self.meta_model_ = clone(self.meta_model)
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X_ttl, if_meta=True):
        """Return positive-class scores for the rows of ``X_ttl``.

        For each base model the probabilities of its per-fold clones are
        averaged, giving one meta feature per base model.

        Parameters
        ----------
        X_ttl : DataFrame
            Must contain every column listed in ``base_feats``.
        if_meta : bool, default True
            If True, feed the meta features to the fitted meta model and
            return its positive-class probability. If False, return the plain
            mean of the meta features.

        Returns
        -------
        ndarray of shape (n_samples,)
            A numpy array in both modes.
        """
        # Build the meta features as an ndarray: that is the type the meta
        # model was fitted on, and it keeps the return type the same for
        # both values of ``if_meta``.
        per_model_means = []
        for i, models_kfold in enumerate(self.base_models_):
            X = X_ttl[self.base_feats[i]]
            fold_probs = np.column_stack(
                [model.predict_proba(X)[:,1] for model in models_kfold]
            )
            per_model_means.append(fold_probs.mean(axis=1))
        meta_features = np.column_stack(per_model_means)

        if if_meta:
            return self.meta_model_.predict_proba(meta_features)[:,1]
        return meta_features.mean(axis=1)
    
############ Stacking

### Base models

def _load_saved_dict(path):
    """Load the single pickled object stored in the ``.npy`` file at ``path``.

    NOTE(security): ``allow_pickle=True`` unpickles arbitrary Python objects;
    only load files that come from a trusted source.
    """
    return np.load(path, allow_pickle=True).item()


def _build_model(estimator_cls, params_path, fixed_params):
    """Instantiate ``estimator_cls`` from saved tuned params; ``fixed_params`` win on key clashes."""
    return estimator_cls(**{**_load_saved_dict(params_path), **fixed_params})


# Base Features
feats = _load_saved_dict('./model/base_features.npy')

# One estimator per algorithm: tuned parameters from disk plus the fixed ones.
RF = _build_model(RandomForestClassifier, './model/base_rf.npy', param_fixed_rf)
XGB = _build_model(XGBClassifier, './model/base_xgb.npy', param_fixed_xgb)
LGB = _build_model(LGBMClassifier, './model/base_lgb.npy', param_fixed_lgb)
LR = _build_model(LogisticRegression, './model/base_lr.npy', param_fixed_lr)
# Mean
MEAN = 'MEAN'

### Stacking
# Single source of truth for the base-model order: it drives the model list,
# the per-model feature lists and the column names of the saved meta features.
base_names = ['lgb', 'rf', 'xgb']
models_by_name = {'lgb': LGB, 'rf': RF, 'xgb': XGB}
base_models = [models_by_name[name] for name in base_names]
base_feats = [feats[name] for name in base_names]
meta_model = LR
CLF = StackingModels(base_models=base_models, base_feats=base_feats, meta_model=meta_model)

# Fit
CLF.fit(X, y)

# Predict
# Note: this KS is computed on the same rows the ensemble was fitted on.
pred = CLF.predict(X)
ks_score(y, pred)

# Save meta_X
meta_X = pd.DataFrame(CLF.meta_X, columns=base_names)
meta_X.shape
meta_X.head()
meta_X.to_csv('./model/x_meta.csv')

*** Base Model: <class 'lightgbm.sklearn.LGBMClassifier'>, Features: 242 ***
* KS(0):0.3675 *
* KS(1):0.3679 *
* KS(2):0.3538 *


In [6]:
############## Test

def norm_score(score):
    """Clamp ``score`` to the closed interval [0, 1].

    Values below 0 become ``0``, values above 1 become ``1``; anything else
    is returned unchanged.
    """
    return 0 if score < 0 else (1 if score > 1 else score)

# Load the scoring set; the id column is dropped from the features and kept
# for the output file.
test = pd.read_csv('./tmp/A_d1234.csv', header=0, index_col=0)
X_test = test.drop(uid, axis=1)
X_test.shape

# Predict
scores = CLF.predict(X_test)

# Output
uids = test[uid]
# pd.concat(axis=1) aligns rows on index labels. Give the scores the same index
# as the ids so each score stays on its own row even when the index read from
# the CSV is not 0..n-1.
out = pd.concat([uids, pd.Series(np.asarray(scores), index=uids.index)], axis=1)
out[0] = out[0].apply(norm_score)
out.head()
out.to_csv('./model/predict.csv', header=False, index=False)

(21511, 404)

Unnamed: 0,申请编号,0
0,122687,0.052535
1,32425,0.078397
2,2024,0.185654
3,25019,0.186451
4,162532,0.125395


In [None]:
############## Ana

# Stand-alone re-run of the out-of-fold loop with 2 folds, for inspection.
# Unlike StackingModels.fit, every base model is trained on all columns of X.
base_models = [LGB, RF, XGB]
base_models_ = [[] for _ in base_models]
kfold = KFold(n_splits=2, shuffle=True, random_state=123)

# Column i receives the out-of-fold predictions of base model i.
out_of_fold_predictions = np.zeros((X.shape[0], len(base_models)))
for i, model in enumerate(base_models):
    for j, (train_index, valid_index) in enumerate(kfold.split(X, y)):
        instance = clone(model)
        base_models_[i].append(instance)
        # KFold.split yields positional indices, so select rows with .iloc;
        # label-based .loc is only equivalent when the index is exactly 0..n-1.
        instance.fit(X.iloc[train_index],  y.iloc[train_index])
        y_pred = instance.predict_proba(X.iloc[valid_index])[:,1]
        ks = ks_score(y.iloc[valid_index], y_pred)
        print(f'* KS({j}):{ks} *')
        out_of_fold_predictions[valid_index, i] = y_pred

In [None]:
# Inspect the out-of-fold matrix: number of rows, length of the first row
# (one entry per base model), and the first row itself.
len(out_of_fold_predictions)
len(out_of_fold_predictions[0])
out_of_fold_predictions[0]

In [None]:
# Fit a fresh clone of LR on the out-of-fold base-model predictions.
meta_model = LR
meta_model_ = clone(meta_model)

meta_model_.fit(out_of_fold_predictions, y)
# The KS below is computed on the same rows the meta model was just fitted on.
y_pred = meta_model_.predict_proba(out_of_fold_predictions)[:,1]
ks_score(y, y_pred)

In [None]:
# One meta feature per base model: the mean positive-class probability over
# that model's per-fold fitted copies, evaluated on all of X.
fold_means = []
for fold_models in base_models_:
    fold_probs = np.column_stack(
        [fitted.predict_proba(X)[:,1] for fitted in fold_models]
    )
    fold_means.append(fold_probs.mean(axis=1))
meta_features = np.column_stack(fold_means)

In [None]:
# Inspect meta_features: its length, the length of its first element, and the
# first element itself (the first row when meta_features is the numpy array
# built with np.column_stack).
len(meta_features)
len(meta_features[0])
meta_features[0]

In [None]:
# Show the nested list of fitted per-fold models (one inner list per base model).
base_models_

In [None]:
# DataFrame variant of the meta features: column k is the mean positive-class
# probability of base model k's per-fold fitted copies, evaluated on all of X.
meta_features = pd.DataFrame()
for model_idx, fold_models in enumerate(base_models_):
    fold_probs = pd.DataFrame()
    for fold_idx, fitted in enumerate(fold_models):
        fold_probs[fold_idx] = fitted.predict_proba(X)[:,1]
    meta_features[model_idx] = fold_probs.mean(axis=1)

In [None]:
# Dimensions and first rows of the meta-feature table.
meta_features.shape
meta_features.head()

In [None]:
# Compare candidate second-level learners on meta_features.
# Each estimator is fitted on (meta_features, y) and then scored on those same
# rows, so every KS below is an in-sample figure.
# RF, LGB and LR are the module-level estimator objects, so these .fit calls
# refit those very instances.
''' RF '''
RF.fit(meta_features, y)
ks_score(y, RF.predict_proba(meta_features)[:,1])

# Random forest with default constructor arguments, for reference.
''' RF1 '''
RF1 = RandomForestClassifier()
RF1.fit(meta_features, y)
ks_score(y, RF1.predict_proba(meta_features)[:,1])

''' LGB '''
LGB.fit(meta_features, y)
ks_score(y, LGB.predict_proba(meta_features)[:,1])

''' LR '''
LR.fit(meta_features, y)
ks_score(y, LR.predict_proba(meta_features)[:,1])

# Baseline without a meta model: plain row-wise mean of the meta features.
''' MEAN '''
# LR.fit(meta_features, y)
ks_score(y, meta_features.mean(axis=1))