In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
%matplotlib inline
import warnings;warnings.filterwarnings('ignore')
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats as spstats
from sklearn import metrics
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone
from sklearn.preprocessing import Imputer, LabelEncoder, PolynomialFeatures, StandardScaler, scale as skscale
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.externals import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



In [2]:
# Column holding the application id; used as the merge key and dropped before modelling.
uid = '申请编号'
# Column holding the label that the models are trained to predict.
target = '标签'

def calc_auc(y_test, y_proba):
    """Return the ROC AUC of ``y_proba`` against ``y_test``, rounded to 3 decimals.

    Both arguments are passed straight to ``metrics.roc_auc_score``.
    """
    return round(metrics.roc_auc_score(y_test, y_proba), 3)

def ks_score(y_test, y_proba):
    """Return the KS statistic of a binary scorer, rounded to 4 decimals.

    The statistic is the largest gap between the true-positive rate and the
    false-positive rate over all thresholds of the ROC curve, with label ``1``
    treated as the positive class.
    """
    n_digits = 4
    # The thresholds returned by roc_curve are not needed for the statistic.
    fpr, tpr, _ = metrics.roc_curve(y_test, y_proba, pos_label=1)
    return round((tpr - fpr).max(), n_digits)

In [3]:
############## Load

# Number of folds; read as the default `n_folds` of StackingModels defined further down
n_folds = 5

''' Load '''
# Training features (the "nona" file -- presumably missing values already filled; confirm upstream)
Xid = pd.read_csv('./tmp/train_d1234_nona.csv', header=0, index_col=0)#.head(10000)
Xid.shape

# Training labels
yid = pd.read_csv('./data/train_label.csv', header=0, index_col=0)#.head(10000)
yid.shape

''' Merge '''
# Attach the label to every feature row through the application id
xy = Xid.merge(yid, on=uid, how='inner')

# ''' Filter Bins ''' 
# uids = pd.read_csv('./tmp/train_uid0.csv', header=0, index_col=0)
# uids.columns = [uid]
# xy = pd.merge(uids, xy, on=uid, how='left')
# xy.shape

''' drop id'''
# The id is not a feature; build a new frame without it
xy = xy.drop(columns=[uid])
xy.shape

''' Split '''
# X, y
y = xy[target]
X = xy.drop(columns=[target])
X.shape
y.shape

''' *** With na *** '''

''' Load '''
# Same pipeline for the "na" feature file (presumably missing values kept; confirm upstream)
Xid1 = pd.read_csv('./tmp/train_d1234_na.csv', header=0, index_col=0)#.head(10000)
Xid1.shape

''' Merge '''
xy1 = Xid1.merge(yid, on=uid, how='inner').drop(columns=[uid])
xy1.shape

''' Split '''
# X, y
y1 = xy1[target]
X1 = xy1.drop(columns=[target])
X1.shape
y1.shape

' Load '

(140000, 812)

(140000, 1)

' Merge '

' drop id'

(140000, 812)

' Split '

(140000, 811)

(140000,)

' *** With na *** '

' Load '

(140000, 812)

' Merge '

(140000, 812)

' Split '

(140000, 811)

(140000,)

In [4]:
# Fixed (non-tuned) keyword arguments per model family. They are merged over
# the tuned parameters loaded from disk, so on a key clash these values win.

# RF
param_fixed_rf = dict(
    n_jobs=-1,
    oob_score=True,
    random_state=123,
    verbose=0,
)

# XGB
param_fixed_xgb = dict(
    n_jobs=-1,
    eval_metric='auc',
    seed=123,
    silent=1,
    verbose_eval=0,
)

# LGB
param_fixed_lgb = dict(
    n_jobs=-1,
    metric='auc',
    random_state=123,
    bagging_seed=123,
    feature_fraction_seed=123,
    verbose_eval=0,
)

# LR
param_fixed_lr = dict(
    n_jobs=-1,
    random_state=123,
    verbose=0,
)

In [35]:
class StackingModels (BaseEstimator, ClassifierMixin, TransformerMixin):
    """Two-level stacking ensemble built from K-fold out-of-fold predictions.

    Level 1: every model in ``base_models`` is trained on ``X_ttl`` and every
    model in ``base_models_na`` on ``Xna_ttl``, one clone per fold. The
    positive-class probabilities predicted on each held-out fold form one
    column per model.

    Level 2: pairwise interaction terms are added to those columns and one
    clone of ``meta_model`` per fold is trained on the result.

    Parameters
    ----------
    base_models : list of estimators trained on ``X_ttl``.
    base_models_na : list of estimators trained on ``Xna_ttl``.
    base_feats : list of column collections, one per model, ordered as
        ``base_models`` followed by ``base_models_na``. Only used when
        ``if_feats_select`` is true.
    meta_model : estimator trained on the level-1 predictions.
    if_feats_select : when ``True`` each model only sees its own columns from
        ``base_feats``; otherwise every model sees all columns.
    n_folds : number of K-fold splits (the default is the value of the
        module-level ``n_folds`` at class-definition time).
    """
    def __init__(self, base_models, base_models_na, base_feats, meta_model, if_feats_select, n_folds = n_folds):
        self.base_models = base_models
        self.base_models_na = base_models_na
        self.meta_model = meta_model
        self.if_feats_select = if_feats_select
        # base_feats is only kept when feature selection is switched on
        self.base_feats = []
        if self.if_feats_select == True:
            self.base_feats = base_feats
        self.n_folds = n_folds
        # Filled by fit() with the out-of-fold prediction matrix
        self.meta_X = []

    def _select_features(self, X_ttl, idx):
        """Return the columns of ``X_ttl`` seen by the model at position ``idx``."""
        if self.if_feats_select == True:
            # list(...) keeps the iteration order of the stored collection and
            # lets sets work on pandas versions that reject set indexers.
            return X_ttl[list(self.base_feats[idx])]
        return X_ttl

    def _fit_folds(self, model, X, y, kfold, fitted, oof, col):
        """Fit one clone of ``model`` per fold and store its held-out predictions.

        Each clone is appended to ``fitted``; the positive-class probabilities
        for the held-out rows are written into column ``col`` of ``oof`` and
        the fold KS score is printed.
        """
        for train_index, valid_index in kfold.split(X, y):
            instance = clone(model)
            fitted.append(instance)
            instance.fit(X.iloc[train_index],  y.iloc[train_index])
            y_pred = instance.predict_proba(X.iloc[valid_index])[:,1]
            print(ks_score(y.iloc[valid_index], y_pred))
            oof[valid_index, col] = y_pred

    @staticmethod
    def _mean_proba(models_kfold, X):
        """Average the positive-class probability of the fold models on ``X``."""
        prob_kfold = pd.DataFrame(
            {j: model.predict_proba(X)[:,1] for j, model in enumerate(models_kfold)}
        )
        return prob_kfold.mean(axis=1)

    def fit(self, X_ttl, Xna_ttl, y):
        """Train all base models fold by fold, then the meta models.

        Parameters
        ----------
        X_ttl : DataFrame fed to ``base_models``.
        Xna_ttl : DataFrame fed to ``base_models_na``; rows must correspond
            one-to-one, in order, to those of ``X_ttl``.
        y : pandas Series of labels (it is indexed with ``.iloc``).

        Returns
        -------
        self
        """
        self.base_models_ = [list() for _ in self.base_models]
        self.base_models_na_ = [list() for _ in self.base_models_na]
        # A fixed integer seed makes every split() call yield the same folds
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=123)

        # One out-of-fold column per model: base models first, NA models after
        n_base = len(self.base_models)
        out_of_fold_predictions = np.zeros((X_ttl.shape[0], n_base + len(self.base_models_na)))

        # Feature without Null
        for i, model in enumerate(self.base_models):
            X = self._select_features(X_ttl, i)
            l = len(X.columns) # Feature count
            print(f'*** Base Model: {model.__class__}, Features: {l} ***')
            self._fit_folds(model, X, y, kfold, self.base_models_[i], out_of_fold_predictions, i)

        # Feature with Null
        for k, model in enumerate(self.base_models_na):
            # Computed rather than carried over from the loop above, so this
            # also works when base_models is empty.
            col = n_base + k
            X = self._select_features(Xna_ttl, col)
            l = len(X.columns) # Feature count
            print(f'*** Base Model Na: {model.__class__}, Features: {l} ***')
            self._fit_folds(model, X, y, kfold, self.base_models_na_[k], out_of_fold_predictions, col)

        # META
        self.meta_X = out_of_fold_predictions
        # Pairwise interaction terms; kept so predict() applies the same transform
        self.poly_ = PolynomialFeatures(2, interaction_only=True, include_bias=False)
        X = self.poly_.fit_transform(out_of_fold_predictions)
        l = X.shape[1] # Feature Count
        print(f'*** Meta Model: {self.meta_model.__class__}, Features: {l} ***')
        self.meta_models_ = []
        for train_index, valid_index in kfold.split(X, y):
            instance = clone(self.meta_model)
            self.meta_models_.append(instance)
            # Fold indices are positions, so y needs .iloc; plain y[...] is a
            # label lookup and only agrees when the index is 0..n-1.
            instance.fit(X[train_index],  y.iloc[train_index])
            y_pred = instance.predict_proba(X[valid_index])[:,1]
            print(ks_score(y.iloc[valid_index], y_pred))
        return self

    # Predict
    def predict(self, X_ttl, Xna_ttl, if_meta=True):
        """Score new rows with the fitted ensemble.

        Parameters
        ----------
        X_ttl : DataFrame for the models fitted from ``base_models``.
        Xna_ttl : DataFrame for the models fitted from ``base_models_na``.
        if_meta : when true, return the fold-averaged meta-model probability;
            otherwise return the plain mean of the base-model probabilities.

        Returns
        -------
        pandas Series of positive-class probabilities (not class labels) with
        a 0..n-1 index, in the row order of the inputs.
        """
        # Features for Meta Model: fold-averaged probability per base model
        n_base = len(self.base_models_)
        fold_means = {}
        for i, models_kfold in enumerate(self.base_models_):
            fold_means[i] = self._mean_proba(models_kfold, self._select_features(X_ttl, i))
        # Feature with Null
        for k, models_kfold in enumerate(self.base_models_na_):
            col = n_base + k
            fold_means[col] = self._mean_proba(models_kfold, self._select_features(Xna_ttl, col))
        meta_features = pd.DataFrame(fold_means)

        if not if_meta:
            return meta_features.mean(axis=1)

        # Same interaction terms as in fit(), without refitting the transformer
        X = self.poly_.transform(meta_features.values)
        return self._mean_proba(self.meta_models_, X)
    
############ Stacking

### Base models

# Base Featues
''' Load Features '''
# NOTE: allow_pickle=True unpickles arbitrary Python objects -- only load files you trust.
feats = np.load('./model/base_features.npy', allow_pickle=True).item()
for k, v in feats.items():
    print(f'{k}:{len(v)}')

# RF = RandomForestClassifier()
# XGB = XGBClassifier()
# LGB = LGBMClassifier()
# LR = LogisticRegression()

# For every model: tuned parameters come from disk and the fixed parameters
# are laid on top, so the fixed values win on a key clash.

# RF
best_params_load = np.load('./model/base_rf.npy', allow_pickle=True).item()
model_params = dict(best_params_load, **param_fixed_rf)
RF = RandomForestClassifier(**model_params)
# XGB
best_params_load = np.load('./model/base_xgb.npy', allow_pickle=True).item()
model_params = dict(best_params_load, **param_fixed_xgb)
XGB = XGBClassifier(**model_params)
# LGB
best_params_load = np.load('./model/base_lgb.npy', allow_pickle=True).item()
model_params = dict(best_params_load, **param_fixed_lgb)
LGB = LGBMClassifier(**model_params)
# Unfitted copy with the same parameters, used on the feature set that keeps NAs
LGBNA = clone(LGB)
# LR
best_params_load = np.load('./model/base_lr.npy', allow_pickle=True).item()
model_params = dict(best_params_load, **param_fixed_lr)
LR = LogisticRegression(**model_params)
# Mean
MEAN = 'MEAN'

### Stacking
base_models = [LGB, RF, XGB, LR]
base_models_na = [LGBNA]
# One feature list per model, ordered as base_models followed by base_models_na
base_feats = [feats['lgb'], feats['rf'], feats['xgb'], feats['corr'], feats['lgbna']]
# base_models = [LR]
# base_feats = [feats['corr'], []]

meta_model = LR
CLF = StackingModels(
    base_models=base_models,
    base_models_na=base_models_na,
    base_feats=base_feats,
    meta_model=meta_model,
    if_feats_select=True,
)

# Fit
CLF.fit(X, X1, y)

# # Predict
# pred = CLF.predict(X)
# ks_score(y, pred)

# # Save meta_X
# meta_X = pd.DataFrame(CLF.meta_X, columns=['lgb', 'rf', 'xgb', 'lgbna'])
# meta_X.shape
# meta_X.head()
# meta_X.to_csv('./model/x_meta.csv')

' Load Features '

corr:54
rf:198
xgb:811
lgb:811
lgbna:811
*** Base Model: <class 'lightgbm.sklearn.LGBMClassifier'>, Features: 811 ***
0.3688
0.3701
0.3493
0.364
0.3573
*** Base Model: <class 'sklearn.ensemble.forest.RandomForestClassifier'>, Features: 198 ***
0.3182
0.3103
0.3138
0.313
0.3123
*** Base Model: <class 'xgboost.sklearn.XGBClassifier'>, Features: 811 ***
0.3636
0.3598
0.3417
0.3618
0.3502
*** Base Model: <class 'sklearn.linear_model.logistic.LogisticRegression'>, Features: 54 ***
0.256
0.2493
0.2527
0.249
0.2467
*** Base Model Na: <class 'lightgbm.sklearn.LGBMClassifier'>, Features: 811 ***
0.3693
0.3667
0.3531
0.3647
0.3587
*** Meta Model: <class 'sklearn.linear_model.logistic.LogisticRegression'>, Features: 15 ***
0.372
0.3727
0.3558
0.371
0.3611


StackingModels(base_feats=[{'25岁以下', '25岁到30岁', '30岁到40岁', '40岁到50岁', '50岁以上',
                            'is_odue30sumsumsum', 'is_odue60sumsumsum',
                            'is_odue90sumsumsum', 'is_oduesumsumsum',
                            'time_diffmeanmaxmax', 'time_diffmeanmeanmean',
                            '一月内换过手机号', '产品类型_0mean_x', '产品类型_0mean_y',
                            '产品类型_0sum_x', '产品类型_0sum_y', '产品类型_1mean_x',
                            '产品类型_1mean_y', '产品类型_1sum_x', '产品类型_1sum_y',
                            '产品类型_2mean_x', '产品类型_...
                                              random_state=123, reg_alpha=0.0,
                                              reg_lambda=0.0, silent=True,
                                              subsample=0.8, ...)],
               if_feats_select=True,
               meta_model=LogisticRegression(C=0.1, class_weight=1, dual=False,
                                             fit_intercept=True,
                         

In [37]:
############## Test

def norm_score(score):
    """Clamp ``score`` to the closed interval [0, 1].

    Values below 0 become 0, values above 1 become 1, and anything else is
    returned unchanged.
    """
    if score < 0:
        return 0
    if score > 1:
        return 1
    return score

# Test-set features; the id column stays in `test` so it can be written next to the scores
test = pd.read_csv('./tmp/A_d1234_nona.csv', header=0, index_col=0)
X_test = test.drop(uid, axis=1)
X_test.shape
test_na = pd.read_csv('./tmp/A_d1234_na.csv', header=0, index_col=0)
X_test_na = test_na.drop(uid, axis=1)
X_test_na.shape

# Predict
scores = CLF.predict(X_test, X_test_na)

# Output
uids = test[uid]
# CLF.predict returns scores in row order with a 0..n-1 index, while `uids`
# carries the index read from the CSV. concat(axis=1) aligns on index labels,
# so attach the scores to the ids by position first; otherwise a CSV index
# that is not 0..n-1 would pair scores with the wrong ids or produce NaN.
scores_by_row = pd.Series(np.asarray(scores), index=uids.index)
out = pd.concat([uids, scores_by_row], axis=1)
# The unnamed score series becomes column 0; clamp it to [0, 1]
out[0] = out[0].apply(norm_score)
out.head()
out.to_csv('./model/predict.csv', header=False, index=False)

(21469, 811)

(21469, 811)

Unnamed: 0,申请编号,0
0,4,0.240715
1,6,0.585614
2,16,0.068014
3,17,0.122484
4,38,0.043567


In [7]:
# # ############## Predict Train

# def norm_score(score):
#     if score < 0:
#         score = 0
#     elif score > 1:
#         score = 1
#     return score

# # Predict
# scores = CLF.predict(Xid)

# # Output
# uids = Xid[uid]
# out = pd.concat([uids, pd.Series(scores)], axis=1)
# out[0] = out[0].apply(norm_score)
# out.head()
# out.to_csv('./model/predict_train.csv')

In [8]:
# ####### Distributions

# # Final
# sns.distplot(pred)
# sns.distplot(scores)
# plt.show()

# meta_X = CLF.meta_X
# meta_X.shape
# meta_X[0]

# # Base
# for i in range(3):
#     i
#     sns.distplot(meta_X[:,i])

In [9]:
# ############## Ana

# base_models = [LGB, RF, XGB]
# base_models_ = [list() for x in base_models]
# kfold = KFold(n_splits=2, shuffle=True, random_state=123)

# out_of_fold_predictions = np.zeros((X.shape[0], len(base_models)))
# for i, model in enumerate(base_models):
#     j = 0
#     for train_index, valid_index in kfold.split(X, y):
#         instance = clone(model)
#         base_models_[i].append(instance)
#         instance.fit(X.loc[train_index],  y.loc[train_index])
#         len(X.loc[train_index])
#         y_pred = instance.predict_proba(X.loc[valid_index])[:,1]
#         ks = ks_score(y.loc[valid_index], y_pred)
#         print(f'* KS({j}):{ks} *')
#         out_of_fold_predictions[valid_index, i] = y_pred
#         j += 1

In [10]:
# len(out_of_fold_predictions)
# len(out_of_fold_predictions[0])
# out_of_fold_predictions[0]

In [11]:
# meta_model = LR
# meta_model_ = clone(meta_model)

# meta_model_.fit(out_of_fold_predictions, y)
# y_pred = meta_model_.predict_proba(out_of_fold_predictions)[:,1]
# ks_score(y, y_pred)

In [12]:
# meta_features = np.column_stack ([
#     np.column_stack(
#         [model.predict_proba(X)[:,1] for model in base_models]
#     ).mean (axis=1)
#     for base_models in base_models_])

In [13]:
# len(meta_features)
# len(meta_features[0])
# meta_features[0]

In [14]:
# meta_features = pd.DataFrame()
# for i, models_kfold in enumerate(base_models_):
#     i
#     prob_kfold = pd.DataFrame()
#     for j, model in enumerate(models_kfold):
#         j
#         prob_kfold[j] = model.predict_proba(X)[:,1]
#         prob_kfold.shape
#     prob_kfold.head()
#     meta_features[i] = prob_kfold.mean(axis=1)

In [15]:
# meta_features.shape
# meta_features.head()

In [16]:
# ''' RF '''
# RF.fit(meta_features, y)
# ks_score(y, RF.predict_proba(meta_features)[:,1])

# ''' RF1 '''
# RF1 = RandomForestClassifier()
# RF1.fit(meta_features, y)
# ks_score(y, RF1.predict_proba(meta_features)[:,1])

# ''' LGB '''
# LGB.fit(meta_features, y)
# ks_score(y, LGB.predict_proba(meta_features)[:,1])

# ''' LR '''
# LR.fit(meta_features, y)
# ks_score(y, LR.predict_proba(meta_features)[:,1])

# ''' MEAN '''
# # LR.fit(meta_features, y)
# ks_score(y, meta_features.mean(axis=1))