# 1 导包读取数据

In [1]:
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns

# modelling
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score,cross_val_predict,KFold
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,StandardScaler

In [2]:
df = pd.read_csv("./data/datapandaszhibiao.csv")
df

Unnamed: 0,Default,age,card_age,cashAmt_mean,cashAmt_non_null_months,cashCnt_mean,cashCnt_non_null_months,cashTotalAmt,cashTotalCnt,inCourt,...,sex_2,CityId_1,CityId_2,CityId_3,trans_total,total_withdraw,avg_per_withdraw,avg_per_online_spend,avg_per_public_spend,bad_record
0,0,38,2,0.000000,0,0.000000,0,0,0,0,...,1,1,0,0,0.000000,0.000000,0.000000,-3855.000000,0.00,0
1,0,39,19,0.000000,0,0.000000,0,0,0,0,...,0,1,0,0,180.000000,0.000000,0.000000,233.333333,0.00,0
2,0,40,16,0.000000,0,0.000000,0,0,0,0,...,1,0,1,0,792.962962,0.000000,0.000000,-4219.487179,385.00,0
3,0,38,13,22000.000000,1,6.000000,1,22000,6,0,...,0,0,0,1,42012.000000,132000.000000,3666.666667,1808.571429,0.00,0
4,0,24,8,0.000000,0,0.000000,0,0,0,0,...,0,0,1,0,1515.600000,0.000000,0.000000,-350.666667,521.25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,0,30,12,0.000000,0,0.000000,0,0,0,0,...,0,1,0,0,7340.937500,0.000000,0.000000,1639.000000,0.00,0
421,0,40,47,0.000000,0,0.000000,0,0,0,0,...,1,0,1,0,0.000000,0.000000,0.000000,0.000000,0.00,0
422,0,30,70,100.000000,1,1.000000,1,100,1,0,...,0,0,0,1,0.000000,100.000000,100.000000,0.000000,0.00,0
423,0,28,69,0.000000,0,0.000000,0,0,0,0,...,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.00,0


# 2 划分训练集测试集

In [3]:
#Splitting the dataset
def Split_data(df):
    training_size = int(len(df)*0.90)
    data_len = len(df)
    train, test = df[0:training_size],df[training_size:data_len] 
    return train, test
#Splitting the training and test datasets 
df_train, df_test = Split_data(df)

# 3 训练模型 

In [4]:
from sklearn.metrics import make_scorer
# metric for evaluation
def rmse(y_true, y_pred):
    diff = y_pred - y_true
    sum_sq = sum(diff**2)    
    n = len(y_pred)   
    return np.sqrt(sum_sq/n)

def mse(y_ture,y_pred):
    return mean_squared_error(y_ture,y_pred)

# scorer to be used in sklearn model fitting
rmse_scorer = make_scorer(rmse, greater_is_better=False)
mse_scorer = make_scorer(mse, greater_is_better=False)

In [5]:
features_columns = [col for col in df.columns if col not in ['Default']]
df_train_X = df_train[features_columns].values
df_train_y =df_train['Default'].values

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df_train_X, df_train_y, test_size=0.7, random_state=0)

In [7]:
# copy the train data
def copy_trainning_data():
    X1=X_train.copy()
    y1=y_train.copy()
    return X1,y1


from sklearn.preprocessing import StandardScaler

# function to  train the model
def train_model(model, param_grid=[], X=[], y=[], 
                splits=5, repeats=5):

    # get unmodified training data, unless data to use already specified
    if len(y)==0:
        X,y = copy_trainning_data()
        #poly_trans=PolynomialFeatures(degree=2)
        #X=poly_trans.fit_transform(X)
        #X=MinMaxScaler().fit_transform(X)
    
    # create cross-validation method
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)
    
    # perform a grid search if param_grid given
    if len(param_grid)>0:
        # setup grid search parameters
        gsearch = GridSearchCV(model, param_grid, cv=rkfold,
                               scoring="neg_mean_squared_error",
                               verbose=1, return_train_score=True)

        # search the grid
        gsearch.fit(X,y)

        # extract best model from the grid
        model = gsearch.best_estimator_  # 最优模型      
        best_idx = gsearch.best_index_ # 最优超参数组合的索引值

        # get cv-scores for best model
        grid_results = pd.DataFrame(gsearch.cv_results_) # 网格搜索过程中每个超参数组合的详细统计数据，例如训练时间、验证得分等。      
        cv_mean = abs(grid_results.loc[best_idx,'mean_test_score']) # 平均测试得分
        cv_std = grid_results.loc[best_idx,'std_test_score'] # 测试得分标准差

    # no grid search, just cross-val score for given model    
    else:
        grid_results = []
        cv_results = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)
    
    # combine mean and std cv-score in to a pandas series
    cv_score = pd.Series({'mean':cv_mean,'std':cv_std}) # return cv_score

    # predict y using the fitted model
    y_pred = model.predict(X)
    
    # print stats on model performance         
    print('----------------------')
    print(model)
    print('----------------------')
    print('score=',model.score(X,y))
    print('rmse=',rmse(y, y_pred))
    print('mse=',mse(y, y_pred))
    print('cross_val: mean=',cv_mean,', std=',cv_std)
    
    '''  
    # 误差
    # residual plots
    y_pred = pd.Series(y_pred,index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid)/std_resid    
    n_outliers = sum(abs(z)>3)
    
       
    plt.figure(figsize=(15,5))
    ax_131 = plt.subplot(1,3,1)
    plt.plot(y,y_pred,'.')
    plt.xlabel('y')
    plt.ylabel('y_pred');
    plt.title('corr = {:.3f}'.format(np.corrcoef(y,y_pred)[0][1]))
    ax_132=plt.subplot(1,3,2)
    plt.plot(y,y-y_pred,'.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred');
    plt.title('std resid = {:.3f}'.format(std_resid))
    
    ax_133=plt.subplot(1,3,3)
    z.plot.hist(bins=50,ax=ax_133)
    plt.xlabel('z')
    plt.title('{:.0f} samples with z>3'.format(n_outliers))
'''

    return model, cv_score, grid_results

In [8]:
# places to store optimal models and scores
opt_models = dict()
score_models = pd.DataFrame(columns=['mean','std'])

# no. k-fold splits
splits=5
# no. k-fold iterations
repeats=5

In [9]:
model = 'Ridge'

opt_models[model] = Ridge()
alph_range = np.arange(0.25,6,0.25) # arange生成0.25到6之间以0.25为步长的值
param_grid = {'alpha': alph_range}

opt_models[model],cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid, 
                                              splits=splits, repeats=repeats)

cv_score.name = model
score_models = score_models.append(cv_score)

'''
# 用于表现有一定置信区间的带误差数据
plt.figure() 
plt.errorbar(alph_range, abs(grid_results['mean_test_score']),
             abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('alpha')
plt.ylabel('score')
'''


Fitting 25 folds for each of 23 candidates, totalling 575 fits


----------------------
Ridge()
----------------------
score= 0.9400849157433285
rmse= 0.09309838784104316
mse= 0.008667309818601296
cross_val: mean= 0.0857424560908498 , std= 0.08626728022588105


"\n# 用于表现有一定置信区间的带误差数据\nplt.figure() \nplt.errorbar(alph_range, abs(grid_results['mean_test_score']),\n             abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))\nplt.xlabel('alpha')\nplt.ylabel('score')\n"

In [10]:
model = 'Lasso'

opt_models[model] = Lasso()
alph_range = np.arange(1e-4,1e-3,4e-5)
param_grid = {'alpha': alph_range}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid, 
                                              splits=splits, repeats=repeats)

cv_score.name = model
score_models = score_models.append(cv_score)
'''
plt.figure()
plt.errorbar(alph_range, abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('alpha')
plt.ylabel('score')
'''


Fitting 25 folds for each of 23 candidates, totalling 575 fits
----------------------
Lasso(alpha=0.0009800000000000002)
----------------------
score= 0.9422180303504408
rmse= 0.09142611050876945
mse= 0.008358733682761722
cross_val: mean= 0.05352218983974719 , std= 0.04974022262207061


"\nplt.figure()\nplt.errorbar(alph_range, abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))\nplt.xlabel('alpha')\nplt.ylabel('score')\n"

In [11]:
model ='ElasticNet'
opt_models[model] = ElasticNet()

param_grid = {'alpha': np.arange(1e-4,1e-3,1e-4),
              'l1_ratio': np.arange(0.1,1.0,0.1),
              'max_iter':[100000]}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid, 
                                              splits=splits, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
----------------------
ElasticNet(alpha=0.0009000000000000001, l1_ratio=0.9, max_iter=100000)
----------------------
score= 0.9426598311836597
rmse= 0.09107591789524638
mse= 0.00829482282046166
cross_val: mean= 0.038553304586297424 , std= 0.02616136394906859


In [12]:
'''
model='LinearSVR'
opt_models[model] = LinearSVR()

crange = np.arange(0.1,1.0,0.1)
param_grid = {'C':crange,
             'max_iter':[1000]}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid, 
                                              splits=splits, repeats=repeats)

cv_score.name = model
score_models = score_models.append(cv_score)

plt.figure()
plt.errorbar(crange, abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('C')
plt.ylabel('score')
'''

"\nmodel='LinearSVR'\nopt_models[model] = LinearSVR()\n\ncrange = np.arange(0.1,1.0,0.1)\nparam_grid = {'C':crange,\n             'max_iter':[1000]}\n\nopt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid, \n                                              splits=splits, repeats=repeats)\n\n\ncv_score.name = model\nscore_models = score_models.append(cv_score)\n\n\nplt.figure()\nplt.errorbar(crange, abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))\nplt.xlabel('C')\nplt.ylabel('score')\n"

In [13]:
model = 'KNeighbors'
opt_models[model] = KNeighborsRegressor()

param_grid = {'n_neighbors':np.arange(3,11,1)}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid, 
                                              splits=splits, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)
''' 
plt.figure()
plt.errorbar(np.arange(3,11,1), abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*1))
plt.xlabel('n_neighbors')
plt.ylabel('score')
'''

Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------------------
KNeighborsRegressor(n_neighbors=8)
----------------------
score= 0.10463763297872342
rmse= 0.3598930762265224
mse= 0.12952302631578946
cross_val: mean= 0.15515069169960474 , std= 0.06684715280664362


" \nplt.figure()\nplt.errorbar(np.arange(3,11,1), abs(grid_results['mean_test_score']),abs(grid_results['std_test_score'])/np.sqrt(splits*1))\nplt.xlabel('n_neighbors')\nplt.ylabel('score')\n"

In [14]:
model = 'GradientBoosting'
opt_models[model] = GradientBoostingRegressor()

param_grid = {'n_estimators':[150,250,350],
              'max_depth':[1,2,3],
              'min_samples_split':[5,6,7]}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid, 
                                              splits=splits, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------------------
GradientBoostingRegressor(max_depth=1, min_samples_split=6, n_estimators=150)
----------------------
score= 0.945909218569796
rmse= 0.08845771182448547
mse= 0.007824766781223721
cross_val: mean= 0.026355373216338802 , std= 0.018521933458605495


In [15]:
model = 'XGB'
opt_models[model] = XGBRegressor()

param_grid = {'n_estimators':[100,200,300,400,500],
              'max_depth':[1,2,3],
             }

opt_models[model], cv_score,grid_results = train_model(opt_models[model], param_grid=param_grid, 
                                              splits=splits, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
----------------------
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=1, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)
----------------------
score= 0.9579453169921438
rmse= 0.07799760280287793
mse= 0.006083626042995508
cross_val: mean= 0.0274748581137714 , std= 0.017328297778460715


In [16]:
model = 'RandomForest'
opt_models[model] = RandomForestRegressor()

param_grid = {'n_estimators':[100,150,200],
              'max_features':[8,12,16,20,24],
              'min_samples_split':[2,4,6]}

opt_models[model], cv_score, grid_results = train_model(opt_models[model], param_grid=param_grid, 
                                              splits=5, repeats=1)

cv_score.name = model
score_models = score_models.append(cv_score)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
----------------------
RandomForestRegressor(max_features=24, min_samples_split=6, n_estimators=200)
----------------------
score= 0.9388747568728546
rmse= 0.09403388364926685
mse= 0.008842371274163853
cross_val: mean= 0.02617793846808731 , std= 0.014079154261490636


In [17]:
def model_predict(test_data,test_y=[],stack=False):
    #poly_trans=PolynomialFeatures(degree=2)
    #test_data1=poly_trans.fit_transform(test_data)
    #test_data=MinMaxScaler().fit_transform(test_data)
    i=0
    y_predict_total=np.zeros((test_data.shape[0],))
    if stack:
        for model in opt_models.keys():
            y_predict=opt_models[model].predict(test_data)
            y_predict_total+=y_predict
            i+=1
            if len(test_y)>0:
                print("{}_mse:".format(model),mean_squared_error(y_predict,test_y)) # 每个基模型的均方误差
        y_predict_mean=np.round(y_predict_total/i,3) # 所有模型预测结果的平均值
        if len(test_y)>0:
            print("mean_mse:",mean_squared_error(y_predict_mean,test_y))
        else:
            y_metal_mean=pd.Series(y_predict_mean)
            return y_metal_mean 
    else:
        for model in opt_models.keys():
            if model!="LinearSVR" and model!="KNeighbors":
                y_predict=opt_models[model].predict(test_data)
                y_predict_total+=y_predict
                i+=1
            if len(test_y)>0:
                print("{}_mse:".format(model),mean_squared_error(y_predict,test_y))
        y_predict_mean=np.round(y_predict_total/i,3)
        if len(test_y)>0:
            print("mean_mse:",mean_squared_error(y_predict_mean,test_y))
        else:
            y_predict_mean=pd.Series(y_predict_mean)
            return y_predict_mean

In [18]:
model_predict(X_test,y_test)

Ridge_mse: 0.029472253778305948
Lasso_mse: 0.024796127390697847
ElasticNet_mse: 0.025058909048249357
KNeighbors_mse: 0.025058909048249357
GradientBoosting_mse: 0.021194475645041948
XGB_mse: 0.021220163054524014
RandomForest_mse: 0.021479575532643493
mean_mse: 0.020518070895522385


# 4 测试集测试

In [20]:
df_test_X = df_test[features_columns].values
df_test_y =df_test['Default'].values

In [21]:
result = model_predict(df_test_X,df_test_y,stack=True)

Ridge_mse: 0.2790389516694681
Lasso_mse: 0.2958808398941324
ElasticNet_mse: 0.29536873008424697
KNeighbors_mse: 0.24781976744186046
GradientBoosting_mse: 0.2989346256279501
XGB_mse: 0.2995590910073359
RandomForest_mse: 0.24958923121747642
mean_mse: 0.26023758139534886
