In [1]:
pip install heamy

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install fancyimpute




In [3]:
import pandas as pd
import numpy as np
import warnings
import os
warnings.filterwarnings('ignore')

In [4]:
# import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
"""
sns 相关设置
@return:
"""
# 声明使用 Seaborn 样式
sns.set()
# 有五种seaborn的绘图风格，它们分别是：darkgrid, whitegrid, dark, white, ticks。默认的主题是darkgrid。
sns.set_style("whitegrid")
# 有四个预置的环境，按大小从小到大排列分别为：paper, notebook, talk, poster。其中，notebook是默认的。
sns.set_context('talk')
# 中文字体设置-黑体
plt.rcParams['font.sans-serif'] = ['SimHei']
# 解决保存图像是负号'-'显示为方块的问题
plt.rcParams['axes.unicode_minus'] = False
# 解决Seaborn中文显示问题并调整字体大小
sns.set(font='SimHei')

In [5]:
# reduce_mem_usage 函数通过调整数据类型，帮助我们减少数据在内存中占用的空间
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum()  / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum()  / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [6]:
train = pd.read_csv('./data_train_final.csv',encoding='gbk')
testA = pd.read_csv('./data_test_final.csv',encoding='gbk')

In [7]:
train["sample"] = "train"
testA["sample"] = "test"

In [8]:
# result = pd.merge(train, testA, on='id', how='left')
# result.to_csv('output.csv', index=False)

In [9]:
# data =  data.dropna(axis=1, how="all")

In [10]:
data = pd.concat([train, testA], axis=0, ignore_index=True)

In [11]:
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50


In [12]:
data.isnull().any(axis = 0)

id                             False
loanAmnt                       False
term                           False
interestRate                   False
installment                    False
grade                          False
subGrade                       False
employmentTitle                False
employmentLength               False
homeOwnership                  False
annualIncome                   False
verificationStatus             False
issueDate                      False
isDefault                       True
purpose                        False
postCode                       False
regionCode                     False
dti                            False
delinquency_2years             False
ficoRangeLow                   False
ficoRangeHigh                  False
openAcc                        False
pubRec                         False
pubRecBankruptcies             False
revolBal                       False
revolUtil                      False
totalAcc                       False
i

In [13]:
# 判断各变量中是否存在缺失值
data.isnull().any(axis = 0)
# 各变量中缺失值的数量
data.isnull().sum(axis = 0)
# 各变量中缺失值的比例
data.isnull().sum(axis = 0)/data.shape[0]


id                             0.000000
loanAmnt                       0.000000
term                           0.000000
interestRate                   0.000000
installment                    0.000000
grade                          0.000000
subGrade                       0.000000
employmentTitle                0.000000
employmentLength               0.000000
homeOwnership                  0.000000
annualIncome                   0.000000
verificationStatus             0.000000
issueDate                      0.000000
isDefault                      0.246081
purpose                        0.000000
postCode                       0.000000
regionCode                     0.000000
dti                            0.000000
delinquency_2years             0.000000
ficoRangeLow                   0.000000
ficoRangeHigh                  0.000000
openAcc                        0.000000
pubRec                         0.000000
pubRecBankruptcies             0.000000
revolBal                       0.000000


In [14]:
# 各变量中缺失值的数量
data.isnull().sum(axis = 0)

id                                  0
loanAmnt                            0
term                                0
interestRate                        0
installment                         0
grade                               0
subGrade                            0
employmentTitle                     0
employmentLength                    0
homeOwnership                       0
annualIncome                        0
verificationStatus                  0
issueDate                           0
isDefault                      200000
purpose                             0
postCode                            0
regionCode                          0
dti                                 0
delinquency_2years                  0
ficoRangeLow                        0
ficoRangeHigh                       0
openAcc                             0
pubRec                              0
pubRecBankruptcies                  0
revolBal                            0
revolUtil                           0
totalAcc    

In [15]:
# 各变量中缺失值的比例
data.isnull().sum(axis = 0)/data.shape[0]


id                             0.000000
loanAmnt                       0.000000
term                           0.000000
interestRate                   0.000000
installment                    0.000000
grade                          0.000000
subGrade                       0.000000
employmentTitle                0.000000
employmentLength               0.000000
homeOwnership                  0.000000
annualIncome                   0.000000
verificationStatus             0.000000
issueDate                      0.000000
isDefault                      0.246081
purpose                        0.000000
postCode                       0.000000
regionCode                     0.000000
dti                            0.000000
delinquency_2years             0.000000
ficoRangeLow                   0.000000
ficoRangeHigh                  0.000000
openAcc                        0.000000
pubRec                         0.000000
pubRecBankruptcies             0.000000
revolBal                       0.000000


In [16]:
# data[cat_features] = data[cat_features].fillna('NaN')
# data[cat_features] = data[cat_features].fillna('NaN')

In [17]:
data.fillna(data.mode(),inplace=True) # 填充众数,该数据缺失太多众数出现为nan的情况

In [18]:
"""读取数据"""
df_data = data
df_data = reduce_mem_usage(df_data)

Memory usage of dataframe is 706.88 MB
Memory usage after optimization is: 193.78 MB
Decreased by 72.6%


In [19]:
"""建立模型：【模型参数：xgb-->鱼佬baseline，lgb --> 贝叶斯调参】"""
from sklearn import metrics
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor 

def xgb_model(X_train, y_train, X_test, y_test=None):
    
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    train_matrix = xgb.DMatrix(X_train_split , label=y_train_split, enable_categorical=True)
    valid_matrix = xgb.DMatrix(X_val , label=y_val, enable_categorical=True)
    test_matrix = xgb.DMatrix(X_test, enable_categorical=True)

    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.04,
        'tree_method': 'exact',
        'seed': 2020,
        'n_jobs': -1,
        "silent": True,
        "gpu_id": 0,
    }
    watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
    
    model = xgb.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
    """计算在验证集上的得分"""
    val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后xgboost单模型在验证集上的AUC：{}'.format(roc_auc))
    """对测试集进行预测"""
    test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
    
    return test_pred
    

def lgb_model(X_train, y_train, X_test, y_test=None):
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)
    
    # 调参后的最优参数
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'min_child_weight': 0.32,
        'num_leaves': 14,
        'max_depth': 4,
        'feature_fraction': 0.81,
        'bagging_fraction': 0.61,
        'bagging_freq': 9,
        'min_data_in_leaf': 13,
        'min_split_gain': 0.27,
        'reg_alpha': 9.58,
        'reg_lambda': 4.62,
        'seed': 2020,
        'n_jobs':-1,
        'silent': True,
        'verbose': -1,
    }
    
    model = lgb.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=500, early_stopping_rounds=500)
    """计算在验证集上的得分"""
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后lightgbm单模型在验证集上的AUC：{}'.format(roc_auc))
    """对测试集进行预测"""
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    
    return test_pred



def catboost_model(X_train, y_train, X_test, y_test=None):
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    model = CatBoostRegressor(
        iterations=50000,
        learning_rate=0.01,
        l2_leaf_reg=3,
        depth=6,
        loss_function='RMSE',
        eval_metric='RMSE',
        random_seed=2022,
        od_type="Iter",
        od_wait=500,
        verbose=500,
    )

    model.fit(
        X_train_split,
        y_train_split,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=500,
        verbose=500,
    )

    """计算在验证集上的得分"""
    val_pred = model.predict(X_val)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后CatBoost单模型在验证集上的AUC：{}'.format(roc_auc))
    """对测试集进行预测"""
    test_pred = model.predict(X_test)

    return test_pred


In [20]:
# df_data.info(verbose=True)

In [21]:
#随机从rs数据集中抽取1%的行数据
df_data = df_data.sample(frac=0.01,axis=0)


In [22]:
"""对训练集数据进行划分，分成训练集和验证集，并进行相应的操作"""
from sklearn.model_selection import train_test_split

"""数据集设置"""
X_train = df_data.loc[df_data['sample']=='train', :].drop(['id','issueDate','isDefault', 'sample'], axis=1)
X_test = df_data.loc[df_data['sample']=='test', :].drop(['id','issueDate','isDefault', 'sample'], axis=1)
y_train = df_data.loc[df_data['sample']=='train', 'isDefault']
# 数据集划分
# X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)


In [23]:
cat_cols = X_train.select_dtypes(include=['category']).columns.tolist()
print(len(cat_cols))

0


In [24]:
from heamy.dataset import Dataset
from heamy.estimator import Classifier

model_dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test)
model_xgb = Classifier(dataset=model_dataset, estimator=xgb_model, name='xgb', use_cache=False)
model_lgb = Classifier(dataset=model_dataset, estimator=lgb_model, name='lgb', use_cache=False)
model_cat = Classifier(dataset=model_dataset, estimator=catboost_model, name='cat', use_cache=False)

In [25]:
round(X_train.isnull().sum() / X_train.shape[0] *100.00,2)

loanAmnt                         0.0
term                             0.0
interestRate                     0.0
installment                      0.0
grade                            0.0
subGrade                         0.0
employmentTitle                  0.0
employmentLength                 0.0
homeOwnership                    0.0
annualIncome                     0.0
verificationStatus               0.0
purpose                          0.0
postCode                         0.0
regionCode                       0.0
dti                              0.0
delinquency_2years               0.0
ficoRangeLow                     0.0
ficoRangeHigh                    0.0
openAcc                          0.0
pubRec                           0.0
pubRecBankruptcies               0.0
revolBal                         0.0
revolUtil                        0.0
totalAcc                         0.0
initialListStatus                0.0
applicationType                  0.0
earliesCreditLine                0.0
t

In [26]:
from heamy.pipeline import ModelsPipeline

pipeline = ModelsPipeline(model_xgb, model_cat, model_lgb)
pipeline

<heamy.pipeline.ModelsPipeline at 0x1d98fdddca0>

In [27]:
# 构建第一层新特征，其中k默认是5，表示5折交叉验证，full_test=True，对全部训练集进行训练得到基学习器，然后用基学习器对测试集预测得到新特征
stack_ds = pipeline.stack(k=5, seed=111, full_test=True)

Parameters: { "silent" } are not used.

[0]	train-auc:0.65869	eval-auc:0.63451
[200]	train-auc:0.91833	eval-auc:0.69782
[242]	train-auc:0.93863	eval-auc:0.69401
调参后xgboost单模型在验证集上的AUC：0.7129878955736779
Parameters: { "silent" } are not used.

[0]	train-auc:0.70519	eval-auc:0.65631
[200]	train-auc:0.92066	eval-auc:0.69518
[287]	train-auc:0.95243	eval-auc:0.69338
调参后xgboost单模型在验证集上的AUC：0.7111621648702423
Parameters: { "silent" } are not used.

[0]	train-auc:0.66983	eval-auc:0.61215
[200]	train-auc:0.92022	eval-auc:0.68537
[311]	train-auc:0.96042	eval-auc:0.68067
调参后xgboost单模型在验证集上的AUC：0.6895361641554534
Parameters: { "silent" } are not used.

[0]	train-auc:0.67622	eval-auc:0.67436
[200]	train-auc:0.92226	eval-auc:0.69847
[310]	train-auc:0.96223	eval-auc:0.69074
调参后xgboost单模型在验证集上的AUC：0.7058145270686891
Parameters: { "silent" } are not used.

[0]	train-auc:0.67368	eval-auc:0.64188
[200]	train-auc:0.92173	eval-auc:0.70401
[328]	train-auc:0.96394	eval-auc:0.70362
调参后xgboost单模型在验证集上的AUC：0.70

[1000]	training's auc: 0.808603	valid_1's auc: 0.75368
Early stopping, best iteration is:
[910]	training's auc: 0.803146	valid_1's auc: 0.754461
调参后lightgbm单模型在验证集上的AUC：0.7544612620084317
Training until validation scores don't improve for 500 rounds
[500]	training's auc: 0.783602	valid_1's auc: 0.706332
[1000]	training's auc: 0.813971	valid_1's auc: 0.707588
[1500]	training's auc: 0.829783	valid_1's auc: 0.711131
[2000]	training's auc: 0.842396	valid_1's auc: 0.711658
Early stopping, best iteration is:
[1952]	training's auc: 0.840611	valid_1's auc: 0.712181
调参后lightgbm单模型在验证集上的AUC：0.7120576725025747


In [28]:
from sklearn.linear_model import LogisticRegression
# 第二层使用逻辑回归进行stack
LogisticRegression(solver='lbfgs')
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
# 测试集的预测结果
test_pred = stacker.predict()
test_pred

array([0.13280599, 0.08156378, 0.10895408, ..., 0.0846414 , 0.17596171,
       0.11715986])

In [29]:
"""生成提交格式的DataFrame"""
df_result = pd.DataFrame({'id': df_data.loc[df_data['sample']=='test', 'id'].values, 'isDefault': test_pred})
df_result.sort_values(by='id').head(20)

Unnamed: 0,id,isDefault
7,800020,0.088528
1895,800238,0.097526
1842,800247,0.077351
29,800520,0.092606
1173,800652,0.113584
125,800660,0.094978
1705,800780,0.081297
1728,801107,0.135039
754,801237,0.089723
1559,801400,0.088421


In [30]:
"""保存数据用于预测建模"""
df_result.to_csv('./submission_data_stacking_model_20200924_V1_5folds.csv', encoding='gbk', index=False)

In [31]:
# 构建第一层新特征，将训练集切分成8:2，其中80%用于训练基学习器，20%用于构建新特征
blend_ds = pipeline.blend(proportion=0.2,seed=111)
# 第二层使用逻辑回归进行blend
blender = Classifier(dataset=blend_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
# 测试集的预测结果
test_pred = blender.predict()
test_pred

Parameters: { "silent" } are not used.

[0]	train-auc:0.68368	eval-auc:0.65641
[200]	train-auc:0.92070	eval-auc:0.69783
[337]	train-auc:0.96954	eval-auc:0.68703
调参后xgboost单模型在验证集上的AUC：0.7023685937593223
0:	learn: 0.4009936	test: 0.3995053	best: 0.3995053 (0)	total: 3.66ms	remaining: 3m 2s
500:	learn: 0.3592399	test: 0.3861299	best: 0.3860656 (494)	total: 1.46s	remaining: 2m 24s
1000:	learn: 0.3366318	test: 0.3861047	best: 0.3860248 (562)	total: 3.12s	remaining: 2m 32s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.3860248404
bestIteration = 562

Shrink model to first 563 iterations.
调参后CatBoost单模型在验证集上的AUC：0.7016525673814676
Training until validation scores don't improve for 500 rounds
[500]	training's auc: 0.786821	valid_1's auc: 0.690931
[1000]	training's auc: 0.814006	valid_1's auc: 0.687307
Early stopping, best iteration is:
[520]	training's auc: 0.788686	valid_1's auc: 0.691935
调参后lightgbm单模型在验证集上的AUC：0.6919350341558622


array([0.15312777, 0.08753736, 0.11056236, ..., 0.08886997, 0.1536042 ,
       0.13897994])

In [32]:
"""生成提交格式的DataFrame"""
df_result = pd.DataFrame({'id': df_data.loc[df_data['sample']=='test', 'id'].values, 'isDefault': test_pred})
df_result.sort_values(by='id').head()

Unnamed: 0,id,isDefault
7,800020,0.097779
1895,800238,0.087353
1842,800247,0.088185
29,800520,0.094934
1173,800652,0.11215


In [33]:
"""保存数据用于预测建模"""
df_result.to_csv('./submission_data_blending_model_20200924_V1.csv', encoding='gbk', index=False)

In [None]:
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the hyperparameters to tune and their search ranges
params = {
    'max_depth': (3, 10),
    'min_child_weight': (1, 10),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1),
    'gamma': (0, 1),
    'alpha': (0, 10),
    'learning_rate': (0.01, 0.1),
    'n_estimators': (50, 200)
}

# Define the objective function to optimize
def objective(max_depth, min_child_weight, subsample, colsample_bytree, gamma, alpha, learning_rate, n_estimators):
    # Define the XGBoost classifier with the current hyperparameters
    clf = xgb.XGBClassifier(max_depth=int(max_depth),
                            min_child_weight=int(min_child_weight),
                            subsample=subsample,
                            colsample_bytree=colsample_bytree,
                            gamma=gamma,
                            alpha=alpha,
                            learning_rate=learning_rate,
                            n_estimators=int(n_estimators),
                            random_state=42)

    # Fit the classifier on the training data
    clf.fit(X_train, y_train)

    # Compute the predictions on the validation data
    y_pred_val = clf.predict_proba(X_val)[:, 1]

    # Compute the AUC score on the validation data
    score = roc_auc_score(y_val, y_pred_val)

    # Return the negative score (to be maximized by the optimizer)
    return -score

# Define the optimizer and run the optimization
optimizer = BayesianOptimization(f=objective, pbounds=params, random_state=42)
optimizer.maximize(init_points=5, n_iter=20)

# Print the best hyperparameters and score found by the optimizer
print('Best hyperparameters:', optimizer.max['params'])
print('Best score:', -optimizer.max['target'])