In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
import lightgbm as lgb

In [3]:
def train_model_pre(deal_data_func):
    """Grid-search a LightGBM classifier on the train split and score it on the test split.

    Prints, in order: the best parameter combination, the best
    cross-validation score (``GridSearchCV.best_score_``), a separator line,
    the test-set accuracy, and the test-set mean squared error.

    Parameters
    ----------
    deal_data_func : callable
        Called as ``deal_data_func('train')`` and ``deal_data_func('test')``.
        Each call must return a ``(features, labels)`` pair.

    Returns
    -------
    GridSearchCV
        The fitted search object (the value returned by ``GridSearchCV.fit``).

    Raises
    ------
    ValueError
        If both feature sets expose ``.columns`` and the test columns are not
        the same set as the training columns.
    """
    X_train, y_train = deal_data_func('train')

    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [20, 40, 60],
        'num_leaves': [30, 40],
    }
    search = GridSearchCV(lgb.LGBMClassifier(), param_grid)
    model = search.fit(X_train, y_train)
    print(model.best_params_)
    print(model.best_score_)

    X_test, y_test = deal_data_func('test')

    # Loaders may pick columns based on the data of each split (for example
    # "columns without missing values"), so train and test can end up with
    # different features. Fail loudly instead of predicting on misaligned input.
    if hasattr(X_train, 'columns') and hasattr(X_test, 'columns'):
        train_columns = list(X_train.columns)
        test_columns = list(X_test.columns)
        if train_columns != test_columns:
            if set(train_columns) != set(test_columns):
                missing = sorted(set(train_columns) - set(test_columns), key=str)
                extra = sorted(set(test_columns) - set(train_columns), key=str)
                raise ValueError(
                    'train/test feature columns differ: '
                    f'missing from test={missing}, unexpected in test={extra}'
                )
            # Same features in a different order: restore the training order.
            X_test = X_test[train_columns]

    print('_*'*60)
    pre = model.predict(X_test)
    print((pre == y_test).mean())
    # scikit-learn's signature is mean_squared_error(y_true, y_pred).
    print(mean_squared_error(y_test, pre))
    return model

# 删除空值/不删除

In [4]:
def deal_data(text):
    """Load a split and keep only the columns that have no missing values.

    Parameters
    ----------
    text : str
        Split name inserted into the path ``./final/{text}_final.csv``.

    Returns
    -------
    tuple
        ``(x, y)``: the remaining columns without 'loan_status', and the
        'loan_status' column itself.
    """
    frame = pd.read_csv(f'./final/{text}_final.csv')
    # True for every column in which all rows are populated.
    fully_populated = frame.notna().all()
    features = frame.loc[:, fully_populated]
    target = features.pop('loan_status')
    return features, target

# Run the grid search and test-set evaluation with deal_data as the loader.
# NOTE(review): `mdeol` looks like a typo for `model`; name left unchanged.
mdeol = train_model_pre(deal_data)

{'learning_rate': 0.05, 'n_estimators': 60, 'num_leaves': 30}
0.91844
_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*
0.91724
0.08276


In [5]:
def deal_data(text):
    """Load a split unchanged, missing values included.

    Note: this rebinds the name ``deal_data`` that an earlier cell already
    defined with different behaviour.

    Parameters
    ----------
    text : str
        Split name inserted into the path ``./final/{text}_final.csv``.

    Returns
    -------
    tuple
        ``(x, y)``: every column except 'loan_status', and the 'loan_status'
        column itself.
    """
    features = pd.read_csv(f'./final/{text}_final.csv')
    target = features.pop('loan_status')
    return features, target

# Run the grid search and test-set evaluation with the deal_data defined in this cell.
mdeol = train_model_pre(deal_data)

{'learning_rate': 0.1, 'n_estimators': 40, 'num_leaves': 40}
0.9193200000000001
_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*
0.91694
0.08306


# 将空值变成0,衍生一个是否是空值的Bool

In [6]:
def deal_data_2(text):
    """Load a split, add a missing-value flag per incomplete column, and zero-fill.

    For every column that contains at least one missing value a companion
    column ``<col>_nan`` is added: 1 where the value was missing, 0 otherwise.
    All missing values are then replaced with 0.

    Parameters
    ----------
    text : str
        Split name inserted into the path ``./final/{text}_final.csv``.

    Returns
    -------
    tuple
        ``(x, y)``: the feature frame without 'loan_status', and the
        'loan_status' column itself.
    """
    data = pd.read_csv(f'./final/{text}_final.csv')

    # Snapshot the column list first; the loop appends columns to `data`.
    na_columns = list(data.columns[data.isna().any()])
    for column_na in na_columns:
        # The previous get_dummies(...).sum(axis=1) form produced 1 for
        # *present* values, i.e. the opposite of what `_nan` says.
        data[f'{column_na}_nan'] = data[column_na].isna().astype(int)
    data = data.fillna(0)

    x = data
    y = x.pop('loan_status')
    return x, y

# Run the grid search and test-set evaluation with deal_data_2 as the loader.
mdeol2 = train_model_pre(deal_data_2)

{'learning_rate': 0.1, 'n_estimators': 40, 'num_leaves': 30}
0.9191199999999998
_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*
0.91778
0.08222


# 对较大的数值取 log

In [7]:
def deal_data_3(text):
    """Load a split, flag and zero-fill missing values, and add log features.

    Steps, in order:

    1. Read ``./final/{text}_final.csv``.
    2. For every column with at least one missing value add ``<col>_nan``
       (1 where the value was missing, 0 otherwise).
    3. Replace all missing values with 0.
    4. For every column whose maximum exceeds 100 add ``<col>_log`` holding
       ``log(1 + value)``.

    Parameters
    ----------
    text : str
        Split name inserted into the file name.

    Returns
    -------
    tuple
        ``(x, y)``: the feature frame without 'loan_status', and the
        'loan_status' column itself.
    """
    data = pd.read_csv(f'./final/{text}_final.csv')

    # Snapshot the column list first; the loop appends columns to `data`.
    na_columns = list(data.columns[data.isna().any()])
    for column_na in na_columns:
        # isna() gives the flag directly, without building a one-hot matrix
        # that has one column per distinct value.
        data[f'{column_na}_nan'] = data[column_na].isna().astype(int)
    data = data.fillna(0)

    large_columns = list(data.columns[data.max() > 100])
    for column_max in large_columns:
        data[f'{column_max}_log'] = np.log1p(data[column_max])

    x = data
    y = x.pop('loan_status')
    return x, y

# Run the grid search and test-set evaluation with deal_data_3 as the loader.
mdeol3 = train_model_pre(deal_data_3)

{'learning_rate': 0.1, 'n_estimators': 40, 'num_leaves': 30}
0.9190999999999999
_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*
0.91778
0.08222


# 数据衍生

In [8]:
def deal_data_4(text):
    """Load a split, flag and zero-fill missing values, and add log features.

    Produces, for the file ``./final/{text}_final.csv``:

    * ``<col>_nan`` for every column with missing values (1 = was missing),
    * all missing values replaced by 0,
    * ``<col>_log`` = ``log(1 + value)`` for every column whose maximum is
      greater than 100.

    Parameters
    ----------
    text : str
        Split name inserted into the file name.

    Returns
    -------
    tuple
        ``(x, y)``: the feature frame without 'loan_status', and the
        'loan_status' column itself.
    """
    data = pd.read_csv(f'./final/{text}_final.csv')

    # Fix the list of incomplete columns before new columns are appended.
    incomplete = list(data.columns[data.isna().any()])
    for column_na in incomplete:
        # Direct null test; avoids one-hot encoding every distinct value
        # just to learn whether the cell was empty.
        data[f'{column_na}_nan'] = data[column_na].isna().astype(int)
    data = data.fillna(0)

    wide_range = list(data.columns[data.max() > 100])
    for column_max in wide_range:
        data[f'{column_max}_log'] = np.log1p(data[column_max])

    x = data
    y = x.pop('loan_status')
    return x, y

# Run the grid search and test-set evaluation with deal_data_4 as the loader.
mdeol4 = train_model_pre(deal_data_4)

{'learning_rate': 0.1, 'n_estimators': 40, 'num_leaves': 30}
0.9190999999999999
_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*
0.91778
0.08222


In [9]:
def deal_data_5(text):
    """Load a split and append three ratio features built from FICO range and DTI.

    Added columns:

    * ``derive_1`` = continuous_fico_range_low / continuous_fico_range_high
    * ``derive_2`` = continuous_fico_range_low / continuous_dti
    * ``derive_3`` = continuous_fico_range_high / continuous_dti

    Missing values are left as they are. A zero in the DTI column is not
    guarded against, so those rows get whatever pandas division yields.

    Parameters
    ----------
    text : str
        Split name inserted into the path ``./final/{text}_final.csv``.

    Returns
    -------
    tuple
        ``(x, y)``: the feature frame without 'loan_status', and the
        'loan_status' column itself.
    """
    raw = pd.read_csv(f'./final/{text}_final.csv')

    fico_low = raw['continuous_fico_range_low']
    fico_high = raw['continuous_fico_range_high']
    dti = raw['continuous_dti']

    features = raw.assign(
        derive_1=fico_low / fico_high,
        derive_2=fico_low / dti,
        derive_3=fico_high / dti,
    )
    target = features.pop('loan_status')
    return features, target

# Run the grid search and test-set evaluation with deal_data_5 as the loader.
mdeol5 = train_model_pre(deal_data_5)

{'learning_rate': 0.05, 'n_estimators': 60, 'num_leaves': 30}
0.9190400000000001
_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*
0.91742
0.08258


In [10]:
def deal_data_6(text):
    """Load a split and append four ratio features.

    Added columns:

    * ``derive_1`` = continuous_fico_range_low / continuous_fico_range_high
    * ``derive_2`` = continuous_fico_range_low / continuous_dti
    * ``derive_3`` = continuous_fico_range_high / continuous_dti
    * ``derive_4`` = continuous_funded_amnt_inv / continuous_funded_amnt
      / continuous_inq_last_6mths

    Missing values and zero denominators are not handled specially.

    Parameters
    ----------
    text : str
        Split name inserted into the path ``./final/{text}_final.csv``.

    Returns
    -------
    tuple
        ``(x, y)``: the feature frame without 'loan_status', and the
        'loan_status' column itself.
    """
    raw = pd.read_csv(f'./final/{text}_final.csv')

    fico_low = raw['continuous_fico_range_low']
    fico_high = raw['continuous_fico_range_high']
    dti = raw['continuous_dti']
    funded_ratio = raw['continuous_funded_amnt_inv'] / raw['continuous_funded_amnt']

    features = raw.assign(
        derive_1=fico_low / fico_high,
        derive_2=fico_low / dti,
        derive_3=fico_high / dti,
        derive_4=funded_ratio / raw['continuous_inq_last_6mths'],
    )
    target = features.pop('loan_status')
    return features, target

# Run the grid search and test-set evaluation with deal_data_6 as the loader.
mdeol6 = train_model_pre(deal_data_6)

{'learning_rate': 0.1, 'n_estimators': 40, 'num_leaves': 30}
0.91906
_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*
0.91786
0.08214


In [11]:
def deal_data_7(text):
    """Load a split and append five derived features.

    Added columns:

    * ``derive_1`` = continuous_fico_range_low / continuous_fico_range_high
    * ``derive_2`` = continuous_fico_range_low / continuous_dti
    * ``derive_3`` = continuous_fico_range_high / continuous_dti
    * ``derive_4`` = continuous_funded_amnt_inv / continuous_funded_amnt
      / continuous_inq_last_6mths
    * ``derive_5`` = log(continuous_pub_rec / continuous_open_acc + 1)

    Missing values and zero denominators are not handled specially.

    Parameters
    ----------
    text : str
        Split name inserted into the path ``./final/{text}_final.csv``.

    Returns
    -------
    tuple
        ``(x, y)``: the feature frame without 'loan_status', and the
        'loan_status' column itself.
    """
    raw = pd.read_csv(f'./final/{text}_final.csv')

    fico_low = raw['continuous_fico_range_low']
    fico_high = raw['continuous_fico_range_high']
    dti = raw['continuous_dti']
    funded_ratio = raw['continuous_funded_amnt_inv'] / raw['continuous_funded_amnt']
    pub_rec_share = raw['continuous_pub_rec'] / raw['continuous_open_acc']

    features = raw.assign(
        derive_1=fico_low / fico_high,
        derive_2=fico_low / dti,
        derive_3=fico_high / dti,
        derive_4=funded_ratio / raw['continuous_inq_last_6mths'],
        derive_5=np.log(pub_rec_share + 1),
    )
    target = features.pop('loan_status')
    return features, target

# Run the grid search and test-set evaluation with deal_data_7 as the loader.
mdeol7 = train_model_pre(deal_data_7)

{'learning_rate': 0.1, 'n_estimators': 40, 'num_leaves': 30}
0.9192600000000001
_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*
0.91752
0.08248


In [12]:
def deal_data_8(text):
    """Load a split and combine null flags, zero-fill, log features and FICO/DTI ratios.

    Steps, in order:

    1. Read ``./final/{text}_final.csv``.
    2. For every column with at least one missing value add ``<col>_nan``
       (1 where the value was missing, 0 otherwise).
    3. Replace all missing values with 0.
    4. For every column whose maximum exceeds 100 add ``<col>_log`` holding
       ``log(1 + value)``.
    5. Add ``derive_1`` (fico low / fico high), ``derive_2`` (fico low / dti)
       and ``derive_3`` (fico high / dti). These are computed after the
       zero-fill and are not themselves log-transformed.

    Unlike ``deal_data_6``, no ``derive_4`` column is produced.

    Parameters
    ----------
    text : str
        Split name inserted into the file name.

    Returns
    -------
    tuple
        ``(x, y)``: the feature frame without 'loan_status', and the
        'loan_status' column itself.
    """
    data = pd.read_csv(f'./final/{text}_final.csv')

    # Snapshot the column list first; the loop appends columns to `data`.
    na_columns = list(data.columns[data.isna().any()])
    for column_na in na_columns:
        # isna() gives the flag directly, without building a one-hot matrix
        # that has one column per distinct value.
        data[f'{column_na}_nan'] = data[column_na].isna().astype(int)
    data = data.fillna(0)

    large_columns = list(data.columns[data.max() > 100])
    for column_max in large_columns:
        data[f'{column_max}_log'] = np.log1p(data[column_max])

    data['derive_1'] = data['continuous_fico_range_low']/data['continuous_fico_range_high']
    data['derive_2'] = data['continuous_fico_range_low']/data['continuous_dti']
    data['derive_3'] = data['continuous_fico_range_high']/data['continuous_dti']

    x = data
    y = x.pop('loan_status')
    return x, y

# Run the grid search and test-set evaluation with deal_data_8 as the loader.
mdeol8 = train_model_pre(deal_data_8)

{'learning_rate': 0.05, 'n_estimators': 60, 'num_leaves': 30}
0.9196799999999999
_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*
0.91762
0.08238


# 总结

In [13]:
def deal_data_best(text):
    """Load a split and append the four ratio features ``derive_1`` .. ``derive_4``.

    Added columns:

    * ``derive_1`` = continuous_fico_range_low / continuous_fico_range_high
    * ``derive_2`` = continuous_fico_range_low / continuous_dti
    * ``derive_3`` = continuous_fico_range_high / continuous_dti
    * ``derive_4`` = continuous_funded_amnt_inv / continuous_funded_amnt
      / continuous_inq_last_6mths

    Missing values and zero denominators are not handled specially.

    Parameters
    ----------
    text : str
        Split name inserted into the path ``./final/{text}_final.csv``.

    Returns
    -------
    tuple
        ``(x, y)``: the feature frame without 'loan_status', and the
        'loan_status' column itself.
    """
    source = pd.read_csv(f'./final/{text}_final.csv')

    low = source['continuous_fico_range_low']
    high = source['continuous_fico_range_high']
    debt_to_income = source['continuous_dti']
    invested_share = source['continuous_funded_amnt_inv'] / source['continuous_funded_amnt']

    derived = {
        'derive_1': low / high,
        'derive_2': low / debt_to_income,
        'derive_3': high / debt_to_income,
        'derive_4': invested_share / source['continuous_inq_last_6mths'],
    }
    features = source.assign(**derived)
    target = features.pop('loan_status')
    return features, target

# Run the grid search and test-set evaluation with deal_data_best as the loader.
mdeolbest = train_model_pre(deal_data_best)

{'learning_rate': 0.1, 'n_estimators': 40, 'num_leaves': 30}
0.91906
_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*_*
0.91786
0.08214


In [14]:
# Load the raw training split (./final/train_final.csv) for inspection.
data = pd.read_csv(f'./final/{"train"}_final.csv')

In [15]:
# Summary statistics of `data` for the income, delinquency, DTI, FICO range,
# funded amount, inquiry, installment and interest-rate columns.
data[['continuous_annual_inc', 'continuous_annual_inc_joint',
       'continuous_delinq_2yrs', 'continuous_dti', 'continuous_dti_joint',
       'continuous_fico_range_high', 'continuous_fico_range_low',
       'continuous_funded_amnt', 'continuous_funded_amnt_inv',
       'continuous_inq_last_6mths', 'continuous_installment',
       'continuous_int_rate']].describe()

Unnamed: 0,continuous_annual_inc,continuous_annual_inc_joint,continuous_delinq_2yrs,continuous_dti,continuous_dti_joint,continuous_fico_range_high,continuous_fico_range_low,continuous_funded_amnt,continuous_funded_amnt_inv,continuous_inq_last_6mths,continuous_installment,continuous_int_rate
count,50000.0,220.0,50000.0,49999.0,220.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,78358.86,109248.802545,0.34738,19.080239,18.831864,698.40446,694.4043,14332.5365,14325.533,0.62082,428.892109,11.986125
std,97830.53,52319.230212,0.91874,9.802002,7.586033,31.180915,31.180139,8617.58487,8612.853833,0.906035,254.569184,4.182402
min,0.0,28000.0,0.0,0.0,3.0,664.0,660.0,1000.0,950.0,0.0,14.77,5.32
25%,47000.0,76000.0,0.0,12.41,13.805,674.0,670.0,7775.0,7750.0,0.0,244.2425,9.17
50%,65000.0,99000.0,0.0,18.52,17.94,689.0,685.0,12000.0,12000.0,0.0,369.52,11.49
75%,94000.0,132700.0,0.0,25.32,23.4775,714.0,710.0,20000.0,20000.0,1.0,572.85,14.33
max,9000000.0,500000.0,15.0,999.0,43.86,850.0,845.0,35000.0,35000.0,5.0,1354.66,28.99


In [16]:
# Summary statistics of `data` for the last-FICO range, loan amount, target
# ('loan_status'), months-since-event, open-account and public-record columns.
data[['continuous_last_fico_range_high',
       'continuous_last_fico_range_low', 'continuous_loan_amnt',
       'loan_status', 'continuous_mths_since_last_delinq',
       'continuous_mths_since_last_major_derog',
       'continuous_mths_since_last_record', 'continuous_open_acc',
       'continuous_pub_rec',]].describe()

Unnamed: 0,continuous_last_fico_range_high,continuous_last_fico_range_low,continuous_loan_amnt,loan_status,continuous_mths_since_last_delinq,continuous_mths_since_last_major_derog,continuous_mths_since_last_record,continuous_open_acc,continuous_pub_rec
count,50000.0,50000.0,50000.0,50000.0,26083.0,15052.0,9495.0,50000.0,50000.0
mean,675.64486,658.4284,14332.5365,0.79576,34.197485,44.214589,65.50079,11.94496,0.25282
std,81.571458,133.005552,8617.58487,0.403149,21.828204,21.352291,23.659021,5.676976,0.665455
min,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,624.0,620.0,7775.0,1.0,16.0,27.0,51.0,8.0,0.0
50%,689.0,685.0,12000.0,1.0,31.0,44.0,67.0,11.0,0.0
75%,734.0,730.0,20000.0,1.0,50.0,63.0,81.0,15.0,0.0
max,850.0,845.0,35000.0,1.0,152.0,152.0,120.0,67.0,23.0
