In [83]:
import pandas as pd
import lightgbm as lgb
import numpy as np

In [84]:
train_data = pd.read_csv('./data/train_internet.csv')
test_data = pd.read_csv('./data/train_public.csv')

In [85]:
train_data.shape

(750000, 42)

In [86]:
test_data.shape

(10000, 39)

In [87]:
feature_list = []

train_data.rename(columns={'is_default': 'isDefault'}, inplace=True)

for feature in train_data.columns:
    if feature in test_data.columns:
        feature_list.append(feature)

In [88]:
feature_list

['loan_id',
 'user_id',
 'total_loan',
 'year_of_loan',
 'interest',
 'monthly_payment',
 'class',
 'employer_type',
 'industry',
 'work_year',
 'house_exist',
 'censor_status',
 'issue_date',
 'use',
 'post_code',
 'region',
 'debt_loan_ratio',
 'del_in_18month',
 'scoring_low',
 'scoring_high',
 'pub_dero_bankrup',
 'early_return',
 'early_return_amount',
 'early_return_amount_3mon',
 'recircle_b',
 'recircle_u',
 'initial_list_status',
 'earlies_credit_mon',
 'title',
 'policy_code',
 'f0',
 'f1',
 'f2',
 'f3',
 'f4',
 'isDefault']

In [89]:
train_data.head()

Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,sub_class,work_type,employer_type,...,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,f5,isDefault
0,119262,0,12000.0,5,11.53,264.1,B,B5,职员,普通企业,...,Mar-1984,0.0,1.0,1.0,0.0,8.0,17.0,8.0,1.0,1
1,369815,1,8000.0,3,13.98,273.35,C,C3,其他,普通企业,...,Jan-1992,94.0,1.0,,,,,,,0
2,787833,2,20000.0,5,17.99,507.76,D,D2,工人,上市企业,...,Oct-1996,0.0,1.0,6.0,0.0,10.0,8.0,3.0,0.0,0
3,671675,3,10700.0,3,10.16,346.07,B,B1,职员,普通企业,...,Jul-2000,41646.0,1.0,3.0,0.0,4.0,11.0,6.0,0.0,0
4,245160,4,8000.0,3,8.24,251.58,B,B1,其他,政府机构,...,Mar-2000,4.0,1.0,3.0,0.0,8.0,6.0,4.0,1.0,0


In [90]:
train_data = train_data.loc[:, feature_list]
test_data = test_data.loc[:, feature_list]

In [91]:
train_data.head()

Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,...,initial_list_status,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,isDefault
0,119262,0,12000.0,5,11.53,264.1,B,普通企业,采矿业,,...,0,Mar-1984,0.0,1.0,1.0,0.0,8.0,17.0,8.0,1
1,369815,1,8000.0,3,13.98,273.35,C,普通企业,国际组织,10+ years,...,1,Jan-1992,94.0,1.0,,,,,,0
2,787833,2,20000.0,5,17.99,507.76,D,上市企业,信息传输、软件和信息技术服务业,10+ years,...,0,Oct-1996,0.0,1.0,6.0,0.0,10.0,8.0,3.0,0
3,671675,3,10700.0,3,10.16,346.07,B,普通企业,电力、热力生产供应业,2 years,...,0,Jul-2000,41646.0,1.0,3.0,0.0,4.0,11.0,6.0,0
4,245160,4,8000.0,3,8.24,251.58,B,政府机构,金融业,5 years,...,1,Mar-2000,4.0,1.0,3.0,0.0,8.0,6.0,4.0,0


In [92]:
train_data.dtypes

loan_id                       int64
user_id                       int64
total_loan                  float64
year_of_loan                  int64
interest                    float64
monthly_payment             float64
class                        object
employer_type                object
industry                     object
work_year                    object
house_exist                   int64
censor_status                 int64
issue_date                   object
use                           int64
post_code                   float64
region                        int64
debt_loan_ratio             float64
del_in_18month              float64
scoring_low                 float64
scoring_high                float64
pub_dero_bankrup            float64
early_return                  int64
early_return_amount           int64
early_return_amount_3mon    float64
recircle_b                  float64
recircle_u                  float64
initial_list_status           int64
earlies_credit_mon          

In [93]:
work_year_dict = {
    '< 1 year': 1,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
}

train_data['work_year'] = train_data['work_year'].map(work_year_dict)
test_data['work_year'] = test_data['work_year'].map(work_year_dict)

In [94]:
class_dict = {
    'A': 1,
    'B': 2,
    'C': 3,
    'D': 4,
    'E': 5,
    'F': 6,
    'G': 7,
}

train_data['class'] = train_data['class'].map(class_dict)
test_data['class'] = test_data['class'].map(class_dict)

In [95]:
train_data['issue_date']

0         2015-06-01
1         2010-10-01
2         2016-08-01
3         2013-05-01
4         2017-04-01
             ...    
749995    2016-02-01
749996    2014-03-01
749997    2015-12-01
749998    2017-12-01
749999    2013-12-01
Name: issue_date, Length: 750000, dtype: object

In [96]:
train_data['issue_date'] = pd.to_datetime(train_data['issue_date'])
test_data['issue_date'] = pd.to_datetime(test_data['issue_date'])

train_data['issue_date_month'] = train_data['issue_date'].dt.month
test_data['issue_date_month'] = test_data['issue_date'].dt.month

train_data['issue_date_dayofweek'] = train_data['issue_date'].dt.dayofweek
test_data['issue_date_dayofweek'] = test_data['issue_date'].dt.dayofweek

In [97]:
filt = train_data.isna().any(axis=1)
len(train_data.loc[filt, :])

106678

In [98]:
col_to_drop = ['issue_date', 'earlies_credit_mon']
train_data = train_data.drop(col_to_drop, axis=1)
test_data = test_data.drop(col_to_drop, axis=1)

In [99]:
cat_cols = ['employer_type', 'industry']

from sklearn.preprocessing import LabelEncoder
for col in cat_cols:
    lbl = LabelEncoder().fit(train_data[col])
    train_data[col] = lbl.transform(train_data[col])
    test_data[col] = lbl.transform(test_data[col])

In [100]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def k_fold_serachParmaters(model, train_data, train_label, test_data):
    n_splits=5
    
    sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)
    pred_Test = np.zeros(len(test_data))
    
    auc_train, auc_val = 0, 0
    for tr_idx, val_idx in sk.split(train_data, train_label):
        x_train = train_data.iloc[tr_idx]
        y_train = train_label.iloc[tr_idx]
        x_val = train_data.iloc[val_idx]
        y_val = train_label.iloc[val_idx]

        model.fit(x_train, y_train, 
                  eval_set=[(x_val, y_val)], 
                  categorical_feature = cat_cols,
                 early_stopping_rounds=100,
                 verbose=False)

        pred_Test += model.predict_proba(test_data)[:, 1]/n_splits

        pred = model.predict(x_val)
        auc_val += roc_auc_score(y_val,pred)/n_splits
        
        pred = model.predict(x_train)
        auc_train += roc_auc_score(y_train, pred)/n_splits
        
        
    return auc_val, pred_Test

In [102]:
import warnings
warnings.filterwarnings("ignore")

score_tta = None
score_list = []

tta_fold = 5
for _ in range(tta_fold):
    clf = lgb.LGBMClassifier(
        num_leaves=np.random.randint(6, 10), min_child_samples= np.random.randint(2,5),
        max_depth=7,learning_rate=0.01,
        n_estimators=2050,n_jobs=-1)

    score, test_pred = k_fold_serachParmaters(clf,
                           train_data.drop(['loan_id', 'user_id', 'isDefault'], axis=1),
                           train_data['isDefault'],
                           test_data.drop(['loan_id', 'user_id', 'isDefault'], axis=1),
                          )

    print(score)
    if score_tta is None:
        score_tta = test_pred/tta_fold
    else:
        score_tta += test_pred/tta_fold
    score_list.append(score)

0.593489049986057
0.5915161974497998
0.592665961229029
0.5903180712986229
0.5903431668736872


In [103]:
df_out = pd.read_csv('./data/train_public.csv')
df_out['pred_default'] = score_tta

In [104]:
df_out.to_csv('pred.csv', index=None)