## 1. Preparasi

- Modelling kali ini saya menggunakan 3 model yaitu Logreg untuk baseline model dan LightGBM dan XGBoost.

In [11]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
#buat fungsi untuk reduce memory.
#intinya adalah iterasi keseluruh kolom pada dataframe dan mengubah datatype yg memorynya paling kecil.
def reduce_mem_usage(df):
    
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

#fungsi untuk memanggil dataframe
def import_data(file):
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [3]:
data = import_data('G:\\Bootcamp\\Materi\\28. NoSQL\\home credit\\data\\data_final.csv')

Memory usage of dataframe is 834.43 MB
Memory usage after optimization is: 226.95 MB
Decreased by 72.8%


In [4]:
data.head()

Unnamed: 0.2,Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,Unnamed: 0.1,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,TARGET
0,0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,...,0,64815.0,51934,0.972168,0.972168,0.972168,0.619141,0.624512,0.634277,1.0
1,1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,...,1,9253.0,50714,0.984863,0.984863,0.984863,0.795898,0.798828,0.804199,0.0
2,2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,...,2,64815.0,50714,0.981445,0.981445,0.981445,0.755371,0.758301,0.764648,0.0
3,3,29686.5,312682.5,297000.0,135000.0,0.0,0.0,0.0,0.0,0.0,...,3,64815.0,51934,0.981445,0.981445,0.981445,0.755371,0.758301,0.764648,0.0
4,4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,...,4,64815.0,50591,0.981445,0.981445,0.981445,0.755371,0.758301,0.764648,0.0


In [5]:
data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True)

In [6]:
data.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,TOTALAREA_MODE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,TARGET
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0149,64815.0,51934,0.972168,0.972168,0.972168,0.619141,0.624512,0.634277,1.0
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.071411,9253.0,50714,0.984863,0.984863,0.984863,0.795898,0.798828,0.804199,0.0
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06897,64815.0,50714,0.981445,0.981445,0.981445,0.755371,0.758301,0.764648,0.0
3,29686.5,312682.5,297000.0,135000.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.06897,64815.0,51934,0.981445,0.981445,0.981445,0.755371,0.758301,0.764648,0.0
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06897,64815.0,50591,0.981445,0.981445,0.981445,0.755371,0.758301,0.764648,0.0


In [7]:
#split ke data training dan testing
data_train = data[:307511]
data_test = data[307511:]

In [8]:
data_train.shape

(307511, 305)

In [9]:
data_test.shape

(48744, 305)

## 2. Balancing Data
- Menggunakan Undersampling random under sampling.

In [12]:
x = data_train.drop(columns=['TARGET', 'SK_ID_CURR'])
y = data_train['TARGET']

In [13]:
x.shape

(307511, 303)

In [14]:
y.shape

(307511,)

In [15]:
rus = RandomUnderSampler()
x_resample, y_resample = rus.fit_sample(x,y)

In [16]:
print(x_resample.shape)
print(y_resample.shape)

(49650, 303)
(49650,)


## 3. Rescaling Data

- agar data memiliki satuan yang sama

In [17]:
scaler = MinMaxScaler()

In [18]:
x_resample = scaler.fit_transform(x_resample)

## 4. Tuning XGBoost

In [19]:
#data testing
data_test_2 = data_test.copy()

#drop unused columns
data_test_2.drop(columns=['SK_ID_CURR', 'TARGET'], inplace=True)

#rescale
x_test = scaler.fit_transform(data_test_2)

In [28]:
xgb = XGBClassifier(
        objective="binary:logistic",
        booster="gbtree",
        eval_metric = "auc",
        nthread = 4,
        eta = 0.05,
        colsample_bylevel = 0.675,
        alpha = 0,
        random_state = 42,
        nrounds = 2000,
        learning_rate = 0.02,
        n_estimators = 600,
        silent = True
    )

In [29]:
#Parameter yang akan dituning
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [30]:
folds = 5
param_comb = 5
skf = StratifiedKFold(n_splits=folds, shuffle =True)

In [31]:
xgb_tuning = RandomizedSearchCV(xgb,
                                param_distributions=params,
                                n_iter=param_comb,
                                scoring='roc_auc',
                                n_jobs=4,
                                cv=skf.split(x_resample, y_resample),
                                verbose=3)

In [32]:
xgb_tuning.fit(x_resample, y_resample)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 43.4min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x0000028702AACF48>,
                   error_score='raise-deprecating',
                   estimator=XGBClassifier(alpha=0, base_score=0.5,
                                           booster='gbtree',
                                           colsample_bylevel=0.675,
                                           colsample_bynode=1,
                                           colsample_bytree=1, eta=0.05,
                                           eval_metric='auc', gamma=0,
                                           learning_rate=0.02, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n...
                                           reg_lambda=1, scale_pos_weight=1,
                                           seed=None, silent=True, subsample=1,
                                           verbosity=1),
                   iid='warn',

In [33]:
y_pred_3 = xgb_tuning.predict_proba(x_test)[:,1]

In [34]:
submission_3 = data_test[['SK_ID_CURR']]
submission_3['TARGET'] = y_pred_3
submission_3.head()

Unnamed: 0,SK_ID_CURR,TARGET
307511,100001,0.410861
307512,100005,0.688241
307513,100013,0.137997
307514,100028,0.37944
307515,100038,0.677512


In [35]:
submission_3.to_csv('XGB_tuning_submission.csv', index=False)

- XGBoost model sebelum tuning didapatkan score AUC 0.72373 dari kaggle.
- Setelah tuning didapatkan score AUC 0.73113 dari kaggle.