# 1. Import and Load Data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler

import joblib

In [2]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Unnamed: 0                            150000 non-null  int64  
 1   SeriousDlqin2yrs                      150000 non-null  int64  
 2   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 3   age                                   150000 non-null  int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 5   DebtRatio                             150000 non-null  float64
 6   MonthlyIncome                         120269 non-null  float64
 7   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 8   NumberOfTimes90DaysLate               150000 non-null  int64  
 9   NumberRealEstateLoansOrLines          150000 non-null  int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 11  

In [4]:
df = df.drop(['Unnamed: 0'], axis=1)

# 2. Data Preprocessing

In [5]:
X = df.drop(['SeriousDlqin2yrs'], axis=1)
y = df['SeriousDlqin2yrs']

In [6]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42)

2.1 Impute Null values

In [7]:
X.isna().sum()

RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [8]:
median_imputer_train = SimpleImputer(missing_values = np.nan,
                                   strategy = "median")

median_imputer_train.fit(X_train[['MonthlyIncome']])


median_imputer_test = SimpleImputer(missing_values = np.nan,
                                   strategy = "median")

median_imputer_test.fit(X_test[['MonthlyIncome']])

In [9]:
X_train['MonthlyIncome'] = median_imputer_train.transform(X_train[['MonthlyIncome']])

X_test['MonthlyIncome'] = median_imputer_test.transform(X_test[['MonthlyIncome']])

In [10]:
X_train.loc[X_train['NumberOfDependents'].isnull(), 'NumberOfDependents'] = 0.0
X_test.loc[X_test['NumberOfDependents'].isnull(), 'NumberOfDependents'] = 0.0

In [11]:
X_train.isna().sum()

RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

2.2 Standard Scaling

In [12]:
scaler = StandardScaler()

scaler.fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [13]:
joblib.dump(scaler, 'std_scaler.pkl')

# scaler_load = joblib.load("std_scaler.pkl")

['std_scaler.pkl']

2.3 Undersampling

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split( X_train_std, y_train, test_size=0.2, random_state=42)

In [15]:
ros = RandomUnderSampler(random_state = 42)

In [16]:
X_resample, y_resample = ros.fit_resample(X_train, y_train)

In [17]:
y_resample.value_counts()

0    5706
1    5706
Name: SeriousDlqin2yrs, dtype: int64

# 3. Model Creation

In [18]:
knn = KNeighborsClassifier()
lgr = LogisticRegression(solver='liblinear')
xgb = XGBClassifier()
rf = RandomForestClassifier()

# Create list of model
list_of_model = [
    {'model_name': knn.__class__.__name__, 'model_object': knn},
    {'model_name': lgr.__class__.__name__, 'model_object': lgr},
    {'model_name': xgb.__class__.__name__, 'model_object': xgb},
    {'model_name': rf.__class__.__name__, 'model_object': rf}
]

# 'n_neighbors': [50, 100, 200]
knn_params = {
    'n_neighbors': [100],
}

# 'penalty': ['l1', 'l2'],
# 'C': [0.01, 0.1],
# 'max_iter': [100, 300, 500]
lgr_params = {
    'penalty': ['l2'],
    'C': [0.01],
    'max_iter': [300]
}

# 'n_estimators': [5, 10, 25, 50]
xgb_params = {
    'n_estimators': [10]
}

# 'n_estimators': [5, 10, 25, 50]
rf_params = {
    'n_estimators': [10]
}

# Create model params
list_of_param = {
    'KNeighborsClassifier': knn_params,
    'LogisticRegression': lgr_params,
    'XGBClassifier': xgb_params,
    'RandomForestClassifier': rf_params
}

In [19]:
def get_best_model(list_of_model, list_of_param):
    
    best_model_obj = None
    best_model_name = None
    best_score = 0
    
    for model_dict in list_of_model:
        
        print(model_dict)
        
        model_obj = model_dict['model_object']
        model_param = list_of_param[model_dict['model_name']]
        
        model = RandomizedSearchCV(estimator = model_obj,
                            param_distributions = model_param,
                            n_iter=5,
                            cv = 5,
                            random_state = 123,
                            n_jobs=1,
                            verbose=10,
                            scoring = 'roc_auc')
        
        model.fit(X_train, y_train)
        
        y_pred_proba_valid = model.predict_proba(X_valid)[:, 1]
        valid_score = roc_auc_score(y_valid, y_pred_proba_valid)
        
        if valid_score > best_score:
            best_score = valid_score
            best_model_obj = model
            best_model_name = model_dict['model_name']
    
    print(f"""
          ==============================
          BEST MODEL        : {best_model_name}
          BEST SCORE        : {best_score}
          BEST MODEL OBJ    : {best_model_obj}
          ==============================
          """)
    
    #save your model or results
    joblib.dump(best_model_obj, 'best_model.pkl')
    
    print('Model saved !')

In [20]:
get_best_model(list_of_model, list_of_param)

{'model_name': 'KNeighborsClassifier', 'model_object': KNeighborsClassifier()}
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START n_neighbors=100.............................................




[CV 1/5; 1/1] END ..............n_neighbors=100;, score=0.766 total time=  21.6s
[CV 2/5; 1/1] START n_neighbors=100.............................................
[CV 2/5; 1/1] END ..............n_neighbors=100;, score=0.760 total time=  10.3s
[CV 3/5; 1/1] START n_neighbors=100.............................................
[CV 3/5; 1/1] END ..............n_neighbors=100;, score=0.772 total time=  11.0s
[CV 4/5; 1/1] START n_neighbors=100.............................................
[CV 4/5; 1/1] END ..............n_neighbors=100;, score=0.763 total time=  15.3s
[CV 5/5; 1/1] START n_neighbors=100.............................................
[CV 5/5; 1/1] END ..............n_neighbors=100;, score=0.751 total time=  17.3s
{'model_name': 'LogisticRegression', 'model_object': LogisticRegression(solver='liblinear')}
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START C=0.01, max_iter=300, penalty=l2............................




[CV 1/5; 1/1] END C=0.01, max_iter=300, penalty=l2;, score=0.695 total time=   0.3s
[CV 2/5; 1/1] START C=0.01, max_iter=300, penalty=l2............................
[CV 2/5; 1/1] END C=0.01, max_iter=300, penalty=l2;, score=0.691 total time=   0.2s
[CV 3/5; 1/1] START C=0.01, max_iter=300, penalty=l2............................
[CV 3/5; 1/1] END C=0.01, max_iter=300, penalty=l2;, score=0.698 total time=   0.3s
[CV 4/5; 1/1] START C=0.01, max_iter=300, penalty=l2............................
[CV 4/5; 1/1] END C=0.01, max_iter=300, penalty=l2;, score=0.694 total time=   0.2s
[CV 5/5; 1/1] START C=0.01, max_iter=300, penalty=l2............................
[CV 5/5; 1/1] END C=0.01, max_iter=300, penalty=l2;, score=0.698 total time=   0.3s
{'model_name': 'XGBClassifier', 'model_object': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_



[CV 1/5; 1/1] END ..............n_estimators=10;, score=0.866 total time=   0.6s
[CV 2/5; 1/1] START n_estimators=10.............................................
[CV 2/5; 1/1] END ..............n_estimators=10;, score=0.857 total time=   0.6s
[CV 3/5; 1/1] START n_estimators=10.............................................
[CV 3/5; 1/1] END ..............n_estimators=10;, score=0.864 total time=   0.6s
[CV 4/5; 1/1] START n_estimators=10.............................................
[CV 4/5; 1/1] END ..............n_estimators=10;, score=0.853 total time=   0.7s
[CV 5/5; 1/1] START n_estimators=10.............................................
[CV 5/5; 1/1] END ..............n_estimators=10;, score=0.847 total time=   0.6s
{'model_name': 'RandomForestClassifier', 'model_object': RandomForestClassifier()}
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START n_estimators=10.............................................




[CV 1/5; 1/1] END ..............n_estimators=10;, score=0.787 total time=   3.4s
[CV 2/5; 1/1] START n_estimators=10.............................................
[CV 2/5; 1/1] END ..............n_estimators=10;, score=0.785 total time=   3.4s
[CV 3/5; 1/1] START n_estimators=10.............................................
[CV 3/5; 1/1] END ..............n_estimators=10;, score=0.778 total time=   3.7s
[CV 4/5; 1/1] START n_estimators=10.............................................
[CV 4/5; 1/1] END ..............n_estimators=10;, score=0.782 total time=   3.5s
[CV 5/5; 1/1] START n_estimators=10.............................................
[CV 5/5; 1/1] END ..............n_estimators=10;, score=0.776 total time=   2.7s

          BEST MODEL        : XGBClassifier
          BEST SCORE        : 0.8588281190689482
          BEST MODEL OBJ    : RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callb

In [25]:
#load your model for further usage
best_model = joblib.load("best_model.pkl")

In [26]:
best_model

In [27]:

y_pred_proba_test = best_model.predict_proba(X_test)[:, 1]

test_score = roc_auc_score(y_test, y_pred_proba_test)

In [28]:
test_score

0.7681100283022785