In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from supervised.automl import AutoML

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

import sys
sys.path.append('../') # to import function from another folder

from src.utils import rename_column, rename_columns

import src.dataset as dataset

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
CREDIT_RISK_DATASET = '../data/raw/credit_risk_dataset.csv'
LABELS = 'loan_status'
MISSING_VALUE_HANDLING = 'fill_zero'

categorical_features = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']

df = pd.read_csv(CREDIT_RISK_DATASET)

df = df.drop_duplicates(keep='last')

In [3]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [4]:
# split data before filling NaN & vectorizing to avoid data leak
# stratify to make sure we get all of the categorical variables on train set
X = df.drop(columns=LABELS)
y = df[LABELS]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=46)

X_train = dataset.clean_data('credit_risk', X_train, LABELS, MISSING_VALUE_HANDLING)
X_test = dataset.clean_data('credit_risk', X_test, LABELS, MISSING_VALUE_HANDLING)
print(X_train.shape[0])
print(y_train.shape[0])

25932
25932


In [6]:
# autoML
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

automl = AutoML()
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)

Linear algorithm was disabled.
AutoML directory: AutoML_1
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 0.522131 trained in 0.34 seconds




2_DecisionTree logloss 0.322163 trained in 34.91 seconds
* Step default_algorithms will try to check up to 3 models




3_Default_Xgboost logloss 0.186951 trained in 10.06 seconds


 -0.27598761]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
 -0.49905251]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
 -1.07269995]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
 -0.93671195]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
 -0.05562857]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
 -0.29571537]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  0.9891076 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
 -0.50619467]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
 -0.35355339]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  2.10828101]' has dtype incompatible with int64, please explicitly cast to a comp

4_Default_NeuralNetwork logloss 0.244421 trained in 7.46 seconds




5_Default_RandomForest logloss 0.284042 trained in 8.55 seconds
* Step ensemble will try to check up to 1 model
Ensemble logloss 0.186951 trained in 2.84 seconds
AutoML fit time: 74.04 seconds
AutoML best model: 3_Default_Xgboost




In [7]:
print(accuracy_score(y_test, predictions.astype(int)))
print(recall_score(y_test, predictions.astype(int)))
print(precision_score(y_test, predictions.astype(int)))

# Apakah autoML ini sudah menghandle data imbalance?
# solving: 1. resampling, 2. bootstrap

0.932449105490438
0.717948717948718
0.981549815498155


In [8]:
print(roc_auc_score(y_test, automl.predict_proba(X_test)[:, 1]))

0.9520754451255935


