In [41]:
import pandas as pd
pd.set_option('display.max_columns', 30)

import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
# Read the HMEQ test set into a DataFrame and display it.
test_csv_path = 'hmeq_test.csv'
df = pd.read_csv(test_csv_path)
df

Unnamed: 0,id,loan_amount,mortgage_amount,property_value,loan_reason,occupation,occupation_length,derogatory_reports,late_payments,oldest_credit_line,recent_credit,credit_number,ratio
0,12816,28100,32470.0,54522.0,DebtCon,Other,35.0,0.0,0.0,295.496684,0.0,23.0,38.799483
1,11678,10500,57880.0,66802.0,DebtCon,Other,2.0,0.0,0.0,94.598425,8.0,21.0,45.464766
2,13568,10900,,76207.0,,,,,,,,,22.392612
3,16525,50000,44000.0,59000.0,DebtCon,Office,12.0,0.0,1.0,102.500000,2.0,26.0,
4,16160,7500,,26850.0,HomeImp,Office,20.0,0.0,0.0,185.633333,1.0,16.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
757,16196,25000,82000.0,128325.0,DebtCon,Mgr,,0.0,1.0,10.133333,,41.0,
758,16461,23400,89226.0,119604.0,DebtCon,Self,3.0,0.0,0.0,221.681383,1.0,26.0,34.556201
759,16619,15300,21383.0,38480.0,HomeImp,Mgr,20.0,0.0,1.0,182.600000,4.0,31.0,
760,12593,8000,76600.0,96000.0,DebtCon,Mgr,2.0,0.0,0.0,73.033333,2.0,24.0,


##### **Data preprocessing**

In [43]:
# Set the row identifiers aside (they are reused when the submission file
# is written) and remove them from the feature frame.
list_id = df.loc[:, 'id']
df = df.drop('id', axis='columns')
df

Unnamed: 0,loan_amount,mortgage_amount,property_value,loan_reason,occupation,occupation_length,derogatory_reports,late_payments,oldest_credit_line,recent_credit,credit_number,ratio
0,28100,32470.0,54522.0,DebtCon,Other,35.0,0.0,0.0,295.496684,0.0,23.0,38.799483
1,10500,57880.0,66802.0,DebtCon,Other,2.0,0.0,0.0,94.598425,8.0,21.0,45.464766
2,10900,,76207.0,,,,,,,,,22.392612
3,50000,44000.0,59000.0,DebtCon,Office,12.0,0.0,1.0,102.500000,2.0,26.0,
4,7500,,26850.0,HomeImp,Office,20.0,0.0,0.0,185.633333,1.0,16.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
757,25000,82000.0,128325.0,DebtCon,Mgr,,0.0,1.0,10.133333,,41.0,
758,23400,89226.0,119604.0,DebtCon,Self,3.0,0.0,0.0,221.681383,1.0,26.0,34.556201
759,15300,21383.0,38480.0,HomeImp,Mgr,20.0,0.0,1.0,182.600000,4.0,31.0,
760,8000,76600.0,96000.0,DebtCon,Mgr,2.0,0.0,0.0,73.033333,2.0,24.0,


In [44]:
# Number of missing values in each column.
df.isna().sum()

loan_amount             0
mortgage_amount        67
property_value         17
loan_reason            29
occupation             38
occupation_length      65
derogatory_reports     89
late_payments          77
oldest_credit_line     41
recent_credit          74
credit_number          26
ratio                 154
dtype: int64

Filling missing values

In [45]:
# Categorical columns: a missing loan_reason gets its own 'Unknown' label,
# while a missing occupation falls back to the most frequent occupation.
occupation_mode = df['occupation'].mode()[0]
df = df.assign(
    loan_reason=df['loan_reason'].fillna('Unknown'),
    occupation=df['occupation'].fillna(occupation_mode),
)

In [46]:
# Fill every non-object column with its own most frequent value, then
# re-count the missing values to confirm nothing is left.
# NOTE(review): the modes are computed from this test file itself, and for
# continuous columns such as `ratio` the mode can be an untypical value --
# confirm this matches the imputation used when the scaler/model were fitted.
numeric_cols = df.select_dtypes(exclude='object').columns
fill_values = {name: df[name].mode()[0] for name in numeric_cols}
df = df.fillna(value=fill_values)
df.isnull().sum()

loan_amount           0
mortgage_amount       0
property_value        0
loan_reason           0
occupation            0
occupation_length     0
derogatory_reports    0
late_payments         0
oldest_credit_line    0
recent_credit         0
credit_number         0
ratio                 0
dtype: int64

Encode data with mapping file

In [47]:
# Encode the categorical columns with the mapping stored in
# 'encoding_mapping.json'. The file is expected to hold one
# {category: code} object per column name.
with open('encoding_mapping.json', 'r') as json_file:
    encoding_mapping = json.load(json_file)

for col, mapping in encoding_mapping.items():
    encoded = df[col].map(mapping)
    # Series.map() silently turns every value without a mapping entry into
    # NaN. That happens for categories missing from the mapping and also when
    # this cell is re-run on already-encoded columns. Fail loudly instead of
    # passing NaN on to the scaler and the model.
    unmapped = df.loc[encoded.isna() & df[col].notna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {col!r} has values missing from encoding_mapping.json: "
            f"{list(unmapped)}"
        )
    df[col] = encoded
df

Unnamed: 0,loan_amount,mortgage_amount,property_value,loan_reason,occupation,occupation_length,derogatory_reports,late_payments,oldest_credit_line,recent_credit,credit_number,ratio
0,28100,32470.0,54522.0,0,2,35.0,0.0,0.0,295.496684,0.0,23.0,38.799483
1,10500,57880.0,66802.0,0,2,2.0,0.0,0.0,94.598425,8.0,21.0,45.464766
2,10900,18000.0,76207.0,2,2,2.0,0.0,0.0,73.033333,0.0,19.0,22.392612
3,50000,44000.0,59000.0,0,1,12.0,0.0,1.0,102.500000,2.0,26.0,1.909225
4,7500,18000.0,26850.0,1,1,20.0,0.0,0.0,185.633333,1.0,16.0,1.909225
...,...,...,...,...,...,...,...,...,...,...,...,...
757,25000,82000.0,128325.0,0,0,2.0,0.0,1.0,10.133333,0.0,41.0,1.909225
758,23400,89226.0,119604.0,0,5,3.0,0.0,0.0,221.681383,1.0,26.0,34.556201
759,15300,21383.0,38480.0,1,0,20.0,0.0,1.0,182.600000,4.0,31.0,1.909225
760,8000,76600.0,96000.0,0,0,2.0,0.0,0.0,73.033333,2.0,24.0,1.909225


Scale data

In [48]:
import joblib

# Load the saved scaler from 'minmaxscaler.pkl' and apply it to the
# preprocessed test features.
mms = joblib.load('minmaxscaler.pkl')
x_test = mms.transform(df)

##### **Load model**

XGBoost model

In [49]:
import pickle

# Restore the pickled XGBoost model from disk.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open('xgboost_model.pkl', 'rb') as model_file:
    xgb_model = pickle.load(model_file)

ANN model

In [50]:
from tensorflow import keras

# Restore the saved Keras network from its '.keras' file.
ann_model_path = 'ann_model.keras'
ann_model = keras.models.load_model(ann_model_path)

##### **Make predictions**

In [56]:


# Score the scaled test features with the ANN and flatten the output to a
# 1-D array (assumes the network emits a single score per row).
y_prob = ann_model.predict(x_test).flatten()

# Rounded copy of the scores, kept for inspection only.
y_sub = np.round(y_prob, 3)

# Threshold the *unrounded* scores: rounding first would turn a score such
# as 0.5004 into 0.5 and flip its label from 1 to 0.
y_pred = (y_prob > 0.5).astype("int32")

# Every id must receive exactly one prediction.
assert len(y_pred) == len(list_id), "prediction count does not match id count"

# Build the submission in its own variable so the feature frame `df` is not
# overwritten by a frame with a different meaning.
submission = pd.DataFrame({'id': list_id, 'default': y_pred})
submission.to_csv('submission.csv', index=False)


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 737us/step
