In [1]:
%%time

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

CPU times: user 1.13 s, sys: 163 ms, total: 1.29 s
Wall time: 1.71 s


In [2]:
%%time

train = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s4e10/sample_submission.csv')

CPU times: user 146 ms, sys: 31 ms, total: 177 ms
Wall time: 246 ms


In [3]:
%%time

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object

In [4]:
%%time

# check missing values

missing_values = train.isnull().sum()
missing_values

CPU times: user 23.5 ms, sys: 1.08 ms, total: 24.5 ms
Wall time: 23.6 ms


id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

In [5]:
%%time

import h2o
from h2o.automl import H2OAutoML

# Initialize the H2O cluster
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.24" 2024-07-16; OpenJDK Runtime Environment (build 11.0.24+8-post-Ubuntu-1ubuntu322.04); OpenJDK 64-Bit Server VM (build 11.0.24+8-post-Ubuntu-1ubuntu322.04, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp6gjgtm7x
  JVM stdout: /tmp/tmp6gjgtm7x/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp6gjgtm7x/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,1 month and 7 days
H2O_cluster_name:,H2O_from_python_unknownUser_k0i3a3
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.250 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


CPU times: user 366 ms, sys: 82.7 ms, total: 449 ms
Wall time: 7.55 s


In [6]:
%%time

# Converting training data to H2O Frame
train_h2o = h2o.H2OFrame(train)
test_h2o = h2o.H2OFrame(test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
CPU times: user 1.05 s, sys: 55.4 ms, total: 1.11 s
Wall time: 4.73 s


In [7]:
%%time

# Define the target and features
target = 'loan_status'
features = [col for col in train_h2o.columns if col not in ['id', target]]

CPU times: user 17 µs, sys: 2 µs, total: 19 µs
Wall time: 24.1 µs


In [8]:
%%time

# Convert train categorical columns are set as factors
train_h2o['person_home_ownership'] = train_h2o['person_home_ownership'].asfactor()
train_h2o['loan_intent'] = train_h2o['loan_intent'].asfactor()
train_h2o['loan_grade'] = train_h2o['loan_grade'].asfactor()
train_h2o['cb_person_default_on_file'] = train_h2o['cb_person_default_on_file'].asfactor()
train_h2o['loan_status'] = train_h2o['loan_status'].asfactor()  # Target column

# Convert test categorical columns to factors
test_h2o['person_home_ownership'] = test_h2o['person_home_ownership'].asfactor()
test_h2o['loan_intent'] = test_h2o['loan_intent'].asfactor()
test_h2o['loan_grade'] = test_h2o['loan_grade'].asfactor()
test_h2o['cb_person_default_on_file'] = test_h2o['cb_person_default_on_file'].asfactor()

CPU times: user 1.31 ms, sys: 0 ns, total: 1.31 ms
Wall time: 1.31 ms


In [9]:
%%time

# Split the data into training and validation sets
train, valid = train_h2o.split_frame(ratios=[0.8], seed=42)

CPU times: user 274 ms, sys: 1.93 ms, total: 276 ms
Wall time: 821 ms


In [10]:
%%time

# Initializing H2O AutoML
aml = H2OAutoML(max_runtime_secs=1000, seed=42, nfolds=5, sort_metric='AUC')

CPU times: user 3.84 ms, sys: 133 µs, total: 3.97 ms
Wall time: 33.8 ms


In [11]:
%%time

# Train the AutoML model
aml.train(x=features, y=target, training_frame=train, validation_frame=valid)

AutoML progress: |
00:34:13.19: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.

███████████████████████████████████████████████████████████████| (done) 100%
CPU times: user 7.19 s, sys: 622 ms, total: 7.81 s
Wall time: 16min 41s


key,value
Stacking strategy,cross_validation
Number of base models (used / total),13/42
# GBM base models (used / total),1/16
# XGBoost base models (used / total),11/20
# DRF base models (used / total),1/2
# DeepLearning base models (used / total),0/3
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Unnamed: 0,0,1,Error,Rate
0,8466.0,84.0,0.0098,(84.0/8550.0)
1,86.0,1320.0,0.0612,(86.0/1406.0)
Total,8552.0,1404.0,0.0171,(170.0/9956.0)

metric,threshold,value,idx
max f1,0.2493997,0.9395018,204.0
max f2,0.2026558,0.9559132,219.0
max f0point5,0.3127741,0.955107,187.0
max accuracy,0.2556594,0.9829249,202.0
max precision,0.9777671,1.0,0.0
max recall,0.0944291,1.0,280.0
max specificity,0.9777671,1.0,0.0
max absolute_mcc,0.2493997,0.9295617,204.0
max min_per_class_accuracy,0.1901635,0.9765292,224.0
max mean_per_class_accuracy,0.1791181,0.977713,228.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100442,0.9391515,7.0810811,7.0810811,1.0,0.9551832,1.0,0.9551832,0.0711238,0.0711238,608.1081081,608.1081081,0.0711238
2,0.0200884,0.9133318,7.0810811,7.0810811,1.0,0.9258773,1.0,0.9405303,0.0711238,0.1422475,608.1081081,608.1081081,0.1422475
3,0.0300321,0.8938108,7.0810811,7.0810811,1.0,0.9037652,1.0,0.9283572,0.0704125,0.21266,608.1081081,608.1081081,0.21266
4,0.0400763,0.8749104,7.0810811,7.0810811,1.0,0.8842599,1.0,0.9173053,0.0711238,0.2837838,608.1081081,608.1081081,0.2837838
5,0.0500201,0.8523365,7.0810811,7.0810811,1.0,0.8637169,1.0,0.9066521,0.0704125,0.3541963,608.1081081,608.1081081,0.3541963
6,0.1000402,0.6072496,7.0810811,7.0810811,1.0,0.7640212,1.0,0.8353367,0.3541963,0.7083926,608.1081081,608.1081081,0.7083926
7,0.1500603,0.2114766,5.0904157,6.417526,0.7188755,0.3707162,0.9062918,0.6804632,0.254623,0.9630156,409.0415717,541.752596,0.9466414
8,0.2000804,0.131228,0.6114186,4.9659991,0.0863454,0.1592059,0.7013052,0.5501489,0.0305832,0.9935989,-38.8581352,396.5999132,0.9240082
9,0.3000201,0.0868592,0.06405,3.3331101,0.0090452,0.1050839,0.4707064,0.4018932,0.0064011,1.0,-93.595002,233.3110144,0.8150877
10,0.4000603,0.063765,0.0,2.4996234,0.0,0.0741501,0.3530003,0.3199369,0.0,1.0,-100.0,149.9623399,0.6985965

Unnamed: 0,0,1,Error,Rate
0,9939.0,128.0,0.0127,(128.0/10067.0)
1,404.0,1262.0,0.2425,(404.0/1666.0)
Total,10343.0,1390.0,0.0453,(532.0/11733.0)

metric,threshold,value,idx
max f1,0.3336836,0.8259162,173.0
max f2,0.20749,0.8093824,211.0
max f0point5,0.5413096,0.8923643,135.0
max accuracy,0.3895422,0.9548283,161.0
max precision,0.9845333,1.0,0.0
max recall,0.0139572,1.0,385.0
max specificity,0.9845333,1.0,0.0
max absolute_mcc,0.3336836,0.8044326,173.0
max min_per_class_accuracy,0.1293621,0.8913565,253.0
max mean_per_class_accuracy,0.1445585,0.8921622,243.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100571,0.9298896,7.042617,7.042617,1.0,0.9501605,1.0,0.9501605,0.0708283,0.0708283,604.2617047,604.2617047,0.0708283
2,0.020029,0.9039786,6.9222304,6.9826799,0.982906,0.9160655,0.9914894,0.9331856,0.0690276,0.1398559,592.2230431,598.267988,0.1396573
3,0.0300009,0.874924,6.8620371,6.9425799,0.974359,0.8893265,0.9857955,0.9186074,0.0684274,0.2082833,586.2037123,594.2579873,0.2077866
4,0.040058,0.8510165,6.8635675,6.9227427,0.9745763,0.8631749,0.9829787,0.9046903,0.0690276,0.2773109,586.3567461,592.2742714,0.2765162
5,0.0500298,0.8222035,6.7416505,6.8866477,0.957265,0.8362335,0.9778535,0.8910456,0.0672269,0.3445378,574.1650506,588.6647674,0.3432465
6,0.1000597,0.5873215,6.6706901,6.7786689,0.9471891,0.7371791,0.9625213,0.8141124,0.3337335,0.6782713,567.0690082,577.8668878,0.6739006
7,0.1500043,0.2053422,2.8122396,5.4580282,0.3993174,0.324227,0.775,0.6510028,0.1404562,0.8187275,181.2239572,445.8028211,0.779391
8,0.2000341,0.1416484,1.0677903,4.3600011,0.1516184,0.1677575,0.6190882,0.53014,0.0534214,0.8721489,6.7790319,336.0001095,0.7833439
9,0.3000085,0.0923463,0.5943897,3.1051539,0.084399,0.1137295,0.4409091,0.3913759,0.0594238,0.9315726,-40.5610326,210.515388,0.7360824
10,0.399983,0.0687148,0.2761811,2.3980614,0.0392157,0.0795866,0.3405071,0.3134452,0.027611,0.9591837,-72.3818939,139.8061377,0.6517435

Unnamed: 0,0,1,Error,Rate
0,39724.0,504.0,0.0125,(504.0/40228.0)
1,1752.0,4932.0,0.2621,(1752.0/6684.0)
Total,41476.0,5436.0,0.0481,(2256.0/46912.0)

metric,threshold,value,idx
max f1,0.4567159,0.8138614,152.0
max f2,0.1843579,0.8004225,235.0
max f0point5,0.7084342,0.8845483,104.0
max accuracy,0.6122419,0.9525495,121.0
max precision,0.9981597,1.0,0.0
max recall,0.0020267,1.0,396.0
max specificity,0.9981597,1.0,0.0
max absolute_mcc,0.6058094,0.7939138,122.0
max min_per_class_accuracy,0.0915675,0.8828546,286.0
max mean_per_class_accuracy,0.1149466,0.8859462,270.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100188,0.9928515,6.9737525,6.9737525,0.993617,0.9956828,0.993617,0.9956828,0.0698683,0.0698683,597.3752499,597.3752499,0.0697938
2,0.0200162,0.9878323,6.9137973,6.9438068,0.9850746,0.9903236,0.9893504,0.9930061,0.0691203,0.1389886,591.3797261,594.3806805,0.13874
3,0.0300136,0.9829346,6.8838674,6.9238412,0.9808102,0.9854185,0.9865057,0.9904787,0.0688211,0.2078097,588.3867403,592.3841195,0.2073374
4,0.0400111,0.9773619,6.8838674,6.9138531,0.9808102,0.9801696,0.9850826,0.9879028,0.0688211,0.2766308,588.3867403,591.3853071,0.2759347
5,0.0500085,0.9707387,6.8539375,6.9018751,0.9765458,0.9741906,0.983376,0.9851615,0.0685218,0.3451526,585.3937545,590.1875074,0.3441831
6,0.1000171,0.8111703,6.5338947,6.7178849,0.9309463,0.9264149,0.9571611,0.9557882,0.3267504,0.6719031,553.3894738,571.7884906,0.6669065
7,0.1500043,0.2029434,2.7146381,5.3838486,0.3867804,0.3999849,0.7670882,0.7705731,0.1356972,0.8076002,171.4638146,438.3848576,0.7668575
8,0.2000128,0.1125173,1.082999,4.3085216,0.1543052,0.1491969,0.6138762,0.6152125,0.0541592,0.8617594,8.2999036,330.85216,0.7716978
9,0.3000085,0.0554845,0.6268969,3.0814006,0.08932,0.0786475,0.4390365,0.4363702,0.062687,0.9244464,-37.3103136,208.1400551,0.7281901
10,0.4000043,0.0336732,0.284273,2.3821559,0.0405031,0.0432269,0.3394085,0.3380896,0.0284261,0.9528725,-71.5726959,138.2155939,0.6447289

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.9522762,0.0009190,0.9521282,0.9534959,0.950939,0.9525594,0.9522584
aic,2842.189,55.275883,2805.3591,2807.0408,2934.7803,2852.4277,2811.337
auc,0.9548020,0.0029945,0.958557,0.9546075,0.9518593,0.9519554,0.9570309
err,0.0477238,0.0009190,0.0478718,0.0465042,0.0490610,0.0474406,0.0477417
err_count,447.8,11.1892805,451.0,435.0,465.0,443.0,445.0
f0point5,0.868858,0.0026619,0.8732936,0.8675721,0.8672087,0.8693661,0.8668492
f1,0.8151851,0.0076237,0.8156927,0.824526,0.8050314,0.8107646,0.8199109
f2,0.7678605,0.0138369,0.7652201,0.7855496,0.7511737,0.7595646,0.7777948
lift_top_group,6.9744687,0.0839607,6.937408,6.896043,7.0996256,7.0194464,6.9198217
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
%%time 

# View the leaderboard
lb = aml.leaderboard
print(lb)

model_id                                                     auc    logloss     aucpr    mean_per_class_error      rmse        mse
StackedEnsemble_AllModels_3_AutoML_1_20241007_03412     0.95479    0.149991  0.873398                0.137324  0.199284  0.0397141
StackedEnsemble_AllModels_2_AutoML_1_20241007_03412     0.953949   0.150993  0.872062                0.138871  0.199722  0.039889
StackedEnsemble_BestOfFamily_4_AutoML_1_20241007_03412  0.953726   0.151557  0.870955                0.137485  0.200229  0.0400917
StackedEnsemble_BestOfFamily_3_AutoML_1_20241007_03412  0.953704   0.151525  0.871069                0.136961  0.200213  0.0400852
XGBoost_3_AutoML_1_20241007_03412                       0.952726   0.153069  0.869438                0.138371  0.20133   0.0405337
XGBoost_grid_1_AutoML_1_20241007_03412_model_14         0.95261    0.152867  0.86916                 0.133927  0.200867  0.0403474
StackedEnsemble_AllModels_1_AutoML_1_20241007_03412     0.952447   0.152841  0.86842

In [13]:
%%time

# Make predictions on the test set
preds = aml.predict(test_h2o)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
CPU times: user 282 ms, sys: 6.57 ms, total: 288 ms
Wall time: 3.53 s


In [18]:
%%time

# Extract the predicted labels
predicted_labels = preds['predict'].as_data_frame().values.flatten()

CPU times: user 33.4 ms, sys: 2.78 ms, total: 36.2 ms
Wall time: 81.6 ms





In [19]:
%%time

# Create submission DataFrame
submission = sample_submission.copy()
submission['loan_status'] = predicted_labels

CPU times: user 1.88 ms, sys: 0 ns, total: 1.88 ms
Wall time: 1.23 ms


In [20]:
%%time

# Save the submission file
submission_file_path = 'submission.csv'
submission.to_csv(submission_file_path, index=False)

CPU times: user 41.2 ms, sys: 4.92 ms, total: 46.1 ms
Wall time: 45 ms


In [21]:
%%time

submission.head()

CPU times: user 232 µs, sys: 0 ns, total: 232 µs
Wall time: 235 µs


Unnamed: 0,id,loan_status
0,58645,1
1,58646,0
2,58647,1
3,58648,0
4,58649,0
