In [1]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import optuna
from optuna.integration import LightGBMTunerCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
import tensorflow_decision_forests as tfdf
import tensorflow as tf

2024-10-18 02:34:04.751844: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-18 02:34:04.789736: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-18 02:34:04.810545: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-18 02:34:04.825828: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-18 02:34:04.843081: I tensorflow/core/platform/cpu_feature_guar

In [2]:
df = pd.read_csv('RNNs/Tensorflow-Keras/Binary Classification/train.csv')
df_test = pd.read_csv('RNNs/Tensorflow-Keras/Binary Classification/test.csv')

In [3]:
label_encoder = LabelEncoder()

# Columns to be label encoded
columns_to_encode = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']

# Apply label encoding to each specified column
for column in columns_to_encode:
    df[f'{column}_encoded'] = label_encoder.fit_transform(df[column])
    df_test[f'{column}_encoded'] = label_encoder.fit_transform(df_test[column])
# Display the first few rows of the dataframe to verify the changes
df.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,person_home_ownership_encoded,loan_intent_encoded,loan_grade_encoded,cb_person_default_on_file_encoded
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0,3,1,1,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0,2,3,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0,2,4,0,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0,3,5,1,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0,3,3,0,0


In [4]:
features = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'person_home_ownership_encoded', 'loan_intent_encoded', 'loan_grade_encoded', 'cb_person_default_on_file_encoded']
label = 'loan_status'

In [5]:
# Define the parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Initialize the XGBoost classifier
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=xgb_clf, param_distributions=param_grid, n_iter=50, scoring='accuracy', cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV
random_search.fit(df[features], df[label])

# Get the best estimator
best_xgb_clf = random_search.best_estimator_

# Make predictions on the test dataset
xgb_predictions = best_xgb_clf.predict(df_test[features])

# Add the predictions to the test dataframe
df_test['xgb_predicted_loan_status'] = xgb_predictions

# Display the first few rows of the test dataframe to verify the predictions
df_test.head()

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,person_home_ownership_encoded,loan_intent_encoded,loan_grade_encoded,cb_person_default_on_file_encoded,xgb_predicted_loan_status
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2,3,2,5,0,1
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4,0,4,2,1,0
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2,3,5,4,1,1
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7,3,0,0,0,0
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4,0,2,3,1,0


In [6]:
# Define the dataset for LightGBM
train_data = lgb.Dataset(df[features], label=df[label])

# Define the parameter search space
param_grid = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'n_jobs': -1,
    'seed': 42
}

# Define the tuning function
def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'verbosity': -1
    }
    
    cv_results = lgb.cv(
        params,
        train_data,
        nfold=3,
        stratified=True,
        shuffle=True,
        metrics='binary_logloss',
        seed=42,
        eval_train_metric=True,
    )
    return min(cv_results['train binary_logloss-mean'])

# Create the study and optimize
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Get the best parameters
best_params = study.best_params
best_params.update(param_grid)

# Train the final model with the best parameters
lgb_clf = lgb.LGBMClassifier(**best_params)
lgb_clf.fit(df[features], df[label])

# Make predictions on the test dataset
lgb_predictions = lgb_clf.predict(df_test[features])

# Add the predictions to the test dataframe
df_test['lgb_predicted_loan_status'] = lgb_predictions

# Display the first few rows of the test dataframe to verify the predictions
df_test.head()

[I 2024-10-18 02:34:36,551] A new study created in memory with name: no-name-7f8935f3-07f4-4b56-940c-2698b5dd7689
[I 2024-10-18 02:34:49,300] Trial 0 finished with value: 0.14873660272354394 and parameters: {'num_leaves': 123, 'max_depth': 6, 'learning_rate': 0.0118209867778185, 'n_estimators': 415, 'subsample': 0.9234400859227202, 'colsample_bytree': 0.7019559781182263, 'min_child_weight': 2}. Best is trial 0 with value: 0.14873660272354394.
[I 2024-10-18 02:35:00,460] Trial 1 finished with value: 0.24446176308038403 and parameters: {'num_leaves': 125, 'max_depth': 9, 'learning_rate': 0.001258782905216087, 'n_estimators': 486, 'subsample': 0.8379989062243944, 'colsample_bytree': 0.9973442290534018, 'min_child_weight': 6}. Best is trial 0 with value: 0.14873660272354394.
[I 2024-10-18 02:35:05,511] Trial 2 finished with value: 0.0959313035259561 and parameters: {'num_leaves': 21, 'max_depth': 15, 'learning_rate': 0.08338574320176234, 'n_estimators': 780, 'subsample': 0.7286264220443686

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,person_home_ownership_encoded,loan_intent_encoded,loan_grade_encoded,cb_person_default_on_file_encoded,xgb_predicted_loan_status,lgb_predicted_loan_status
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2,3,2,5,0,1,1
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4,0,4,2,1,0,0
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2,3,5,4,1,1,1
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7,3,0,0,0,0,0
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4,0,2,3,1,0,0


In [7]:
# Define the parameter grid for Logistic Regression
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300, 500, 1000]
}

# Initialize the Logistic Regression classifier
log_reg = LogisticRegression(random_state=42)

# Initialize RandomizedSearchCV
random_search_log_reg = RandomizedSearchCV(estimator=log_reg, param_distributions=param_grid, n_iter=50, scoring='accuracy', cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV
random_search_log_reg.fit(df[features], df[label])

# Get the best estimator
best_log_reg = random_search_log_reg.best_estimator_

# Make predictions on the test dataset
log_reg_predictions = best_log_reg.predict(df_test[features])

# Add the predictions to the test dataframe
df_test['log_reg_predicted_loan_status'] = log_reg_predictions

# Display the first few rows of the test dataframe to verify the predictions
df_test.head()

Fitting 3 folds for each of 50 candidates, totalling 150 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,person_home_ownership_encoded,loan_intent_encoded,loan_grade_encoded,cb_person_default_on_file_encoded,xgb_predicted_loan_status,lgb_predicted_loan_status,log_reg_predicted_loan_status
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2,3,2,5,0,1,1,1
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4,0,4,2,1,0,0,0
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2,3,5,4,1,1,1,1
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7,3,0,0,0,0,0,0
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4,0,2,3,1,0,0,0


In [8]:
# Define the parameter grid for Decision Tree
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'auto', 'sqrt', 'log2']
}

# Initialize the Decision Tree classifier
dt_clf = DecisionTreeClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search_dt = RandomizedSearchCV(estimator=dt_clf, param_distributions=param_grid, n_iter=50, scoring='accuracy', cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV
random_search_dt.fit(df[features], df[label])

# Get the best estimator
best_dt_clf = random_search_dt.best_estimator_

# Make predictions on the test dataset
dt_predictions = best_dt_clf.predict(df_test[features])

# Add the predictions to the test dataframe
df_test['dt_predicted_loan_status'] = dt_predictions

# Display the first few rows of the test dataframe to verify the predictions
df_test.head()

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,person_home_ownership_encoded,loan_intent_encoded,loan_grade_encoded,cb_person_default_on_file_encoded,xgb_predicted_loan_status,lgb_predicted_loan_status,log_reg_predicted_loan_status,dt_predicted_loan_status
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,N,2,3,2,5,0,1,1,1,1
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,Y,4,0,4,2,1,0,0,0,0
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,Y,2,3,5,4,1,1,1,1,0
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,N,7,3,0,0,0,0,0,0,0
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,Y,4,0,2,3,1,0,0,0,0


In [9]:
df_test[['id', 'xgb_predicted_loan_status']].to_csv('RNNs/Tensorflow-Keras/Binary Classification/XGB_predictions.csv', index=False)
df_test[['id', 'lgb_predicted_loan_status']].to_csv('RNNs/Tensorflow-Keras/Binary Classification/LGB_predictions.csv', index=False)
df_test[['id', 'log_reg_predicted_loan_status']].to_csv('RNNs/Tensorflow-Keras/Binary Classification/LogReg_predictions.csv', index=False)
df_test[['id', 'dt_predicted_loan_status']].to_csv('RNNs/Tensorflow-Keras/Binary Classification/DT_predictions.csv', index=False)

In [10]:
# Create a pool of all predicted loan statuses
df_test['pooled_predicted_loan_status'] = df_test[['xgb_predicted_loan_status', 'lgb_predicted_loan_status', 'log_reg_predicted_loan_status', 'dt_predicted_loan_status']].mode(axis=1)[0]
df_test['pooled_predicted_loan_status'] = df_test['pooled_predicted_loan_status'].astype(int)
# Display the first few rows of the test dataframe to verify the pooled predictions
df_test.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,...,cb_person_cred_hist_length,person_home_ownership_encoded,loan_intent_encoded,loan_grade_encoded,cb_person_default_on_file_encoded,xgb_predicted_loan_status,lgb_predicted_loan_status,log_reg_predicted_loan_status,dt_predicted_loan_status,pooled_predicted_loan_status
0,58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.36,...,2,3,2,5,0,1,1,1,1,1
1,58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.1,...,4,0,4,2,1,0,0,0,0,0
2,58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.13,...,2,3,5,4,1,1,1,1,0,1
3,58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.9,0.14,...,7,3,0,0,0,0,0,0,0,0
4,58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.15,...,4,0,2,3,1,0,0,0,0,0


In [11]:
df_test[['id', 'pooled_predicted_loan_status']].to_csv('RNNs/Tensorflow-Keras/Binary Classification/Pooled_predictions.csv', index=False)

In [12]:
# Prepare the dataset for TensorFlow Decision Forests
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(df[features + [label]], label=label)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(df_test[features + ['pooled_predicted_loan_status']], label='pooled_predicted_loan_status')

# Define the tuning function
def objective(trial):
    params = {
        'num_trees': trial.suggest_int('num_trees', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_examples': trial.suggest_int('min_examples', 1, 10),
        'sampling_ratio': trial.suggest_uniform('sampling_ratio', 0.6, 1.0)
    }
    
    model = tfdf.keras.GradientBoostedTreesModel(
        num_trees=params['num_trees'],
        max_depth=params['max_depth'],
        min_examples=params['min_examples'],
        sampling_ratio=params['sampling_ratio']
    )
    
    model.compile(metrics=["accuracy"])
    model.fit(train_ds)
    
    evaluation = model.evaluate(test_ds, return_dict=True)
    return evaluation["accuracy"]

# Create the study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Get the best parameters
best_params = study.best_params

# Train the final model with the best parameters
final_model = tfdf.keras.GradientBoostedTreesModel(
    num_trees=best_params['num_trees'],
    max_depth=best_params['max_depth'],
    min_examples=best_params['min_examples'],
    sampling_ratio=best_params['sampling_ratio']
)

final_model.compile(metrics=["accuracy"])
final_model.fit(train_ds)

# Make predictions on the test dataset
tfdf_predictions = final_model.predict(test_ds)
df_test['tfdf_predicted_loan_status'] = tfdf_predictions.argmax(axis=1)

# Display the first few rows of the test dataframe to verify the predictions
df_test.head()

I0000 00:00:1729230211.053373 1202413 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1729230211.731015 1202413 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1729230211.731535 1202413 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1729230211.738552 1202413 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1729230211.739006 1202413 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

TypeError: GradientBoostedTreesModel.__init__() got an unexpected keyword argument 'sampling_ratio'

In [None]:
df_test[['id', 'tfdf_predicted_loan_status']].to_csv('RNNs/Tensorflow-Keras/Binary Classification/TFDF_predictions.csv', index=False)