In [1]:
pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0


In [2]:
pip install optuna-integration[lightgbm]

Collecting optuna-integration[lightgbm]
  Downloading optuna_integration-4.4.0-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.4.0-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.9/98.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-4.4.0


In [3]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [4]:
pip install imbalanced-learn



In [5]:
pip install lightgbm --upgrade



In [6]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import zipfile
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import optuna
from optuna.integration import LightGBMPruningCallback
import category_encoders as ce
from imblearn.over_sampling import SMOTE

In [7]:
# Extract the uploaded zip file
zip_file_path = '/content/playground-series-s4e10.zip'
extract_dir = '/content/playground-series-s4e10/'

In [8]:
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

In [9]:
# Load the datasets
train_df = pd.read_csv(os.path.join(extract_dir, 'train.csv'))
test_df = pd.read_csv(os.path.join(extract_dir, 'test.csv'))
sample_submission_df = pd.read_csv(os.path.join(extract_dir, 'sample_submission.csv'))


In [10]:
# Reset index to 'id' for both datasets
train_df.set_index('id', inplace=True)
test_df.set_index('id', inplace=True)

In [11]:
# Identify categorical and numerical features
categorical_features = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
numerical_features = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
                      'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']


In [12]:
# Feature Engineering: Create new features
def feature_engineering(df):
    # Income to loan amount ratio
    df['income_loan_ratio'] = df['person_income'] / df['loan_amnt']
    # Employment length to age ratio
    df['emp_age_ratio'] = df['person_emp_length'] / df['person_age']
    # Interest rate to income ratio
    df['int_rate_income_ratio'] = df['loan_int_rate'] / df['person_income']
    # Credit history length to age ratio
    df['cred_hist_age_ratio'] = df['cb_person_cred_hist_length'] / df['person_age']
    return df

In [13]:
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

In [14]:
# Update numerical features with new features
numerical_features.extend(['income_loan_ratio', 'emp_age_ratio', 'int_rate_income_ratio', 'cred_hist_age_ratio'])


In [15]:
# Separate features and target
X = train_df.drop(columns='loan_status')
y = train_df['loan_status']


In [16]:
# Initialize Target Encoder
target_enc = ce.TargetEncoder(cols=categorical_features)


In [17]:
# Initialize Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [18]:
# Prepare arrays for out-of-fold predictions
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(test_df.shape[0])

In [19]:
# Define objective function for Optuna
def objective(trial):
    aucs = []
    for train_index, valid_index in skf.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_index], X.iloc[valid_index]
        y_train_fold, y_valid_fold = y.iloc[train_index], y.iloc[valid_index]

        # Target Encoding
        X_train_fold = target_enc.fit_transform(X_train_fold, y_train_fold)
        X_valid_fold = target_enc.transform(X_valid_fold)
        X_test_enc = target_enc.transform(test_df)

        # Handle class imbalance with SMOTE
        sm = SMOTE(random_state=42)
        X_resampled, y_resampled = sm.fit_resample(X_train_fold, y_train_fold)

        # Scale numerical features
        scaler = StandardScaler()
        X_resampled[numerical_features] = scaler.fit_transform(X_resampled[numerical_features])
        X_valid_fold[numerical_features] = scaler.transform(X_valid_fold[numerical_features])
        X_test_enc[numerical_features] = scaler.transform(X_test_enc[numerical_features])

        # Define LightGBM parameters using new suggest methods
        param = {
            'objective': 'binary',
            'metric': 'auc',
            'boosting_type': 'gbdt',
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'subsample': trial.suggest_float('subsample', 0.4, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
            'random_state': 42,
            'verbosity': -1,
            'n_jobs': -1
        }

        lgb_train = lgb.Dataset(X_resampled, y_resampled)
        lgb_valid = lgb.Dataset(X_valid_fold, y_valid_fold, reference=lgb_train)

        # Use early_stopping and log_evaluation as callbacks
        pruning_callback = LightGBMPruningCallback(trial, 'auc')
        early_stopping_callback = lgb.early_stopping(stopping_rounds=100, verbose=False)
        log_eval_callback = lgb.log_evaluation(period=0)  # Suppress logging during optimization

        callbacks = [pruning_callback, early_stopping_callback, log_eval_callback]

        # Include valid_names to ensure the validation dataset is named 'valid_0'
        gbm = lgb.train(
            param,
            lgb_train,
            num_boost_round=10000,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=['training', 'valid_0'],  # Specify names here
            callbacks=callbacks
        )

        y_valid_pred = gbm.predict(X_valid_fold, num_iteration=gbm.best_iteration)
        auc = roc_auc_score(y_valid_fold, y_valid_pred)
        aucs.append(auc)

    return np.mean(aucs)

In [20]:
# Optimize hyperparameters using Optuna
study = optuna.create_study(direction='maximize', study_name='lgbm_classifier')
study.optimize(objective, n_trials=50)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[I 2025-08-08 16:50:04,577] Trial 21 finished with value: 0.9567749082806651 and parameters: {'learning_rate': 0.025188703230500775, 'num_leaves': 106, 'max_depth': 16, 'min_child_samples': 56, 'subsample': 0.5520422403240267, 'colsample_bytree': 0.6084746097447352, 'reg_alpha': 0.0007676066919034816, 'reg_lambda': 0.007117729401932272}. Best is trial 0 with value: 0.957165585322049.
[I 2025-08-08 16:50:06,029] Trial 22 pruned. Trial was pruned at iteration 0.
[I 2025-08-08 16:50:07,284] Trial 23 pruned. Trial was pruned at iteration 18.
[I 2025-08-08 16:50:07,908] Trial 24 pruned. Trial was pruned at iteration 0.
[I 2025-08-08 16:50:08,660] Trial 25 pruned. Trial was pruned at iteration 8.
[I 2025-08-08 16:50:09,315] Trial 26 pruned. Trial was pruned at iteration 1.
[I 2025-08-08 16:50:09,930] Trial 27 pruned. Trial was pruned at iteration 0.
[I 2025-08-08 16:50:10,556] Trial 28 pruned. Trial was pruned at iteration 0.
[

In [21]:
# Retrieve the best parameters
best_params = study.best_params
best_params['objective'] = 'binary'
best_params['metric'] = 'auc'
best_params['boosting_type'] = 'gbdt'
best_params['random_state'] = 42
best_params['verbosity'] = -1
best_params['n_jobs'] = -1

In [22]:
print('Best Hyperparameters:')
print(best_params)


Best Hyperparameters:
{'learning_rate': 0.008490612568447669, 'num_leaves': 277, 'max_depth': 7, 'min_child_samples': 42, 'subsample': 0.6972130994946316, 'colsample_bytree': 0.6970422287399669, 'reg_alpha': 1.262184242031126, 'reg_lambda': 0.10118255698787165, 'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt', 'random_state': 42, 'verbosity': -1, 'n_jobs': -1}


In [23]:
# Train the model with best hyperparameters and make predictions
for fold, (train_index, valid_index) in enumerate(skf.split(X, y)):
    print(f'Fold {fold + 1}')
    X_train_fold, X_valid_fold = X.iloc[train_index], X.iloc[valid_index]
    y_train_fold, y_valid_fold = y.iloc[train_index], y.iloc[valid_index]

    # Target Encoding
    X_train_fold = target_enc.fit_transform(X_train_fold, y_train_fold)
    X_valid_fold = target_enc.transform(X_valid_fold)
    X_test_enc = target_enc.transform(test_df)

    # Handle class imbalance with SMOTE
    sm = SMOTE(random_state=42)
    X_resampled, y_resampled = sm.fit_resample(X_train_fold, y_train_fold)

    # Scale numerical features
    scaler = StandardScaler()
    X_resampled[numerical_features] = scaler.fit_transform(X_resampled[numerical_features])
    X_valid_fold[numerical_features] = scaler.transform(X_valid_fold[numerical_features])
    X_test_enc[numerical_features] = scaler.transform(X_test_enc[numerical_features])

    lgb_train = lgb.Dataset(X_resampled, y_resampled)
    lgb_valid = lgb.Dataset(X_valid_fold, y_valid_fold, reference=lgb_train)

    # Use early_stopping and log_evaluation as callbacks
    early_stopping_callback = lgb.early_stopping(stopping_rounds=100, verbose=False)
    log_eval_callback = lgb.log_evaluation(period=100)

    callbacks = [early_stopping_callback, log_eval_callback]

    # Include valid_names here as well
    gbm = lgb.train(
        best_params,
        lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['training', 'valid_0'],
        callbacks=callbacks
    )

    # Predict on validation set
    y_valid_pred = gbm.predict(X_valid_fold, num_iteration=gbm.best_iteration)
    oof_preds[valid_index] = y_valid_pred

    # Predict on test set
    test_fold_pred = gbm.predict(X_test_enc, num_iteration=gbm.best_iteration)
    test_preds += test_fold_pred / skf.n_splits

Fold 1
[100]	training's auc: 0.98602	valid_0's auc: 0.920145
[200]	training's auc: 0.987094	valid_0's auc: 0.921728
[300]	training's auc: 0.98822	valid_0's auc: 0.924429
[400]	training's auc: 0.989544	valid_0's auc: 0.928088
[500]	training's auc: 0.990736	valid_0's auc: 0.931955
[600]	training's auc: 0.991871	valid_0's auc: 0.93559
[700]	training's auc: 0.992666	valid_0's auc: 0.93824
[800]	training's auc: 0.99353	valid_0's auc: 0.941802
[900]	training's auc: 0.994113	valid_0's auc: 0.943908
[1000]	training's auc: 0.99456	valid_0's auc: 0.945317
[1100]	training's auc: 0.995028	valid_0's auc: 0.946688
[1200]	training's auc: 0.995441	valid_0's auc: 0.948137
[1300]	training's auc: 0.995753	valid_0's auc: 0.949089
[1400]	training's auc: 0.99603	valid_0's auc: 0.949718
[1500]	training's auc: 0.996295	valid_0's auc: 0.950488
[1600]	training's auc: 0.996514	valid_0's auc: 0.950896
[1700]	training's auc: 0.996725	valid_0's auc: 0.95122
[1800]	training's auc: 0.996923	valid_0's auc: 0.95159
[19

In [None]:
# Evaluate the overall model performance
roc_auc = roc_auc_score(y, oof_preds)
print(f'Overall ROC-AUC Score: {roc_auc}')

In [None]:
# Prepare the submission file
submission_df = pd.DataFrame({'id': test_df.index, 'loan_status': test_preds})
submission_file_path = '/content/loan_approval_submission_optimized.csv'
submission_df.to_csv(submission_file_path, index=False)

In [26]:
# Display the first few rows of the submission file
print(submission_df.head())

      id  loan_status
0  58645     0.993779
1  58646     0.014829
2  58647     0.607898
3  58648     0.010350
4  58649     0.062502
      id  loan_status
0  58645     0.993779
1  58646     0.014829
2  58647     0.607898
3  58648     0.010350
4  58649     0.062502
