In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the pre-cleaned training and test tables from the working directory.
df, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('cleaned_train.csv', 'cleaned_test.csv')
)

In [3]:
# Preview the first rows of the training frame to check the loaded columns
df.head()

Unnamed: 0,Month,Age,Profession,Income_Annual,Base_Salary_PerMonth,Total_Bank_Accounts,Total_Credit_Cards,Rate_Of_Interest,Total_Current_Loans,Delay_from_due_date,...,Current_Debt_Outstanding,Ratio_Credit_Utilization,Credit_History_Age,Payment_of_Min_Amount,Per_Month_EMI,Monthly_Investment,Payment_Behaviour,Monthly_Balance,Credit_Score,Loan_Count
0,7,51,11,101583.48,8648.29,5,7,10,4,8,...,50.93,34.462154,289,0,190.811017,630.015789,3,314.002193,1,4
1,1,23,14,101926.95,8635.9125,4,4,9,1,13,...,1058.0,39.693812,245,0,70.587681,662.803927,4,410.199642,1,1
2,2,49,14,158871.12,12962.26,0,4,8,1,8,...,576.48,39.367225,228,0,86.90586,746.805985,4,742.514154,1,1
3,6,40,3,60379.28,4804.606667,5,6,18,3,15,...,725.39,29.061701,205,0,90.906385,166.418658,1,473.135623,1,3
4,5,17,0,50050.83,4085.9025,9,10,20,5,28,...,3419.1,30.386321,54,1,190.44506,56.789441,0,401.355749,0,5


In [4]:
# Predictors are every training column except the label; the label is Credit_Score.
X = df.drop(columns=['Credit_Score'])
y = df.loc[:, 'Credit_Score']
# The test frame is used for prediction with its ID column removed.
X_test = df_test.drop(columns=['ID'])

In [5]:

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Optuna objective: each trial fits one XGBoost model and scores it on the validation split
def objective(trial):
    """Fit an XGBClassifier with trial-sampled hyperparameters and score it.

    Reads the module-level X_train / y_train / X_val / y_val splits.

    Args:
        trial: Optuna trial used to sample every tuned hyperparameter.

    Returns:
        Accuracy of the fitted model on the validation split.
    """
    # Hyperparameters sampled by the trial
    tuned = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
    }
    # Settings held constant across all trials
    fixed = {
        'objective': 'multi:softmax',
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'random_state': 42,
    }

    classifier = XGBClassifier(**tuned, **fixed)
    classifier.fit(X_train, y_train)

    # The validation accuracy is the value the study optimizes
    val_predictions = classifier.predict(X_val)
    return accuracy_score(y_val, val_predictions)

# Create an Optuna study and optimize it.
# The sampler is seeded so that re-running the cell explores the same sequence of trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Report the best hyperparameters and the validation accuracy they achieved
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

# Refit on the entire labelled dataset using the best hyperparameters
best_params = study.best_params
final_model = XGBClassifier(**best_params, random_state=42, eval_metric='mlogloss')
final_model.fit(X, y)

# In-sample sanity check only: the model is scored on the same rows it was fitted on,
# so this is a training accuracy, not an estimate of generalization. The validation
# accuracy printed above is the held-out figure.
# (The variable keeps its original name so later cells that read it still work.)
y_pred = final_model.predict(X)
accuracy_test = accuracy_score(y, y_pred)
print("Training accuracy (in-sample):", accuracy_test)


# Predict the unlabeled test set and write the submission file
y_test = final_model.predict(X_test)
# NOTE(review): this code-to-label mapping must mirror the encoding used when
# cleaned_train.csv was produced — confirm against the cleaning step.
y_pred_test = pd.Series(y_test).map({0: 'Poor', 1: 'Standard', 2: 'Good'})
final = pd.DataFrame({'ID': df_test['ID'], 'Credit_Score': y_pred_test})
final.to_csv('xgboost.csv', index=False)

[I 2024-12-11 17:01:51,245] A new study created in memory with name: no-name-18ce5046-f9b7-4fda-b58a-480a242f635f
[I 2024-12-11 17:01:56,817] Trial 0 finished with value: 0.7564497497112053 and parameters: {'n_estimators': 253, 'max_depth': 6, 'learning_rate': 0.18541318599500997, 'subsample': 0.9944301878178303, 'colsample_bytree': 0.9791310221481373, 'gamma': 0.43322806623292665, 'min_child_weight': 2, 'reg_alpha': 0.013929420197499794, 'reg_lambda': 0.0015591348852878727, 'max_delta_step': 0}. Best is trial 0 with value: 0.7564497497112053.
[I 2024-12-11 17:02:05,322] Trial 1 finished with value: 0.7868694647670389 and parameters: {'n_estimators': 79, 'max_depth': 13, 'learning_rate': 0.12006234922715557, 'subsample': 0.9857472109239491, 'colsample_bytree': 0.939375578717521, 'gamma': 0.22196217366377113, 'min_child_weight': 9, 'reg_alpha': 8.27172541365931e-06, 'reg_lambda': 4.605148005023803e-05, 'max_delta_step': 5}. Best is trial 1 with value: 0.7868694647670389.
[I 2024-12-11 1

Best hyperparameters: {'n_estimators': 219, 'max_depth': 14, 'learning_rate': 0.08189037499540956, 'subsample': 0.8876331264394978, 'colsample_bytree': 0.5019541642648715, 'gamma': 0.10476141642744874, 'min_child_weight': 1, 'reg_alpha': 1.6206618800909855e-07, 'reg_lambda': 0.007028627811490345, 'max_delta_step': 2}
Best validation accuracy: 0.8047747400847132
Test accuracy: 1.0


In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Optuna objective: each trial fits one random forest and scores it on the validation split
def objective(trial):
    """Fit a RandomForestClassifier with trial-sampled hyperparameters and score it.

    Reads the module-level X_train / y_train / X_val / y_val splits.

    Args:
        trial: Optuna trial used to sample every tuned hyperparameter.

    Returns:
        Accuracy of the fitted model on the validation split.
    """
    # Hyperparameters sampled by the trial
    tuned = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }

    # The seed is held constant across all trials
    forest = RandomForestClassifier(**tuned, random_state=42)
    forest.fit(X_train, y_train)

    # The validation accuracy is the value the study optimizes
    val_predictions = forest.predict(X_val)
    return accuracy_score(y_val, val_predictions)

# Create an Optuna study and optimize it.
# The sampler is seeded so that re-running the cell explores the same sequence of trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Report the best hyperparameters and the validation accuracy they achieved
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

# Refit on the entire labelled dataset using the best hyperparameters
best_params = study.best_params
rand_model = RandomForestClassifier(**best_params, random_state=42)
rand_model.fit(X, y)

# In-sample sanity check only: the model is scored on the same rows it was fitted on,
# so this is a training accuracy, not an estimate of generalization. The validation
# accuracy printed above is the held-out figure.
# (The variable keeps its original name so later cells that read it still work.)
y_pred = rand_model.predict(X)
accuracy_test = accuracy_score(y, y_pred)
print("Training accuracy (in-sample):", accuracy_test)



# Predict the unlabeled test set with the random forest and write the submission file.
# The predictions must come from rand_model: using the XGBoost model here would write
# XGBoost predictions into random_forest.csv.
y_test = rand_model.predict(X_test)
# NOTE(review): this code-to-label mapping must mirror the encoding used when
# cleaned_train.csv was produced — confirm against the cleaning step.
y_pred_test = pd.Series(y_test).map({0: 'Poor', 1: 'Standard', 2: 'Good'})
final = pd.DataFrame({'ID': df_test['ID'], 'Credit_Score': y_pred_test})
final.to_csv('random_forest.csv', index=False)

[I 2024-12-11 17:16:39,555] A new study created in memory with name: no-name-76310406-8555-47e5-8ad2-c8d41cf95847
[I 2024-12-11 17:16:44,259] Trial 0 finished with value: 0.6911821332306507 and parameters: {'n_estimators': 55, 'max_depth': 8, 'max_features': 'log2', 'max_leaf_nodes': 27, 'min_samples_split': 4, 'min_samples_leaf': 3, 'bootstrap': True}. Best is trial 0 with value: 0.6911821332306507.
[I 2024-12-11 17:17:31,181] Trial 1 finished with value: 0.7088948787061995 and parameters: {'n_estimators': 115, 'max_depth': 11, 'max_features': None, 'max_leaf_nodes': 20, 'min_samples_split': 8, 'min_samples_leaf': 4, 'bootstrap': True}. Best is trial 1 with value: 0.7088948787061995.
[I 2024-12-11 17:17:51,237] Trial 2 finished with value: 0.6995892696701322 and parameters: {'n_estimators': 156, 'max_depth': 8, 'max_features': 'sqrt', 'max_leaf_nodes': 44, 'min_samples_split': 7, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 1 with value: 0.7088948787061995.
[I 2024-12-11 

Best hyperparameters: {'n_estimators': 138, 'max_depth': 9, 'max_features': None, 'max_leaf_nodes': 50, 'min_samples_split': 6, 'min_samples_leaf': 4, 'bootstrap': True}
Best validation accuracy: 0.7147991271980491
Test accuracy: 0.7144415777786335
