In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the preprocessed train/test CSVs; both paths are relative to the working directory.
df = pd.read_csv('preprocessed_train.csv')
df_test = pd.read_csv('preprocessed_test.csv')

In [3]:
# Preview the first rows of the training frame as a quick sanity check of the load.
df.head()

Unnamed: 0,Month,Age,Profession,Income_Annual,Base_Salary_PerMonth,Total_Bank_Accounts,Total_Credit_Cards,Rate_Of_Interest,Total_Current_Loans,Delay_from_due_date,...,Current_Debt_Outstanding,Ratio_Credit_Utilization,Credit_History_Age,Payment_of_Min_Amount,Per_Month_EMI,Monthly_Investment,Payment_Behaviour,Monthly_Balance,Credit_Score,Loan_Count
0,7,51,11,101583.48,8648.29,5,7,10,4,8,...,50.93,34.462154,289,0,190.811017,630.015789,3,314.002193,1,4
1,1,23,14,101926.95,8635.9125,4,4,9,1,13,...,1058.0,39.693812,245,0,70.587681,662.803927,4,410.199642,1,1
2,2,49,14,158871.12,12962.26,0,4,8,1,8,...,576.48,39.367225,228,0,86.90586,746.805985,4,742.514154,1,1
3,6,40,3,60379.28,4804.606667,5,6,18,3,15,...,725.39,29.061701,205,0,90.906385,166.418658,1,473.135623,1,3
4,5,17,0,50050.83,4085.9025,9,10,20,5,28,...,3419.1,30.386321,54,1,190.44506,56.789441,0,401.355749,0,5


In [4]:
# Features are every training column except the target; the target is kept separately.
X = df.drop(columns='Credit_Score')
y = df.loc[:, 'Credit_Score']
# The test frame's ID column is an identifier, not a feature, so it is dropped here.
X_test = df_test.drop(columns='ID')

In [9]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def objective(trial):
    """Optuna objective: fit an XGBClassifier with sampled hyperparameters.

    Hyperparameters are drawn from ``trial``, the model is fitted on the
    module-level ``X_train`` / ``y_train`` split, and it is scored on
    ``X_val`` / ``y_val``.

    Args:
        trial: The Optuna trial used to sample each hyperparameter.

    Returns:
        The accuracy of the fitted model on the validation split (the study
        is expected to maximize this value).
    """
    # Values explored by the search (sampled in the same order on every trial).
    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        # Regularization strengths span many orders of magnitude, hence log scale.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
    }

    # Settings held constant across all trials.
    fixed = {
        'objective': 'multi:softmax',
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'random_state': 42,
    }

    classifier = XGBClassifier(**searched, **fixed)
    classifier.fit(X_train, y_train)

    # Validation accuracy is the quantity the study optimizes.
    return accuracy_score(y_val, classifier.predict(X_val))

# Create an Optuna study that maximizes the value returned by `objective`.
# The sampler is seeded explicitly: the default sampler is unseeded, so every
# run of this cell would otherwise explore a different sequence of trials and
# report different "best" hyperparameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Report the best trial found by the search
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

# Refit on the entire training dataset using the best hyperparameters
best_params = study.best_params
final_model = XGBClassifier(**best_params, random_state=42, eval_metric='mlogloss')
final_model.fit(X, y)

# Sanity check only: the model is scored on the very rows it was just fitted
# on, so this is a *training* accuracy and says nothing about unseen data.
# The held-out estimate is the validation accuracy in `study.best_value`.
y_pred = final_model.predict(X)
accuracy_train = accuracy_score(y, y_pred)  # scikit-learn order is (y_true, y_pred)
accuracy_test = accuracy_train  # old name kept so any cell still reading it keeps working
print("Training accuracy:", accuracy_train)

[I 2024-11-15 22:47:31,817] A new study created in memory with name: no-name-dfbbdb50-fa0c-44e5-a7a1-415203f15a95


[I 2024-11-15 22:47:32,270] Trial 0 finished with value: 0.7053125 and parameters: {'n_estimators': 59, 'max_depth': 3, 'learning_rate': 0.14864747928682737, 'subsample': 0.6440862624173771, 'colsample_bytree': 0.8748804488329116, 'gamma': 0.4087491570239448, 'min_child_weight': 8, 'reg_alpha': 1.420927447222348e-06, 'reg_lambda': 0.0030833649240681214, 'max_delta_step': 2}. Best is trial 0 with value: 0.7053125.
[I 2024-11-15 22:48:06,055] Trial 1 finished with value: 0.7996875 and parameters: {'n_estimators': 240, 'max_depth': 15, 'learning_rate': 0.03166882563981959, 'subsample': 0.881649445192245, 'colsample_bytree': 0.5693824118721499, 'gamma': 0.02950203609165175, 'min_child_weight': 2, 'reg_alpha': 0.15242063508873677, 'reg_lambda': 1.8905149113002044e-08, 'max_delta_step': 0}. Best is trial 1 with value: 0.7996875.
[I 2024-11-15 22:48:09,709] Trial 2 finished with value: 0.786125 and parameters: {'n_estimators': 276, 'max_depth': 11, 'learning_rate': 0.24308988417895322, 'subsa

Best hyperparameters: {'n_estimators': 206, 'max_depth': 15, 'learning_rate': 0.18078542173338058, 'subsample': 0.9032862786682134, 'colsample_bytree': 0.7628444050995105, 'gamma': 0.021389631401061815, 'min_child_weight': 1, 'reg_alpha': 0.44000513385084034, 'reg_lambda': 5.125995786666999e-08, 'max_delta_step': 4}
Best validation accuracy: 0.803125
Test accuracy: 1.0


In [10]:
# NOTE: despite its name, `y_test` holds the model's predictions for X_test, not ground-truth labels.
y_test = final_model.predict(X_test)

In [11]:
# Translate the predicted class codes into their label names, then pair each
# prediction with the ID from the test frame.
class_labels = {0: 'Poor', 1: 'Standard', 2: 'Good'}
y_pred_test = pd.Series(y_test).map(class_labels)
final = pd.DataFrame(
    {
        'ID': df_test['ID'],
        'Credit_Score': y_pred_test,
    }
)
final.head()

Unnamed: 0,ID,Credit_Score
0,0x2145,Standard
1,0x7d59,Poor
2,0xe753,Poor
3,0x19813,Standard
4,0x1dc5e,Standard


In [12]:
# Write the ID / Credit_Score table to CSV; index=False omits the DataFrame index column.
final.to_csv('final_submission.csv', index=False)