## Importing Datasets

In [96]:
# Importing libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [98]:
# Loading the dataset
data = pd.read_csv("dataset/loan_data.csv")
original_data = data.copy()
data["Default"] = data["Default"].astype(bool)

In [99]:
# Analysing the dataset
print(f"DATA PREVIEW: \n{data.head()}\n")

# Handling Missing Values
print(f"MISSING VALUES: \n{data.isnull().sum()}\n")
print(f"DATATYPES: \n{data.dtypes}\n")
print(f"CLASS BALANCE: \n{data["LoanApprovalStatus"].value_counts(normalize = True)}\n")

DATA PREVIEW: 
       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               4         15.23        36      0.44   Bachelor's   
1               1          4.81        60      0.68     Master's   
2               3         21.17        24      0.31     Master's   
3               3          7.07        24      0.23  High School   
4               4          6.51        48      0.73   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0      Full-time      Divorced         Yes           Yes       Other   
1    

## One-Hot Encoding

In [103]:
# Identify categorical columns (excluding numerical ones)
cat_columns = data.select_dtypes(include=['object']).columns.tolist()

# Exclude LoanID from categorical columns before encoding
cat_columns = [col for col in cat_columns if col != "LoanID"]

# Manually add 'Default' as a categorical column
cat_columns.append("Default")

print(f"Categorical columns: {cat_columns}")

Categorical columns: ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner', 'LoanApprovalStatus', 'Default']


In [105]:
# Check if any categorical columns exist before encoding
if not cat_columns:
    print("⚠️ No categorical columns found for encoding!")
else:
    # One-Hot Encoding for categorical features
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_cat_features = encoder.fit_transform(data[cat_columns])

    # Convert encoded categorical features into a DataFrame
    encoded_df = pd.DataFrame(encoded_cat_features, columns=encoder.get_feature_names_out(cat_columns))

    # Drop original categorical columns and concatenate encoded data
    data = data.drop(columns=cat_columns).reset_index(drop=True)
    data = pd.concat([data, encoded_df], axis=1)

    print(f"✅ Categorical encoding complete! Updated data preview:\n{data.head()}")

✅ Categorical encoding complete! Updated data preview:
       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio  ...  \
0               4         15.23        36      0.44  ...   
1               1          4.81        60      0.68  ...   
2               3         21.17        24      0.31  ...   
3               3          7.07        24      0.23  ...   
4               4          6.51        48      0.73  ...   

   LoanPurpose_Business  LoanPurpose_Education  LoanPurpose_Home  \
0                   0.0                    0.0               0.0   
1                   0

In [107]:
# Identify numerical columns
num_columns = original_data.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Ensure only existing numerical columns are used for scaling
num_columns = [col for col in num_columns if col in data.columns]

print(f"Numerical Columns before scaling: {num_columns}")



Numerical Columns before scaling: ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']


## Splitting Data

In [110]:
from sklearn.model_selection import train_test_split

# 🚀 Step 7: Drop LoanID (if it exists)
if "LoanID" in data.columns:
    data = data.drop(columns=["LoanID"])
    

# 🔹 Extract Target Column
y = data["LoanApprovalStatus_Approved"].astype(int)  # Convert target to numeric
X = data.drop(columns=["LoanApprovalStatus_Approved", "LoanApprovalStatus_Rejected"], errors='ignore')
X = X.drop(columns=["LoanID"], errors='ignore')  # Remove LoanID if it exists

# 🚀 Split data BEFORE applying SMOTE
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"✅ Data Split | Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")


✅ Data Split | Train: 178742, Val: 38302, Test: 38303


## SMOTE (Synthetic Minority Over-sampling Technique)

In [113]:
from imblearn.over_sampling import SMOTE

# 🚀 Apply SMOTE to only the training data
smote = SMOTE(sampling_strategy=0.7, random_state=42)  # Adjust ratio if needed
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"✅ SMOTE Applied | New Train Samples: {len(X_train_smote)} (Before: {len(X_train)})")


✅ SMOTE Applied | New Train Samples: 206454 (Before: 178742)


In [115]:
# ✅ Correct Dataset Size Comparison
print(f"Dataset size before SMOTE: {X_train.shape[0]} samples")
print(f"Dataset size after SMOTE: {X_train_smote.shape[0]} samples")
print(f"🔍 New samples added: {X_train_smote.shape[0] - X_train.shape[0]}")

# ✅ Class Distribution Before and After SMOTE
before_smote = pd.DataFrame({
    "Count": y_train.value_counts(),
    "Percentage": y_train.value_counts(normalize=True) * 100
})

after_smote = pd.DataFrame({
    "Count": y_train_smote.value_counts(),
    "Percentage": y_train_smote.value_counts(normalize=True) * 100
})

print("\n\n🔴 Before SMOTE:\n", before_smote)
print("\n✅ After SMOTE:\n", after_smote)


Dataset size before SMOTE: 178742 samples
Dataset size after SMOTE: 206454 samples
🔍 New samples added: 27712


🔴 Before SMOTE:
                               Count  Percentage
LoanApprovalStatus_Approved                    
0                            121444    67.94374
1                             57298    32.05626

✅ After SMOTE:
                               Count  Percentage
LoanApprovalStatus_Approved                    
0                            121444   58.823757
1                             85010   41.176243


## Feature Scaling

In [118]:
# 🚀 Initialize StandardScaler and Apply Scaling
scaler = StandardScaler()

# Select numerical columns
num_cols = X_train_smote.select_dtypes(include=['int64', 'float64']).columns

# Fit scaler on training data & transform
X_train_scaled = X_train_smote.copy()
X_val_scaled = X_val.copy()
X_test_scaled = X_test.copy()

X_train_scaled[num_cols] = scaler.fit_transform(X_train_smote[num_cols])
X_val_scaled[num_cols] = scaler.transform(X_val[num_cols])
X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])

print("✅ Feature Scaling Applied")

✅ Feature Scaling Applied


## Train the Model

In [121]:
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np
import joblib


# 🚀 Check class balance to adjust `scale_pos_weight`
unique, counts = np.unique(y_train, return_counts=True)
scale_pos_weight = counts[0] / counts[1] if counts[1] > 0 else 1  # Prevent division by zero

# 🚀 Optimized Hyperparameters
best_params = {
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    "random_state": 42,
    "max_depth": 2,
    "learning_rate": 0.1,
    "subsample": 0.7,
    "colsample_bytree": 0.65,
    "reg_lambda": 100,
    "reg_alpha": 50,
    "gamma": 0.5,
    "n_jobs": 4,
    "scale_pos_weight": scale_pos_weight
}

# 🚀 Train XGBClassifier (Manually Implementing Early Stopping)
xgb_model = xgb.XGBClassifier(**best_params, n_estimators=50)  # ✅ No early stopping argument

best_iteration = 0
best_score = float("-inf")
patience = 20  # Stop if no improvement for 20 rounds
wait = 0

for i in range(1, 51):  # Train up to 50 boosting rounds
    xgb_model.n_estimators = i
    xgb_model.fit(X_train_scaled, y_train_smote)  # Train with the updated estimator count
    
    # Validate the model
    y_val_pred = xgb_model.predict(X_val_scaled)
    score = accuracy_score(y_val, y_val_pred)
    
    # Check improvement
    if score > best_score:
        best_score = score
        best_iteration = i
        wait = 0  # Reset patience counter
    else:
        wait += 1  # Increase patience counter
    
    if wait >= patience:
        print(f"✅ Early stopping triggered at {best_iteration} iterations.")
        break  # Stop training

# Set the final model to best iteration
xgb_model.n_estimators = best_iteration
xgb_model.fit(X_train_scaled, y_train_smote)  # Retrain on best iteration

print(f"✅ Model trained with best iteration: {best_iteration}")


✅ Early stopping triggered at 11 iterations.
✅ Model trained with best iteration: 11


## Evaluate the Model

In [124]:
# 🚀 Make Predictions
y_pred = (xgb_model.predict(X_test_scaled) > 0.5).astype(int)

# 🚀 Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"✅ Model Accuracy on Test Data: {accuracy:.4f}")
print(f"✅ Precision: {precision:.4f}")
print(f"✅ Recall: {recall:.4f}")
print(f"✅ F1 Score: {f1:.4f}")

# 🚀 Cross-validation
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb_model, X_train_scaled, y_train, cv=cv, scoring='accuracy')

print(f"⚡ Cross-validation Accuracy: {np.mean(cv_scores):.4f}")

✅ Model Accuracy on Test Data: 0.9526
✅ Precision: 0.8966
✅ Recall: 0.9634
✅ F1 Score: 0.9288


ValueError: Found input variables with inconsistent numbers of samples: [206454, 178742]

## Save & Deploy model

In [33]:
import joblib


# Save the trained model
joblib.dump(xgb_model, "loan_model.pkl")
print("✅ Model trained and saved successfully!")

# Save the scaler
joblib.dump(scaler, "scaler.pkl")
print("✅ Scaler saved successfully!")

# Load the model and scaler for verification
model = joblib.load("loan_model.pkl")
scaler = joblib.load("scaler.pkl")

print(model)  # Should print XGBClassifier with correct parameters
print(scaler)  # Should print StandardScaler() with fitted parameters


✅ Model trained and saved successfully!
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.65, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='aucpr', feature_types=None,
              feature_weights=None, gamma=0.5, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=2,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=35,
              n_jobs=4, num_parallel_tree=None, ...)


In [27]:
# ### Training various models
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# from xgboost import XGBClassifier
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# # Define models
# models = {
#     "Logistic Regression": LogisticRegression(max_iter=1000),
#     "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=50, min_samples_leaf=25),
#     "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=50, min_samples_leaf=25),
#     "AdaBoost": AdaBoostClassifier(),
#     "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss")
# }

# # Train and evaluate models
# results = {}
# for name, model in models.items():
#     print(f"Training {name}...")
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
    
#     # Compute metrics
#     accuracy = accuracy_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred)

#     results[name] = {
#         "Accuracy": accuracy,
#         "Precision": precision,
#         "Recall": recall,
#         "F1 Score": f1
#     }

# # Print results
# print("\nModel Performance:")
# for model, metrics in results.items():
#     print(f"\n{model}:")
#     for metric, value in metrics.items():
#         print(f"{metric}: {value:.4f}")


In [35]:
print(f"DATATYPES: \n{data.dtypes}\n")

DATATYPES: 
Age                               int64
Income                            int64
LoanAmount                        int64
CreditScore                       int64
MonthsEmployed                    int64
NumCreditLines                    int64
InterestRate                    float64
LoanTerm                          int64
DTIRatio                        float64
Education_Bachelor's            float64
Education_High School           float64
Education_Master's              float64
Education_PhD                   float64
EmploymentType_Full-time        float64
EmploymentType_Part-time        float64
EmploymentType_Self-employed    float64
EmploymentType_Unemployed       float64
MaritalStatus_Divorced          float64
MaritalStatus_Married           float64
MaritalStatus_Single            float64
HasMortgage_No                  float64
HasMortgage_Yes                 float64
HasDependents_No                float64
HasDependents_Yes               float64
LoanPurpose_Auto            

In [37]:
print(X_train.columns)

Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio',
       'Education_Bachelor's', 'Education_High School', 'Education_Master's',
       'Education_PhD', 'EmploymentType_Full-time', 'EmploymentType_Part-time',
       'EmploymentType_Self-employed', 'EmploymentType_Unemployed',
       'MaritalStatus_Divorced', 'MaritalStatus_Married',
       'MaritalStatus_Single', 'HasMortgage_No', 'HasMortgage_Yes',
       'HasDependents_No', 'HasDependents_Yes', 'LoanPurpose_Auto',
       'LoanPurpose_Business', 'LoanPurpose_Education', 'LoanPurpose_Home',
       'LoanPurpose_Other', 'HasCoSigner_No', 'HasCoSigner_Yes',
       'Default_False', 'Default_True'],
      dtype='object')


In [47]:
import pickle
import pandas as pd

# Load the trained model
with open("loan_model.pkl", "rb") as f:
    model = pickle.load(f)

# Function to preprocess input
def preprocess_input(data_dict):
    df = pd.DataFrame([data_dict])  # Convert dictionary to DataFrame

    # One-hot encode categorical columns
    df_encoded = pd.get_dummies(df, columns=['Education', 'EmploymentType', 'MaritalStatus', 
                                             'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner', 'Default'])

    # Expected column order (ensure all required columns exist)
    expected_columns = [
        'Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines',
        'InterestRate', 'LoanTerm', 'DTIRatio',
        "Education_Bachelor's", 'Education_High School', "Education_Master's", 'Education_PhD',
        'EmploymentType_Full-time', 'EmploymentType_Part-time', 'EmploymentType_Self-employed', 'EmploymentType_Unemployed',
        'MaritalStatus_Divorced', 'MaritalStatus_Married', 'MaritalStatus_Single',
        'HasMortgage_No', 'HasMortgage_Yes', 'HasDependents_No', 'HasDependents_Yes',
        'LoanPurpose_Auto', 'LoanPurpose_Business', 'LoanPurpose_Education', 'LoanPurpose_Home', 'LoanPurpose_Other',
        'HasCoSigner_No', 'HasCoSigner_Yes', 'Default_No', 'Default_Yes'
    ]

    # Ensure all columns exist (missing columns are set to 0)
    for col in expected_columns:
        if col not in df_encoded.columns:
            df_encoded[col] = 0

    # Reorder columns to match training data
    df_encoded = df_encoded[expected_columns]
    
    return df_encoded.values

# Predefined "Strong Approval" Input
strong_applicant = {
    "Age": 25,                # Good age for approval
    "Income": 90298,         # High income
    "LoanAmount": 990448,      # Small loan amount relative to income
    "CreditScore": 220,       # Excellent credit score
    "MonthsEmployed": 18,     # Stable employment
    "NumCreditLines": 2,      # Good credit history
    "InterestRate": 22.72,      # Low interest rate
    "LoanTerm": 24,           # Reasonable loan term
    "DTIRatio": round(90448 / 90298, 2),  # Low debt-to-income ratio (20%)
    
    "Education": "High School",  # Higher education level
    "EmploymentType": "Unemployed",  
    "MaritalStatus": "Single",
    "HasMortgage": "Yes",
    "HasDependents": "No",
    "LoanPurpose": "Business",    # Common purpose
    "HasCoSigner": "No",     # Increases approval chances
    "Default": "Yes"           # No default history
}

# Convert 'Default' field properly
strong_applicant["Default"] = "Default_Yes" if strong_applicant["Default"].lower() == "yes" else "Default_No"

# Preprocess input
features = preprocess_input(strong_applicant)

# Make prediction
prediction = model.predict(features)[0]

# Show result
result = "Approved" if prediction == 1 else "Rejected"
print("\nLoan Approval Prediction:", result)



Loan Approval Prediction: Approved


In [188]:
print(df[["LoanApprovalStatus"]].head(10))



  LoanApprovalStatus
0           Approved
1           Approved
2           Rejected
3           Rejected
4           Rejected
5           Rejected
6           Rejected
7           Rejected
8           Approved
9           Approved
