In [265]:
# Importing libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [267]:
# Loading the dataset
data = pd.read_csv("dataset/loan_data.csv")
original_data = data.copy()

In [268]:
# Analysing the dataset
print(f"DATA PREVIEW: \n{data.head()}\n")
print(f"MISSING VALUES: \n{data.isnull().sum()}\n")
print(f"DATATYPES: \n{data.dtypes}\n")
print(f"CLASS BALANCE: \n{data["LoanApprovalStatus"].value_counts(normalize = True)}\n")

DATA PREVIEW: 
       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               4         15.23        36      0.44   Bachelor's   
1               1          4.81        60      0.68     Master's   
2               3         21.17        24      0.31     Master's   
3               3          7.07        24      0.23  High School   
4               4          6.51        48      0.73   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0      Full-time      Divorced         Yes           Yes       Other   
1    

### One-Hot Encoding

In [272]:
# Identify categorical columns
cat_columns = data.select_dtypes(include=['object']).columns.tolist()

# Exclude LoanID from categorical columns before encoding
cat_columns = [col for col in cat_columns if col != "LoanID"]

print(f"Categorical columns: {cat_columns}")

Categorical columns: ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner', 'LoanApprovalStatus']


In [274]:
# One-Hot Encoding for categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_cat_features = encoder.fit_transform(data[cat_columns])

# Convert encoded categorical features into a DataFrame
encoded_df = pd.DataFrame(encoded_cat_features, columns=encoder.get_feature_names_out(cat_columns))

# Drop original categorical columns and concatenate encoded data
data = data.drop(columns=cat_columns).reset_index(drop=True)
data = pd.concat([data, encoded_df], axis=1)

print(f"✅ Categorical encoding complete! Updated data preview:\n{data.head()}")

✅ Categorical encoding complete! Updated data preview:
       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio  ...  HasDependents_Yes  \
0               4         15.23        36      0.44  ...                1.0   
1               1          4.81        60      0.68  ...                0.0   
2               3         21.17        24      0.31  ...                1.0   
3               3          7.07        24      0.23  ...                0.0   
4               4          6.51        48      0.73  ...                1.0   

   LoanPurpose_Auto  LoanPurpose_Business  

### Feature Scaling

In [277]:
# Identify numerical columns
num_columns = original_data.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(f"Numerical Columns before scaling: {num_columns}")

# Initialize StandardScaler
scaler = StandardScaler()

# Apply StandardScaler only to numerical columns
data[num_columns] = scaler.fit_transform(data[num_columns])

print("✅ Feature scaling complete! Updated data preview:")
print(data.head())


Numerical Columns before scaling: ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Default']
✅ Feature scaling complete! Updated data preview:
       LoanID       Age    Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96  0.833990  0.089693   -1.086833    -0.341492        0.590533   
1  HPSK72WA7R  1.701221 -0.823021   -0.044309    -0.731666       -1.285731   
2  C1OZ6DPJ8Y  0.166888  0.043854    0.022715    -0.775718       -0.968209   
3  V2KKSFM3UN -0.767053 -1.303452   -1.168538     1.061875       -1.718715   
4  EY08JDHTZP  1.100830 -1.592855   -1.671921     0.369631       -1.487790   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio  ...  HasDependents_Yes  \
0        1.341937      0.261771 -0.001526 -0.260753  ...                1.0   
1       -1.343791     -1.308350  1.412793  0.778585  ...                0.0   
2        0.446694      1.156831 -0.708685 -0.823728  ...                1.0 

### Splitting Data

In [292]:
# 🚀 Step 7: Drop LoanID (if it exists)
if "LoanID" in data.columns:
    data = data.drop(columns=["LoanID"])

# 🚀 Step 8: Extract Target Column
y = data["LoanApprovalStatus_Approved"]  # Use the encoded column
X = data.drop(columns=["LoanApprovalStatus_Approved", "LoanApprovalStatus_Rejected"])  # Drop both encoded target columns

# 🚀 Step 9: Split into Train, Validation, and Test Sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"✅ Data splitting complete! Training samples: {X_train.shape[0]}, Validation samples: {X_val.shape[0]}, Test samples: {X_test.shape[0]}")


✅ Data splitting complete! Training samples: 178742, Validation samples: 38302, Test samples: 38303


In [217]:
pip install --upgrade xgboost

Note: you may need to restart the kernel to use updated packages.


In [238]:
# XGBoost Model
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 🚀 Step 1: Feature Scaling (XGBoost handles it well, but for consistency)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# 🚀 Step 2: Train the XGBoost Model
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",  # Since it's a binary classification problem
    eval_metric="logloss",  # Helps prevent overfitting
    learning_rate=0.1, 
    n_estimators=100, 
    max_depth=5,
    subsample=0.8, 
    colsample_bytree=0.8,
    random_state=42
)

xgb_model.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], early_stopping_rounds=10, verbose=True)

# 🚀 Step 3: Model Evaluation
y_pred = xgb_model.predict(X_test_scaled)

# 🚀 Step 4: Print Performance Metrics
print(f"✅ Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"✅ Precision: {precision_score(y_test, y_pred):.4f}")
print(f"✅ Recall: {recall_score(y_test, y_pred):.4f}")
print(f"✅ F1 Score: {f1_score(y_test, y_pred):.4f}")


TypeError: XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [143]:
### Training various models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=50, min_samples_leaf=25),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=50, min_samples_leaf=25),
    "AdaBoost": AdaBoostClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

# Train and evaluate models
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

# Print results
print("\nModel Performance:")
for model, metrics in results.items():
    print(f"\n{model}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


Training Logistic Regression...
Training Decision Tree...
Training Random Forest...
Training AdaBoost...




Training XGBoost...


Parameters: { "use_label_encoder" } are not used.




Model Performance:

Logistic Regression:
Accuracy: 0.9422
Precision: 0.9242
Recall: 0.8915
F1 Score: 0.9075

Decision Tree:
Accuracy: 0.9986
Precision: 0.9976
Recall: 0.9978
F1 Score: 0.9977

Random Forest:
Accuracy: 0.9982
Precision: 0.9990
Recall: 0.9953
F1 Score: 0.9971

AdaBoost:
Accuracy: 0.9628
Precision: 0.9290
Recall: 0.9562
F1 Score: 0.9424

XGBoost:
Accuracy: 0.9993
Precision: 0.9987
Recall: 0.9990
F1 Score: 0.9988


In [147]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define models with tuned hyperparameters
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, penalty='l2'),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=50, min_samples_leaf=25),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=8, min_samples_split=100, min_samples_leaf=50),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, learning_rate=0.1),
    "XGBoost": XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, subsample=0.7, colsample_bytree=0.7, reg_lambda=10, reg_alpha=2, eval_metric="logloss")
}

# Train and evaluate models
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    
    # Perform 5-fold cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
    
    # Fit model on full training set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results[name] = {
        "Test Accuracy": accuracy,
        "CV Accuracy": cv_scores.mean(),
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

# Print results
print("\n📊 Model Performance:")
for model, metrics in results.items():
    print(f"\n🔹 {model}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

# Feature Importance for Tree-Based Models
print("\n📌 Feature Importances:")
for name, model in models.items():
    if hasattr(model, "feature_importances_"):
        importance = model.feature_importances_
        top_features = sorted(zip(X_train.columns, importance), key=lambda x: x[1], reverse=True)[:10]  # Top 10 features
        print(f"\n🔹 {name}:")
        for feature, score in top_features:
            print(f"{feature}: {score:.4f}")


Training Logistic Regression...
Training Decision Tree...
Training Random Forest...
Training AdaBoost...




Training XGBoost...

📊 Model Performance:

🔹 Logistic Regression:
Test Accuracy: 0.9422
CV Accuracy: 0.9422
Precision: 0.9242
Recall: 0.8915
F1 Score: 0.9075

🔹 Decision Tree:
Test Accuracy: 0.9938
CV Accuracy: 0.9943
Precision: 0.9891
Recall: 0.9913
F1 Score: 0.9902

🔹 Random Forest:
Test Accuracy: 0.9802
CV Accuracy: 0.9817
Precision: 0.9993
Recall: 0.9383
F1 Score: 0.9679

🔹 AdaBoost:
Test Accuracy: 0.9610
CV Accuracy: 0.9599
Precision: 0.9184
Recall: 0.9630
F1 Score: 0.9402

🔹 XGBoost:
Test Accuracy: 0.9983
CV Accuracy: 0.9984
Precision: 0.9972
Recall: 0.9975
F1 Score: 0.9974

📌 Feature Importances:

🔹 Decision Tree:
LoanPurpose_Other: 0.5378
DTIRatio: 0.2178
CreditScore: 0.1197
LoanPurpose_Education: 0.0350
LoanPurpose_Auto: 0.0329
LoanPurpose_Business: 0.0155
LoanPurpose_Home: 0.0133
LoanAmount: 0.0087
Default: 0.0070
LoanTerm: 0.0061

🔹 Random Forest:
LoanPurpose_Other: 0.4856
CreditScore: 0.1554
DTIRatio: 0.1337
LoanPurpose_Home: 0.0845
LoanPurpose_Business: 0.0577
LoanPurpose_