# Loading The Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Read the loan records; the path is relative to the current working directory
df = pd.read_csv('loan_data.csv')

# Sanity check: frame dimensions, then a preview of the leading rows
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:", df.head(), sep="\n")


Dataset shape: (45000, 14)

First 5 rows:
   person_age person_gender person_education  person_income  person_emp_exp  \
0        22.0        female           Master        71948.0               0   
1        21.0        female      High School        12282.0               0   
2        25.0        female      High School        12438.0               3   
3        23.0        female         Bachelor        79753.0               0   
4        24.0          male           Master        66135.0               1   

  person_home_ownership  loan_amnt loan_intent  loan_int_rate  \
0                  RENT    35000.0    PERSONAL          16.02   
1                   OWN     1000.0   EDUCATION          11.14   
2              MORTGAGE     5500.0     MEDICAL          12.87   
3                  RENT    35000.0     MEDICAL          15.23   
4                  RENT    35000.0     MEDICAL          14.27   

   loan_percent_income  cb_person_cred_hist_length  credit_score  \
0                 0.49  

# Preprocessing

## Checking for Null values

In [None]:
# Count the missing (NaN/None) entries in every column
print("Missing values per column:", df.isna().sum(), sep="\n")

Missing values per column:
person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64


There are no missing values in any column, so we can continue.

## Transforming Categorical features to Numerical features

### Encoding Strategy
1. **Binary Encoding:**  
   - `person_gender`: Convert `female` → 0, `male` → 1  
   - `previous_loan_defaults_on_file`: Convert `No` → 0, `Yes` → 1  

2. **Ordinal Encoding:**  
   - `person_education`: Since education levels have an order, we can map them as:  
     - `High School` → 0  
     - `Bachelor` → 1  
     - `Master` → 2  
     - `PhD` → 3  

3. **One-Hot Encoding:**  
   - `person_home_ownership` (categorical with no order): Convert `RENT`, `OWN`, and `MORTGAGE` into separate binary columns  
   - `loan_intent` (categorical with no order): Convert `PERSONAL`, `EDUCATION`, `MEDICAL`, etc., into separate binary columns  


In [None]:
# Binary Encoding
df["person_gender"] = df["person_gender"].map({"female": 0, "male": 1})
df["previous_loan_defaults_on_file"] = df["previous_loan_defaults_on_file"].map({"No": 0, "Yes": 1})

# Ordinal Encoding
education_mapping = {"High School": 0, "Bachelor": 1, "Master": 2, "PhD": 3}
df["person_education"] = df["person_education"].map(education_mapping)

# One-Hot Encoding (dtype=int yields 0/1 columns instead of booleans)
df = pd.get_dummies(df, columns=["person_home_ownership", "loan_intent"], dtype=int)

# .map() returns NaN for any label missing from the dictionaries above. Keep the
# fallback of 0 for those rows, but cast only the encoded columns and the target
# to int: casting the whole frame would truncate continuous features such as
# loan_int_rate (16.02 -> 16) and loan_percent_income (0.49 -> 0).
# NOTE(review): the 0 fallback makes an unmapped education label indistinguishable
# from "High School" - confirm education_mapping covers every value in the data.
integer_columns = ["person_gender", "previous_loan_defaults_on_file", "person_education", "loan_status"]
df[integer_columns] = df[integer_columns].fillna(0)
df = df.astype({column: int for column in integer_columns})

# Display the transformed dataset
print(df.head())

   person_age  person_gender  person_education  person_income  person_emp_exp  \
0          22              0                 2          71948               0   
1          21              0                 0          12282               0   
2          25              0                 0          12438               3   
3          23              0                 1          79753               0   
4          24              1                 2          66135               1   

   loan_amnt  loan_int_rate  loan_percent_income  cb_person_cred_hist_length  \
0      35000             16                    0                           3   
1       1000             11                    0                           2   
2       5500             12                    0                           3   
3      35000             15                    0                           2   
4      35000             14                    0                           4   

   credit_score  ...  person_hom

## Normalization And Standardization

For a Decision Tree, neither standardization nor normalization is required because:  

✅ **Decision Trees are not affected by feature scaling.** They split data based on feature values, not distances.  
✅ **They handle different feature scales naturally.** Unlike SVM or k-NN, decision trees don’t rely on distance-based calculations.  


# Decision Tree Implementation

In [None]:
import numpy as np
from collections import Counter

class DecisionTree:
    """Binary-split decision tree classifier implemented with NumPy.

    The fitted tree is stored in ``self.tree`` as nested dictionaries. An
    internal node is ``{'feature', 'threshold', 'left', 'right'}`` and a leaf
    is ``{'class'}``. Samples with ``x[feature] <= threshold`` follow the
    ``'left'`` branch, all others follow ``'right'``.

    Class labels must be non-negative integers because the impurity measures
    count them with ``np.bincount``.
    """

    def __init__(self, criterion='entropy', max_depth=None, min_samples_split=2):
        """Store the hyperparameters; the tree itself is built by ``fit``.

        Args:
            criterion: Impurity measure, ``'entropy'`` or ``'gini'``.
            max_depth: Maximum depth of the tree, or ``None`` for no limit.
            min_samples_split: Nodes holding fewer samples than this become leaves.
        """
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def calculate_entropy(self, y):
        """Return the Shannon entropy (base 2) of the integer labels ``y``."""
        counts = np.bincount(y)
        probabilities = counts / len(y)
        # Classes with zero probability are dropped so log2(0) is never evaluated.
        present = probabilities[probabilities > 0]
        return -np.sum(present * np.log2(present))

    def calculate_gini(self, y):
        """Return the Gini impurity of the integer labels ``y``."""
        counts = np.bincount(y)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def calculate_impurity(self, y, criterion):
        """Dispatch to the impurity measure named by ``criterion``.

        Raises:
            ValueError: If ``criterion`` is neither ``'entropy'`` nor ``'gini'``.
        """
        if criterion == 'entropy':
            return self.calculate_entropy(y)
        elif criterion == 'gini':
            return self.calculate_gini(y)
        else:
            raise ValueError("Invalid criterion")

    def information_gain(self, X, y, feature, criterion='entropy'):
        """Find the threshold on one feature that maximises impurity reduction.

        Candidate thresholds are the midpoints between consecutive distinct
        values of the feature column.

        Args:
            X: 2-D feature array.
            y: Integer labels aligned with the rows of ``X``.
            feature: Column index to evaluate.
            criterion: Impurity measure passed to ``calculate_impurity``.

        Returns:
            ``(best_threshold, gain)``, or ``(None, 0)`` when the column holds
            a single distinct value or no candidate separates the samples.
        """
        X_feature = X[:, feature]
        sorted_unique = np.unique(X_feature)
        if len(sorted_unique) <= 1:
            return None, 0

        thresholds = (sorted_unique[:-1] + sorted_unique[1:]) / 2
        max_gain = -np.inf
        best_threshold = None

        parent_impurity = self.calculate_impurity(y, criterion)
        n_samples = len(y)

        for threshold in thresholds:
            left_indices = X_feature <= threshold
            right_indices = X_feature > threshold
            y_left = y[left_indices]
            y_right = y[right_indices]

            # A midpoint can round onto one of its neighbours; skip it if one side ends up empty.
            if len(y_left) == 0 or len(y_right) == 0:
                continue

            left_impurity = self.calculate_impurity(y_left, criterion)
            right_impurity = self.calculate_impurity(y_right, criterion)
            # Children are weighted by their share of the parent's samples.
            child_impurity = (len(y_left) * left_impurity + len(y_right) * right_impurity) / n_samples
            gain = parent_impurity - child_impurity

            if gain > max_gain:
                max_gain = gain
                best_threshold = threshold

        if best_threshold is None:
            return None, 0
        return best_threshold, max_gain

    def best_split(self, X, y, criterion='entropy'):
        """Return the ``(feature, threshold)`` pair with the highest gain.

        Returns ``(None, None)`` when no feature offers a usable threshold.
        """
        best_feature = None
        best_threshold = None
        best_gain = -np.inf

        for feature in range(X.shape[1]):
            threshold, gain = self.information_gain(X, y, feature, criterion)
            # information_gain reports an unsplittable feature as (None, 0). Without
            # this guard its gain of 0 beats the initial -inf, the feature is selected
            # with threshold None, and build_tree fails comparing values against None.
            if threshold is None:
                continue
            if gain > best_gain:
                best_gain = gain
                best_feature = feature
                best_threshold = threshold

        return best_feature, best_threshold

    def is_pure(self, y):
        """Return True when every label in ``y`` is the same."""
        return len(np.unique(y)) == 1

    def create_leaf_node(self, y):
        """Build a leaf that predicts the most frequent label in ``y``.

        Ties go to the label that appears first in ``y``.
        """
        counts = Counter(y)
        majority_class = max(counts, key=counts.get)
        return {'class': majority_class}

    def build_tree(self, X, y, max_depth, min_samples_split, depth=0):
        """Recursively grow the tree and return its root node.

        Args:
            X: 2-D feature array for the samples reaching this node.
            y: Integer labels aligned with the rows of ``X``.
            max_depth: Depth at which growth stops, or ``None`` for no limit.
            min_samples_split: Nodes with fewer samples than this become leaves.
            depth: Depth of the current node (0 for the root).

        Returns:
            A leaf ``{'class': ...}`` or an internal node with ``'feature'``,
            ``'threshold'``, ``'left'`` and ``'right'`` entries.
        """
        # The depth limit is read from the argument only, so the None check and
        # the comparison always refer to the same value.
        depth_exhausted = max_depth is not None and depth >= max_depth
        if depth_exhausted or len(y) < min_samples_split or self.is_pure(y):
            return self.create_leaf_node(y)

        best_feature, best_threshold = self.best_split(X, y, self.criterion)
        if best_feature is None:
            return self.create_leaf_node(y)

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        X_left, y_left = X[left_indices], y[left_indices]
        X_right, y_right = X[right_indices], y[right_indices]

        if len(y_left) == 0 or len(y_right) == 0:
            return self.create_leaf_node(y)

        left_subtree = self.build_tree(X_left, y_left, max_depth, min_samples_split, depth + 1)
        right_subtree = self.build_tree(X_right, y_right, max_depth, min_samples_split, depth + 1)

        return {
            'feature': best_feature,
            'threshold': best_threshold,
            'left': left_subtree,
            'right': right_subtree
        }

    def fit(self, X, y):
        """Grow the tree from training data and store it in ``self.tree``.

        Args:
            X: 2-D array-like of features, one row per sample.
            y: 1-D array-like of non-negative integer class labels.
        """
        # Coerce to arrays so the column slicing and boolean masks in build_tree
        # work for any array-like input.
        X = np.asarray(X)
        y = np.asarray(y)
        self.tree = self.build_tree(X, y, self.max_depth, self.min_samples_split)

    def _predict_tree(self, x, node):
        """Walk from ``node`` down to a leaf for one sample and return its class."""
        while 'class' not in node:
            branch = 'left' if x[node['feature']] <= node['threshold'] else 'right'
            node = node[branch]
        return node['class']

    def predict(self, X):
        """Return an array with the predicted class for every row of ``X``."""
        return np.array([self._predict_tree(x, self.tree) for x in np.asarray(X)])

    def evaluate(self, y_true, y_pred):
        """Compute accuracy and macro-averaged recall, precision and F1.

        The class set is the union of the labels in ``y_true`` and ``y_pred``,
        so a predicted label that never occurs in ``y_true`` is counted as an
        error rather than raising. A per-class score whose denominator is zero
        is taken as 0.0.

        Args:
            y_true: 1-D array-like of ground-truth labels.
            y_pred: 1-D array-like of predicted labels, same length as ``y_true``.

        Returns:
            Dict with keys ``'accuracy'``, ``'recall'``, ``'precision'`` and
            ``'f1_score'``; the last three are unweighted means over classes.
        """
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)

        classes = np.unique(np.concatenate([y_true, y_pred]))
        n_classes = len(classes)

        # classes is sorted and contains every label, so searchsorted returns
        # each label's exact position. Rows are true classes, columns predicted.
        true_idx = np.searchsorted(classes, y_true)
        pred_idx = np.searchsorted(classes, y_pred)
        cm = np.zeros((n_classes, n_classes), dtype=int)
        np.add.at(cm, (true_idx, pred_idx), 1)

        accuracy = np.trace(cm) / np.sum(cm)

        recalls = []
        precisions = []
        f1_scores = []

        for i in range(n_classes):
            tp = cm[i, i]
            fp = np.sum(cm[:, i]) - tp
            fn = np.sum(cm[i, :]) - tp

            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

            recalls.append(recall)
            precisions.append(precision)
            f1_scores.append(f1)

        return {
            'accuracy': accuracy,
            'recall': np.mean(recalls),
            'precision': np.mean(precisions),
            'f1_score': np.mean(f1_scores)
        }

# Training The Model

## Splitting Data

In [None]:
# Target vector: the loan_status column
y = df['loan_status']
# Feature matrix: every remaining column
X = df.drop('loan_status', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.2,
    random_state=42,
)

## Training The Model

In [None]:
# Fit the from-scratch DecisionTree on the training split:
# entropy criterion, depth capped at 5, nodes with fewer than 2 samples become leaves
tree = DecisionTree(
    criterion='entropy',
    max_depth=5,
    min_samples_split=2,
)
tree.fit(X_train, y_train)


## Making Predictions And Evaluating The Model

In [None]:
# Predict a label for every held-out row
y_pred = tree.predict(X_test)

# Score the predictions against the true test labels
metrics = tree.evaluate(y_test, y_pred)

# Report each metric rounded to four decimal places
print("Model Evaluation Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

Model Evaluation Metrics:
accuracy: 0.8972
recall: 0.8309
precision: 0.8628
f1_score: 0.8452


# Using DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the DecisionTreeClassifier
sklearn_tree = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=2, random_state=42)

# Train the model
sklearn_tree.fit(X_train, y_train)

# Make predictions
y_pred_sklearn = sklearn_tree.predict(X_test)

# Evaluate the model.
# average='macro' (unweighted mean of the per-class scores) matches what
# DecisionTree.evaluate reports. The default, average='binary', scores only the
# positive class, which makes the two models' numbers not comparable.
accuracy = accuracy_score(y_test, y_pred_sklearn)
precision = precision_score(y_test, y_pred_sklearn, average='macro')
recall = recall_score(y_test, y_pred_sklearn, average='macro')
f1 = f1_score(y_test, y_pred_sklearn, average='macro')

# Print evaluation metrics
print("Sklearn DecisionTreeClassifier Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Sklearn DecisionTreeClassifier Evaluation Metrics:
Accuracy: 0.8972
Precision: 0.8060
Recall: 0.7109
F1 Score: 0.7555


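Both models reach the same accuracy (0.8972). My precision, recall, and F1 score come out higher than the ones printed for sklearn's `DecisionTreeClassifier` above, but this is most likely an averaging difference rather than a better model: my `evaluate` method macro-averages the per-class scores, whereas `precision_score`, `recall_score`, and `f1_score` report only the positive class unless an `average` argument is passed.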