# Decision Tree Implementation using `loan.csv`

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the loan data from "loan.csv" (path is relative to the kernel's working
# directory) into the DataFrame `df` used by the cells below.
df = pd.read_csv("loan.csv")

# Preview the first rows; as the last expression in the cell, this is the
# value the notebook renders as the cell output.
df.head()


Unnamed: 0,Income,Loan Amount,Default
0,15,8,No
1,30,8,No
2,5,9,Yes
3,22,10,No
4,33,12,No


In [3]:
# Report how many null entries each column contains
print("Missing values:\n", df.isna().sum())

# Keep only fully populated rows: any row with at least one null is discarded
df = df.dropna(axis=0, how="any")


Missing values:
 Income         0
Loan Amount    0
Default        0
dtype: int64


In [4]:
# Encode categorical (object-dtype) columns as integer codes. This includes
# the 'Default' target, which is why it shows up as 0/1 afterwards.
# sort=True numbers the categories in sorted order, so the mapping depends only
# on which categories exist and not on which one happens to appear first in
# the file (the default numbers them by order of first appearance).
for col in df.select_dtypes(include='object').columns:
    df[col] = pd.factorize(df[col], sort=True)[0]

df.head()


Unnamed: 0,Income,Loan Amount,Default
0,15,8,0
1,30,8,0
2,5,9,1
3,22,10,0
4,33,12,0


In [8]:
# 'Default' is taken to be the target: it becomes the label vector y, and
# every remaining column is used as a feature in X
y = df['Default']
X = df.drop(columns=['Default'])


In [9]:
from sklearn.model_selection import train_test_split

# X and y come from the previous cell; they are not rebuilt here so there is a
# single definition to keep in sync.
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the reported accuracy, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
class Node:
    """One node of a binary decision tree.

    An internal node carries a split rule (``feature`` and ``threshold``) and
    its two children (``left`` and ``right``); a leaf carries only the
    predicted label in ``value``. Every attribute defaults to ``None``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule used to route a sample at this node.
        self.feature, self.threshold = feature, threshold
        # Child subtrees on either side of the split.
        self.left, self.right = left, right
        # Predicted label; left as None on internal nodes.
        self.value = value

def gini_index(y):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_c ** 2)``, where ``p_c`` is the fraction of
    samples that carry class ``c``.

    Parameters
    ----------
    y : array-like
        Class labels; anything ``np.asarray`` accepts (NumPy array, pandas
        Series, plain list).

    Returns
    -------
    float
        ``0.0`` when ``y`` is empty or holds a single class; larger values
        mean the labels are more evenly mixed.
    """
    labels = np.asarray(y)
    n_samples = len(labels)
    if n_samples == 0:
        # An empty set has nothing to be impure about. Without this guard the
        # accumulator below would be returned untouched, i.e. 1.0.
        return 0.0

    # One pass yields every class count, instead of rescanning y per class.
    _, counts = np.unique(labels, return_counts=True)

    gini = 1.0
    for count in counts:
        gini -= (count / n_samples) ** 2
    return gini

def best_split(X, y):
    """Find the feature/threshold pair that gives the largest Gini gain.

    Every distinct value of every column of ``X`` is tried as a threshold:
    rows whose value is ``<= threshold`` form the left group, rows whose value
    is ``> threshold`` form the right group. Candidates that leave either
    group empty are skipped.

    Parameters
    ----------
    X : table of features exposing ``.columns`` and column lookup by name
        (used in this notebook as a pandas DataFrame).
    y : labels for the rows of ``X``; must support boolean-mask indexing with
        the masks built from ``X``'s columns.

    Returns
    -------
    tuple
        ``(feature, threshold)`` for the best split, or ``(None, None)`` when
        no candidate achieves a strictly positive gain.
    """
    impurity_before = gini_index(y)
    total = len(y)

    winner = (None, None)
    top_gain = 0

    for column in X.columns:
        values = X[column]
        for cut in np.unique(values):
            goes_left = values <= cut
            goes_right = values > cut
            y_left, y_right = y[goes_left], y[goes_right]

            n_left, n_right = len(y_left), len(y_right)
            if n_left == 0 or n_right == 0:
                # A one-sided split separates nothing.
                continue

            # Size-weighted impurity of the two children.
            impurity_after = n_left / total * gini_index(y_left) + n_right / total * gini_index(y_right)
            gain = impurity_before - impurity_after

            # Strict comparison: the first candidate reaching a gain wins ties.
            if gain > top_gain:
                top_gain = gain
                winner = (column, cut)

    return winner


In [12]:
def build_tree(X, y, depth=0, max_depth=5):
    """Recursively grow a binary decision tree and return its root ``Node``.

    Growth stops, and a leaf holding the majority label is returned, when the
    labels are all identical, when ``depth`` has reached ``max_depth``, or
    when ``best_split`` finds no split to make.

    Parameters
    ----------
    X : feature table for the samples at this node (a pandas DataFrame in
        this notebook); it is indexed by column name and by boolean mask.
    y : labels for the rows of ``X``, indexable by the same boolean masks.
    depth : int, default 0
        Depth of the node being built; incremented on each recursive call.
    max_depth : int, default 5
        Depth at which a leaf is forced.

    Returns
    -------
    Node
        A leaf (``value`` set) or an internal node (``feature``,
        ``threshold``, ``left`` and ``right`` set).

    Raises
    ------
    ValueError
        If ``y`` contains no samples.
    """
    labels, counts = np.unique(y, return_counts=True)
    if len(labels) == 0:
        raise ValueError("build_tree requires at least one sample in y")

    # Majority vote. np.unique returns the labels sorted and argmax picks the
    # first maximum, so ties go to the smallest label. Unlike np.bincount,
    # this is not restricted to non-negative integer labels.
    majority = labels[counts.argmax()]

    if len(labels) == 1 or depth >= max_depth:
        return Node(value=majority)

    feature, threshold = best_split(X, y)
    if feature is None:
        # No split improves on the current impurity.
        return Node(value=majority)

    left_idx = X[feature] <= threshold
    right_idx = X[feature] > threshold

    left = build_tree(X[left_idx], y[left_idx], depth + 1, max_depth)
    right = build_tree(X[right_idx], y[right_idx], depth + 1, max_depth)
    return Node(feature=feature, threshold=threshold, left=left, right=right)

def predict_one(node, x):
    """Route one sample down the tree and return the label of the leaf reached.

    Parameters
    ----------
    node : root of the (sub)tree; each node exposes ``value``, ``feature``,
        ``threshold``, ``left`` and ``right``.
    x : a single sample, indexable by feature name.

    Returns
    -------
    The ``value`` stored on the first node whose ``value`` is not ``None``.
    """
    current = node
    while True:
        if current.value is not None:
            return current.value
        # Values at or below the threshold descend left, the rest right.
        go_left = x[current.feature] <= current.threshold
        current = current.left if go_left else current.right

def predict(tree, X):
    """Predict a label for every row of ``X``.

    Parameters
    ----------
    tree : root node of a fitted tree, passed unchanged to ``predict_one``.
    X : table of samples providing ``iterrows()`` (a pandas DataFrame in this
        notebook).

    Returns
    -------
    numpy.ndarray
        One prediction per row, in row order.
    """
    labels = []
    for _, sample in X.iterrows():
        labels.append(predict_one(tree, sample))
    return np.array(labels)


In [13]:
from sklearn.metrics import accuracy_score

# Fit the tree on the training split, then label the held-out rows
tree = build_tree(X_train, y_train)
preds = predict(tree, X_test)

# Fraction of held-out rows whose predicted label matches the true label
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=preds))


Accuracy: 0.6666666666666666
