In [31]:
import pandas as pd
import math
from collections import Counter

In [32]:
# Load dataset
df = pd.read_csv("loan_approval_dataset.csv")
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB
None


In [33]:
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [34]:
import pandas as pd
import numpy as np
import math



# -------------------------------
# Step 1: Preprocess numeric columns into categories
# -------------------------------
def discretize(column, bins=3, labels=["Low", "Medium", "High"]):
    return pd.cut(column, bins=bins, labels=labels)

# Apply discretization to numeric features
df['income_annum'] = discretize(df['income_annum'])
df['loan_amount'] = discretize(df['loan_amount'])
df['loan_term'] = discretize(df['loan_term'])
df['cibil_score'] = discretize(df['cibil_score'])
df['residential_assets_value'] = discretize(df['residential_assets_value'])
df['commercial_assets_value'] = discretize(df['commercial_assets_value'])
df['luxury_assets_value'] = discretize(df['luxury_assets_value'])
df['bank_asset_value'] = discretize(df['bank_asset_value'])

# Drop loan_id (not useful for prediction)
df = df.drop(columns=['loan_id'])

# -------------------------------
# Step 2: Entropy Function
# -------------------------------
def entropy(values):
    classes, counts = np.unique(values, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

# -------------------------------
# Step 3: Information Gain
# -------------------------------
def info_gain(df, attribute, target="loan_status"):
    total_entropy = entropy(df[target])
    values, counts = np.unique(df[attribute], return_counts=True)

    weighted_entropy = 0
    for v, c in zip(values, counts):
        subset = df[df[attribute] == v][target]
        weighted_entropy += (c / len(df)) * entropy(subset)

    return total_entropy - weighted_entropy

# -------------------------------
# Step 4: Build Tree Recursively
# -------------------------------
def build_tree(df, target="loan_status", attributes=None):
    # Leaf condition 1: if all values same → return that value
    if len(np.unique(df[target])) == 1:
        return np.unique(df[target])[0]

    # Leaf condition 2: if no attributes left → return majority
    if attributes is None or len(attributes) == 0:
        return df[target].mode()[0]

    # Choose best attribute (highest Info Gain)
    gains = {attr: info_gain(df, attr, target) for attr in attributes}
    best_attr = max(gains, key=gains.get)

    # Build tree as dictionary
    tree = {best_attr: {}}
    for val in df[best_attr].unique():
        subset = df[df[best_attr] == val]
        if subset.empty:
            tree[best_attr][val] = df[target].mode()[0]
        else:
            new_attrs = [a for a in attributes if a != best_attr]
            tree[best_attr][val] = build_tree(subset, target, new_attrs)
    return tree

# -------------------------------
# Step 5: Print Tree (Yes/No style)
# -------------------------------
def print_tree(tree, indent=""):
    root = list(tree.keys())[0]
    for val, branch in tree[root].items():
        if isinstance(branch, dict):
            print(f"{indent}{root} == {val}?")
            print_tree(branch, indent + "  ")
        else:
            print(f"{indent}{root} == {val} -> {branch}")

# -------------------------------
# Run the Tree
# -------------------------------
attributes = [col for col in df.columns if col != "loan_status"]

decision_tree = build_tree(df, target="loan_status", attributes=attributes)
print("\nDecision Tree Structure:\n")
print_tree(decision_tree)


KeyError: 'income_annum'

In [3]:
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
# -----------------------------
# Step 1: Entropy function
# -----------------------------
def entropy_of_attribute(data):
    counts = Counter(data)
    total = len(data)
    entropy = 0.0
    for label in counts:
        p = counts[label] / total
        entropy -= p * math.log2(p)
    return entropy


In [5]:

# -----------------------------
# Step 2: Information Gain
# -----------------------------
def information_gain(df, attribute, target="Loan_Status"):
    # Entropy before split
    total_entropy = entropy_of_attribute(df[target])

    # Weighted entropy after split
    values = df[attribute].unique()
    weighted_entropy = 0.0
    for val in values:
        subset = df[df[attribute] == val][target]
        weight = len(subset) / len(df)
        weighted_entropy += weight * entropy_of_attribute(subset)

    # Info Gain
    return total_entropy - weighted_entropy


In [6]:
# -----------------------------
# Step 3: Build Decision Tree (ID3)
# -----------------------------
def build_tree(df, target="Loan_Status", attributes=None):
    # If all labels are same → return leaf
    if len(df[target].unique()) == 1:
        return df[target].iloc[0]
    
    # If no attributes left → return majority class
    if attributes is None or len(attributes) == 0:
        return df[target].mode()[0]
    
    # Choose best attribute (highest gain)
    gains = {attr: information_gain(df, attr, target) for attr in attributes}
    best_attr = max(gains, key=gains.get)
    
    tree = {best_attr: {}}
    
    # Split and recurse
    for val in df[best_attr].unique():
        subset = df[df[best_attr] == val].drop(columns=[best_attr])
        new_attributes = [a for a in attributes if a != best_attr]
        tree[best_attr][val] = build_tree(subset, target, new_attributes)
    
    return tree

In [7]:
# -----------------------------
# Step 4: Pretty print tree
# -----------------------------
def print_tree(tree, indent=""):
    root = list(tree.keys())[0]
    branches = tree[root]
    for val, subtree in branches.items():
        if isinstance(subtree, dict):
            print(f"{indent}{root} = {val}:")
            print_tree(subtree, indent + "   ")
        else:
            print(f"{indent}{root} = {val} -> {subtree}")


In [14]:
print(df.columns)


Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')


In [8]:

# -----------------------------
# Step 5: Prediction function
# -----------------------------
def predict(tree, sample):
    root = list(tree.keys())[0]
    branches = tree[root]
    value = sample.get(root)
    
    if value not in branches:
        return "Unknown"
    
    subtree = branches[value]
    if isinstance(subtree, dict):
        return predict(subtree, sample)
    else:
        return subtree


In [17]:
# -----------------------------
# Run the Tree
# -----------------------------
attributes = [col for col in df.columns if col != ' loan_status']
#decision_tree = build_tree(df, target="loan_status", attributes=attributes)
decision_tree = build_tree(df, target=' loan_status', attributes=attributes)

print("\nDecision Tree Structure:\n")
print_tree(decision_tree)



Decision Tree Structure:

loan_id = 1 ->  Approved
loan_id = 2 ->  Rejected
loan_id = 3 ->  Rejected
loan_id = 4 ->  Rejected
loan_id = 5 ->  Rejected
loan_id = 6 ->  Rejected
loan_id = 7 ->  Approved
loan_id = 8 ->  Rejected
loan_id = 9 ->  Approved
loan_id = 10 ->  Rejected
loan_id = 11 ->  Approved
loan_id = 12 ->  Rejected
loan_id = 13 ->  Rejected
loan_id = 14 ->  Approved
loan_id = 15 ->  Rejected
loan_id = 16 ->  Approved
loan_id = 17 ->  Approved
loan_id = 18 ->  Approved
loan_id = 19 ->  Approved
loan_id = 20 ->  Approved
loan_id = 21 ->  Rejected
loan_id = 22 ->  Rejected
loan_id = 23 ->  Rejected
loan_id = 24 ->  Approved
loan_id = 25 ->  Rejected
loan_id = 26 ->  Rejected
loan_id = 27 ->  Approved
loan_id = 28 ->  Approved
loan_id = 29 ->  Rejected
loan_id = 30 ->  Approved
loan_id = 31 ->  Rejected
loan_id = 32 ->  Rejected
loan_id = 33 ->  Rejected
loan_id = 34 ->  Approved
loan_id = 35 ->  Approved
loan_id = 36 ->  Approved
loan_id = 37 ->  Approved
loan_id = 38 ->  Rej

In [26]:
sample = {
    "no_of_dependents": 2,
    "education": "Graduate",
    "self_employed": "No",
    "income_annum": 9600000,
    "loan_amount": 29900000,
    "loan_term": 12,
    "cibil_score": 778,
    "residential_assets_value": 2400000,
    "commercial_assets_value": 17600000,
    "luxury_assets_value": 22700000,
    "bank_asset_value": 8000000
}


In [27]:
print("\nPrediction for sample:", predict(decision_tree, sample))


Prediction for sample: Unknown


In [28]:
df.head(4)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
