In [1]:
import numpy as np
import pandas as pd

In [59]:

# Function to calculate the entropy of a dataset
def entropy(dataset):
    labels = [item[-1] for item in dataset]
    unique_labels, label_counts = np.unique(labels, return_counts=True)
    entropy = 0
    total_samples = len(dataset)

    for count in label_counts:
        probability = count / total_samples
        entropy -= probability * np.log2(probability)

    return entropy

# Function to calculate the information gain
def information_gain(data, feature_index, entropy_before_split):
    unique_values = np.unique([item[feature_index] for item in data])
    total_samples = len(data)
    weighted_entropy = 0

    for value in unique_values:
        subset = [item for item in data if item[feature_index] == value]
        subset_entropy = entropy(subset)
        weight = len(subset) / total_samples
        weighted_entropy += weight * subset_entropy

    return entropy_before_split - weighted_entropy

# Function to find the best split for a given dataset
def find_best_split(data):
    num_features = len(data[0]) - 1
    entropy_before_split = entropy(data)
    best_info_gain = 0
    best_feature = -1

    for feature_index in range(num_features):
        info_gain = information_gain(data, feature_index, entropy_before_split)
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = feature_index

    return best_feature

# Recursive function to build the decision tree
def build_decision_tree(data):
    labels = [item[-1] for item in data]

    # If all labels are the same, return that label
    if labels.count(labels[0]) == len(labels):
        return labels[0]

    # If there are no features left to split on, return the majority label
    if len(data[0]) == 1:
        unique_labels, label_counts = np.unique(labels, return_counts=True)
        return unique_labels[np.argmax(label_counts)]

    # Find the best feature to split on
    best_feature = find_best_split(data)

    # Create a new decision tree node with the best feature
    decision_tree = {best_feature: {}}
    unique_values = np.unique([item[best_feature] for item in data])

    for value in unique_values:
        subset = [item[:best_feature] + item[best_feature+1:] for item in data if item[best_feature] == value]
        decision_tree[best_feature][value] = build_decision_tree(subset)

    return decision_tree


print(data)
# Build the decision tree
decision_tree = build_decision_tree(data)

# Print the decision tree
print(decision_tree)

[['Red', 3, 'Apple'], ['Green', 3, 'Apple'], ['Red', 1, 'Orange'], ['Orange', 3, 'Orange'], ['Red', 2, 'Apple'], ['Orange', 2, 'Orange'], ['Yellow', 3, 'Orange']]
{0: {'Green': 'Apple', 'Orange': 'Orange', 'Red': {0: {1: 'Orange', 2: 'Apple', 3: 'Apple'}}, 'Yellow': 'Orange'}}


In [70]:
def predict_instance(instance, tree):
    if not isinstance(tree, dict):
        return tree  # If it's a leaf node (a class label), return the label

    feature_name = list(tree.keys())[0]  # Get the feature name to split on
    feature_value = instance.get(feature_name)  # Get the feature value from the instance

    if feature_value is not None:
        # Recursively continue down the tree
        return predict_instance(instance, tree[feature_name][feature_value])
    else:
        # Handle unknown values
        return "Unknown"


In [71]:
# Create a simple dataset
data = {
    "credit_rating" : np.random.choice(["poor","good","Excellent"],100),
    "job_type" : np.random.choice(["Govt","Pvt","Bussiness"],100),
#     "age": np.random.randint(21,65,100),
    "loan_approved" : np.random.choice(["approved","not_approved"],100)
}

df = pd.DataFrame(data)

df

Unnamed: 0,credit_rating,job_type,loan_approved
0,poor,Govt,not_approved
1,poor,Govt,not_approved
2,good,Govt,not_approved
3,good,Bussiness,not_approved
4,good,Pvt,approved
...,...,...,...
95,good,Pvt,approved
96,Excellent,Govt,not_approved
97,poor,Pvt,approved
98,good,Govt,not_approved


In [72]:
data_list = df.values.tolist()
data_list

[['poor', 'Govt', 'not_approved'],
 ['poor', 'Govt', 'not_approved'],
 ['good', 'Govt', 'not_approved'],
 ['good', 'Bussiness', 'not_approved'],
 ['good', 'Pvt', 'approved'],
 ['Excellent', 'Govt', 'not_approved'],
 ['Excellent', 'Govt', 'not_approved'],
 ['poor', 'Pvt', 'approved'],
 ['Excellent', 'Pvt', 'not_approved'],
 ['good', 'Govt', 'approved'],
 ['poor', 'Govt', 'approved'],
 ['poor', 'Bussiness', 'not_approved'],
 ['poor', 'Govt', 'not_approved'],
 ['poor', 'Pvt', 'not_approved'],
 ['poor', 'Bussiness', 'approved'],
 ['poor', 'Bussiness', 'not_approved'],
 ['poor', 'Bussiness', 'approved'],
 ['Excellent', 'Pvt', 'not_approved'],
 ['good', 'Bussiness', 'approved'],
 ['poor', 'Bussiness', 'not_approved'],
 ['poor', 'Pvt', 'not_approved'],
 ['Excellent', 'Govt', 'not_approved'],
 ['poor', 'Pvt', 'approved'],
 ['good', 'Bussiness', 'not_approved'],
 ['poor', 'Bussiness', 'not_approved'],
 ['good', 'Pvt', 'approved'],
 ['good', 'Bussiness', 'not_approved'],
 ['good', 'Bussiness', '

In [73]:
data_list
# Build the decision tree
decision_tree = build_decision_tree(data_list)

# Print the decision tree
print(decision_tree)

{0: {'Excellent': {0: {'Bussiness': 'approved', 'Govt': 'not_approved', 'Pvt': 'approved'}}, 'good': {0: {'Bussiness': 'not_approved', 'Govt': 'not_approved', 'Pvt': 'approved'}}, 'poor': {0: {'Bussiness': 'not_approved', 'Govt': 'approved', 'Pvt': 'approved'}}}}


In [75]:
# Example usage for making predictions
sample = {
    "credit_rating": "good",
    "job_type": "Govt"
}


predicted_label = predict_instance(sample, decision_tree)
print(f"Predicted label: {predicted_label}")

Predicted label: Unknown
