# ID3 Trees
Approach 1: information gain computed with `scipy.stats.entropy`.


In [15]:
import numpy as np
from scipy.stats import entropy
import pandas as pd

In [16]:
# Toy patient table in column-oriented form: index i across the lists is patient i.
# The three categorical columns alternate strictly, so each is written as a
# two-value pattern repeated five times (10 rows in total).
data = {
    'Age': [30, 45, 50, 35, 60, 55, 40, 25, 65, 45],
    'Blood Pressure': ['High', 'Low'] * 5,
    'Cholesterol': ['High', 'Normal'] * 5,
    'Diagnosis': ['Sick', 'Healthy'] * 5,
}

In [17]:
# Build a DataFrame from the column-oriented `data` dict (one column per key).
df = pd.DataFrame(data)

In [18]:
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,Age,Blood Pressure,Cholesterol,Diagnosis
0,30,High,High,Sick
1,45,Low,Normal,Healthy
2,50,High,High,Sick
3,35,Low,Normal,Healthy
4,60,High,High,Sick
5,55,Low,Normal,Healthy
6,40,High,High,Sick
7,25,Low,Normal,Healthy
8,65,High,High,Sick
9,45,Low,Normal,Healthy


In [19]:
def calc_entropy(labels):
    """Return the base-2 Shannon entropy of the label distribution in ``labels``.

    Parameters
    ----------
    labels : array-like
        Class labels, one per sample.

    Returns
    -------
    float
        Entropy in bits of the empirical label frequencies.
    """
    # np.unique(..., return_counts=True) yields (distinct values, counts);
    # only the counts are needed because scipy's entropy normalises them itself.
    label_counts = np.unique(labels, return_counts=True)[1]
    return entropy(label_counts, base=2)

In [20]:
def information_gain(data, attribute, target_attribute):
    """Return the information gain from splitting ``data`` on ``attribute``.

    Parameters
    ----------
    data : sequence of mappings
        Rows that are indexed by attribute name (``row[attribute]``).
    attribute : hashable
        Key of the attribute to split on.
    target_attribute : hashable
        Key of the class label.

    Returns
    -------
    float
        Entropy of the target over all rows minus the size-weighted entropy
        of the target within each distinct value of ``attribute``.
    """
    n_rows = len(data)
    baseline = calc_entropy([row[target_attribute] for row in data])

    remainder = 0
    for value in set(row[attribute] for row in data):
        # Target labels of the rows that take this attribute value.
        subset = [row[target_attribute] for row in data if row[attribute] == value]
        remainder += (len(subset) / n_rows) * calc_entropy(subset)

    return baseline - remainder

In [23]:
# Re-shape the column-oriented `data` dict into one dict per patient (row records)
data_list = [
    dict(zip(('Age', 'Blood Pressure', 'Cholesterol', 'Diagnosis'), row))
    for row in zip(data['Age'], data['Blood Pressure'], data['Cholesterol'], data['Diagnosis'])
]

# Score every candidate attribute against the 'Diagnosis' target
attributes = ['Age', 'Blood Pressure', 'Cholesterol']
gains = dict(
    (attribute, information_gain(data_list, attribute, 'Diagnosis'))
    for attribute in attributes
)

# Keep the attribute with the largest gain; on ties max() returns the earliest key
best_attribute = max(gains, key=lambda name: gains[name])

In [24]:
# Prediction function based on attributes
def predict_diagnosis(age, blood_pressure, cholesterol):
    """Classify a patient with a fixed, hand-written rule.

    Parameters
    ----------
    age : any
        Accepted for interface symmetry; the rule does not read it.
    blood_pressure : str
        Compared against ``'Low'``.
    cholesterol : str
        Compared against ``'Normal'``.

    Returns
    -------
    str
        ``'Healthy'`` when blood pressure is ``'Low'`` and cholesterol is
        ``'Normal'``; ``'Sick'`` for every other combination (the default).
    """
    return 'Healthy' if (blood_pressure == 'Low' and cholesterol == 'Normal') else 'Sick'

In [26]:
# Example query: a 50-year-old with low blood pressure and normal cholesterol
patient_diagnosis = predict_diagnosis(age=50, blood_pressure='Low', cholesterol='Normal')

In [28]:
# Output the results
# `gains` maps each attribute name to its information gain.
print("Information Gains:", gains)
# `best_attribute` came from max(gains, key=gains.get); when several gains are
# equal, max() returns the first such key in the dict's order.
print("Best Attribute to split on:", best_attribute)
# Result of the hand-written predict_diagnosis rule for the example patient.
print("Diagnosis for the patient:", patient_diagnosis)

Information Gains: {'Age': np.float64(1.0), 'Blood Pressure': np.float64(1.0), 'Cholesterol': np.float64(1.0)}
Best Attribute to split on: Age
Diagnosis for the patient: Healthy


# OR: Approach 2 — an ID3 decision tree built from scratch with pandas

In [1]:
import pandas as pd
import numpy as np 

# Load the patient table and discard the 'Patient ID' column so it is not
# offered to the tree as a split attribute.
p_data = pd.read_csv('patient_data.csv').drop(columns='Patient ID')
print(p_data)

class Node:
    """One node of the decision tree: either an internal split or a leaf."""

    def __init__(self):
        # Name of the attribute this node splits on (stays None on a leaf).
        self.head = None
        # Children keyed by the value of the split attribute.
        self.next = {}
        # True marks a leaf; `ans` then holds the predicted class label.
        self.end = False
        self.ans = None

class Decision_Tree:
    """ID3-style decision tree grown over a pandas DataFrame.

    Every column other than the class column is treated as a categorical
    attribute: a split creates one child per distinct value of the chosen
    attribute. The fitted tree is stored as linked ``Node`` objects under
    ``self.root``.
    """

    # Gains closer together than this are treated as a tie.
    _TIE_EPS = 1e-12

    def __init__(self,):
        self.root = None

    @staticmethod
    def _entropy(labels):
        """Return the base-2 Shannon entropy of a pandas Series of labels."""
        total = len(labels)
        if total == 0:
            return 0.0
        probs = labels.value_counts().to_numpy() / total
        # Zero-count categories contribute nothing and would give log2(0).
        probs = probs[probs > 0]
        return -(probs * np.log2(probs)).sum()

    def info_gain(self, data, classifier, attribute):
        """Return the information gain of splitting ``data`` on ``attribute``.

        Parameters
        ----------
        data : pandas.DataFrame
            Rows to evaluate.
        classifier : column label
            Column holding the class labels.
        attribute : column label
            Candidate split column.

        Returns
        -------
        float
            H(class) - sum_v P(attribute = v) * H(class | attribute = v).
        """
        total = len(data)
        if total == 0:
            return 0.0

        gain_data = self._entropy(data[classifier])

        gain_att = 0
        for x in data[attribute].unique():
            # The branch entropy must be taken over the rows *inside* the
            # branch, i.e. with P(class | attribute = x). Normalising by the
            # class size instead (P(attribute = x | class)) is not an entropy
            # of the branch and understates the gain of pure many-valued splits.
            subset = data.loc[data[attribute] == x, classifier]
            if len(subset) == 0:
                # Nothing compares equal to x (e.g. a missing value): no weight.
                continue
            gain_att += (len(subset) / total) * self._entropy(subset)

        return gain_data - gain_att

    def _best_attribute(self, data, classifier):
        """Return the column with the highest information gain, or None.

        Ties are broken in favour of the attribute with fewer distinct values
        (fewer branches), then by column order. None is returned when ``data``
        has no column other than ``classifier``.
        """
        best_att, best_gain, best_branches = None, float('-inf'), None
        for col in data.columns:
            if col == classifier:
                continue
            gain = self.info_gain(data, classifier, col)
            branches = data[col].nunique()
            tied = best_att is not None and abs(gain - best_gain) <= self._TIE_EPS
            if (tied and branches < best_branches) or (not tied and gain > best_gain):
                best_att, best_gain, best_branches = col, gain, branches
        return best_att

    def split(self, data, classifier):
        """Recursively grow the subtree for ``data`` and return its root Node.

        Parameters
        ----------
        data : pandas.DataFrame
            Rows reaching this node; the column used for a split is dropped
            before recursing into the children.
        classifier : column label
            Column holding the class labels.

        Returns
        -------
        Node
            A leaf (``end`` True, ``ans`` set) when all labels agree or no
            attribute is left to split on; otherwise an internal node whose
            ``next`` maps each value of the chosen attribute to a child.
        """
        labels = data[classifier]
        ob = Node()

        # Pure node: every remaining row carries the same label.
        if len(labels.unique()) == 1:
            ob.end = True
            ob.ans = labels.unique()[0]
            return ob

        att = self._best_attribute(data, classifier)
        if att is None:
            # Attributes exhausted but labels still disagree: answer with the
            # most frequent label instead of failing on a missing column.
            counts = labels.value_counts()
            ob.end = True
            ob.ans = counts.idxmax() if len(counts) else None
            return ob

        ob.head = att
        for x in data[att].unique():
            ndata = data[data[att] == x].drop(att, axis=1)
            ob.next[x] = self.split(ndata, classifier)
        return ob

# Fit the tree: recursively split the patient frame with 'Diagnosis' as the class column.
tree = Decision_Tree()
tree.root = tree.split(p_data, 'Diagnosis')


   Age Blood Pressure Cholesterol Diagnosis
0   30           High        High      Sick
1   45            Low      Normal   Healthy
2   50           High        High      Sick
3   35            Low      Normal   Healthy
4   60           High        High      Sick
5   55            Low      Normal   Healthy
6   40           High        High      Sick
7   25            Low      Normal   Healthy
8   65           High        High      Sick
9   45            Low      Normal   Healthy


In [3]:
# Read the patient to classify from the keyboard; input() always returns str.
u_age = input('person age = ')
u_blood_pressure = input('person blood pressure = ')
u_cholesterol = input('person cholesterol = ')

# dfs traversal to reach leaf node.
def dfs(curr, age, blood, cholest):
    """Walk the tree from ``curr`` down to a leaf and return the leaf's answer.

    Parameters
    ----------
    curr : node
        Starting node; must expose ``end``, ``ans``, ``head`` and ``next``.
    age, blood, cholest : any
        The patient's values for 'Age', 'Blood Pressure' and, for any other
        split attribute, cholesterol.

    Returns
    -------
    The ``ans`` of the leaf that is reached.

    Raises
    ------
    KeyError
        If a node has no branch for the patient's value.
    """
    by_attribute = {'Age': age, 'Blood Pressure': blood}

    while not curr.end:
        # Any split attribute other than the two above is answered with the
        # cholesterol value.
        value = by_attribute.get(curr.head, cholest)
        branches = curr.next

        if value in branches:
            curr = branches[value]
            continue

        # Values typed by a user arrive as str while branch keys parsed from a
        # file may be numeric (e.g. '45' vs 45), so fall back to comparing the
        # textual form of each key.
        wanted = str(value).strip()
        match = next(
            (child for key, child in branches.items() if str(key) == wanted),
            None,
        )
        if match is None:
            known = sorted(str(key) for key in branches)
            raise KeyError(
                f"no branch for {curr.head}={value!r}; known values: {known}"
            )
        curr = match

    return curr.ans
    
# Classify the patient entered above by walking the fitted tree from its root.
print(dfs(tree.root, u_age, u_blood_pressure, u_cholesterol))

Sick
