In [1]:
import numpy as np
import pandas as pd

In [2]:
# create csv of dataset
data = """
pid,age,bp,cholestrol,diagnosis
1,30,high,high,sick
2,45,low,normal,healthy
3,50,high,high,sick
4,35,low,normal,healthy
5,60,high,high,sick
6,55,low,normal,healthy
7,40,high,high,sick
8,25,low,normal,healthy
9,65,high,high,sick
10,45,low,normal,healthy
"""

# write dataset to csv
with open("medical_records.csv", "w") as f:
    f.write(data.strip())

records = pd.read_csv("./medical_records.csv", header=0)

In [3]:
print(records['age'])
print(np.unique(records['age']))

0    30
1    45
2    50
3    35
4    60
5    55
6    40
7    25
8    65
9    45
Name: age, dtype: int64
[25 30 35 40 45 50 55 60 65]


In [4]:
# define entropy function
def entropy(series):
    """returns entropy"""
    counts = np.unique(series, return_counts=True)[1]
    total = np.sum(counts)

    result = 0
    for count in counts:
        p = count / total
        result -= p * np.log2(p)

    return result

In [5]:
# compute entropy of target variable
entropy(records['diagnosis'])

np.float64(1.0)

In [6]:
def information_gain(df, split_col):
    """Returns the information gain accrued from splitting along a column.
    
    Example usage:
    >>> information_gain(df, 'age')
    """
    keys, counts = np.unique(df[split_col], return_counts=True)
    n = sum(counts)

    new_entropy = 0
    for i, x in enumerate(keys):
        new_entropy += (counts[i] / n) * entropy(df[df[split_col] == x].iloc[:, -1])
    
    return entropy(df.iloc[:, -1]) - new_entropy

[Information Gain explained](https://medium.com/@ompramod9921/decision-trees-6a3c05e9cb82)

In [7]:
def information_gain(df, parent, split_column):
    features, feature_count = np.unique(df[split_column], return_counts=True)
    total = sum(feature_count)
    child_entropy = 0
    for i, x in enumerate(features):
        child_entropy += (feature_count[i] / total) * entropy(df[df[split_column] == x].iloc[:, -1])
    return entropy(df[parent]) - child_entropy

In [8]:
# Calculate the information gain for each feature (Age, Blood Pressure, Cholesterol).
print(information_gain(records, 'diagnosis', 'age'))
print(information_gain(records, 'diagnosis', 'bp'))
print(information_gain(records, 'diagnosis', 'cholestrol'))

1.0
1.0
1.0


In [16]:
from collections import defaultdict


def build_tree(df):
    if len(np.unique(df.iloc[:, -1])) == 1:
        return df.iloc[0, -1]

    if len(df.columns) == 1:
        return df.iloc[:, -1].mode()[0]

    best_column, best_gain = None, 0
    for col in df.columns[:-1]:
        current_information_gain = information_gain(df, 'diagnosis', col)
        if current_information_gain >= best_gain:
            best_column, best_gain = col, current_information_gain

    tree = defaultdict(None)
    for value in np.unique(df[best_column]):
        print(tree)
        subset = df[df[best_column] == value].drop(best_column, axis=1)
        tree[value] = build_tree(subset)
        tree["column_name"] = best_column

    return tree

def predict(root, D):
    node = root
    while isinstance(node, dict):
        print(node)
        column = node["column_name"]
        node = node[D[column]]
    return node

In [17]:
tree = build_tree(records)
D = {'age': 50, 'bp': 'low', 'cholestrol': 'high'}
predict(tree, D)

defaultdict(None, {})
defaultdict(None, {'high': 'sick', 'column_name': 'cholestrol'})
defaultdict(None, {'high': 'sick', 'column_name': 'cholestrol', 'normal': 'healthy'})


'sick'