In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
dataset = pd.read_csv("BRFSS_Data.csv")

print("Dataset shape: ", dataset.shape)

Dataset shape:  (450016, 358)


In [3]:
#Create a new dataframe using the target attribute BMI
dataBMI = pd.DataFrame()
dataBMI['BMI']=dataset['_BMI5']/100

#Remove missing data
dataBMI = dataBMI.dropna()

#Create labels 1 (overweight) and -1 (normal) based on BMI values
dataBMI.loc[dataBMI.BMI >= 25, 'Label'] = 1
dataBMI.loc[dataBMI.BMI < 25, 'Label'] = -1

dataBMI

Unnamed: 0,BMI,Label
0,26.96,1.0
1,29.43,1.0
2,25.04,1.0
3,26.63,1.0
4,23.30,-1.0
5,27.34,1.0
6,28.06,1.0
8,22.24,-1.0
9,27.12,1.0
10,24.03,-1.0


In [4]:
# C4.5 algorithm for Decision Tree

# Function to partition
def partition(a):
    return {c: (a==c).nonzero()[0] for c in np.unique(a)}

# Function to calculate Entropy (measure of 'purity' )
def entropy(s):
    res = 0
    val, counts = np.unique(s, return_counts=True)
    freqs = counts.astype('float')/len(s)
    for p in freqs:
        if p != 0.0:
            res -= p * np.log2(p)
    return res

# Function to calculate Information Gain: 
# Difference between the entropy of the unsplitted set and the average of the entropy of each split, weighted by the number of elements in the subset
def mutual_information(y, x):

    res = entropy(y)

    # We partition x, according to attribute values x_i
    val, counts = np.unique(x, return_counts=True)
    freqs = counts.astype('float')/len(x)

    # We calculate a weighted average of the entropy
    for p, v in zip(freqs, val):
        res -= p * entropy(y[x == v])

    return res

# Function to check purity
def is_pure(s):
    return len(set(s)) == 1

# Function to recursively split the decision tree
def recursive_split(x, y):
    # If there could be no split, just return the original set
    if is_pure(y) or len(y) == 0:
        return y

    # We get attribute that gives the highest mutual information
    gain = np.array(mutual_information(y, x))
    selected_attr = np.argmax(gain)

    # If there's no gain at all, nothing has to be done, just return the original set
    if np.all(gain < 1e-6):
        return y

    # We split using the selected attribute
    sets = partition(x)

    res = {}
    for k, v in sets.items():
        y_subset = y.take(v, axis=0)
        x_subset = x.take(v, axis=0)

        res["x_%d = %d" % (selected_attr, k)] = recursive_split(x_subset, y_subset)

    return res

In [5]:
X = np.array(dataBMI['BMI'].values).T
y = np.array(dataBMI['Label'].values)
pprint(recursive_split(X, y))

{'x_0 = 0': array([-1.]),
 'x_0 = 109': array([1.]),
 'x_0 = 12': array([-1.]),
 'x_0 = 13': array([-1., -1.]),
 'x_0 = 14': array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1.]),
 'x_0 = 140': array([1.]),
 'x_0 = 15': array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1.]),
 'x_0 = 16': array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1.]),
 'x_0 = 17': array([-1., -1., -1., -1., -1., -1., -1., -1.]),
 'x_0 = 18': array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1