# Question 1

In [166]:
import math
import numbers

import pandas as pd

# Read the decision-tree exercise table into a DataFrame and display it.
DECISION_TREE_CSV = "Decision Tree-Dataset.csv"
data = pd.read_csv(DECISION_TREE_CSV)
data

Unnamed: 0,Gender,Smoke,Home City,Stage of Life,BMI Category,Activeness,Illness
0,Male,Yes,London,Elderly,Overweight,Low,Yes
1,Male,No,London,Child,Normal,Medium,Yes
2,Male,No,Bristol,Adult,Overweight,Medium,No
3,Male,Yes,Birmingham,Elderly,Underweight,Medium,Yes
4,Female,Yes,Birmingham,Adult,Overweight,Low,Yes
5,Female,No,London,Elderly,Normal,High,No
6,Male,Yes,London,Adult,Underweight,High,No
7,Female,No,Bristol,Child,Normal,Medium,Yes
8,Female,No,Birmingham,Adult,Normal,Low,No
9,Female,No,Edinburgh,Elderly,Normal,High,No


In [167]:
def calculate_entropy(column, verbose=False):
    """Return the Shannon entropy, in bits, of the values in a pandas Series.

    Parameters
    ----------
    column : pandas.Series
        Observed class labels. Missing values are ignored.
    verbose : bool, default False
        When True, print the per-class probabilities before returning.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed classes; ``0.0`` for an
        empty or single-class column.
    """
    # value_counts() drops missing values, so normalise by the number of
    # counted observations rather than len(column); otherwise NaNs would make
    # the probabilities sum to less than 1.
    counts = column.value_counts()
    probabilities = counts / counts.sum()

    if verbose:
        print(probabilities)

    entropy = 0.0

    for prob in probabilities:
        # classes with zero probability contribute nothing, and log2(0) is
        # undefined, so skip them
        if prob > 0:
            # math.log2 is more accurate than math.log(prob, 2); subtracting
            # each term directly also avoids returning -0.0 for a pure column
            entropy -= prob * math.log2(prob)

    return entropy

# Evaluate first: the verbose probability table is printed by the call itself,
# so it appears above the summary line.
illness_entropy = calculate_entropy(data["Illness"], verbose=True)
print(f"\nENTROPY: {illness_entropy}")

Yes    0.5
No     0.5
Name: Illness, dtype: float64

ENTROPY: 1.0


In [168]:
def calculate_information_gain(data, split_name, target_name="Illness"):
    """Return the information gain from splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted average
    entropy of the target within each subset produced by the split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the split attribute and the target column.
    split_name : str
        Column whose unique values define the subsets.
    target_name : str, default "Illness"
        Column whose entropy is being reduced.

    Returns
    -------
    float
        Entropy of ``target_name`` before the split minus the weighted
        entropy after it.
    """
    total_rows = data.shape[0]

    # start from the entropy of the unsplit target and subtract each
    # subset's weighted entropy
    info_gain = calculate_entropy(data[target_name])

    for value in data[split_name].unique():
        subset = data[data[split_name] == value]

        # share of all rows that fall into this subset,
        # e.g. 3/4 if three of four rows have this attribute value
        weight = subset.shape[0] / total_rows

        # measure entropy on the requested target column, not a fixed one
        info_gain -= weight * calculate_entropy(subset[target_name])

    return info_gain

Let's output the information gain of each attribute with respect to the target column "Illness".

In [169]:
# Attribute name paired with the tab padding that lines its value up in the report.
gain_layout = [
    ("Gender", "\t\t"),
    ("Smoke", "\t\t"),
    ("Home City", "\t"),
    ("Stage of Life", "\t"),
    ("BMI Category", "\t"),
    ("Activeness", "\t"),
]

# One report line per attribute, each gain rounded to five decimal places.
gain_lines = [
    f"{name}:{pad}{round(calculate_information_gain(data, name), 5)}"
    for name, pad in gain_layout
]

print("INFO GAIN (Question 1)\n\n" + "\n".join(gain_lines) + "\n")

INFO GAIN (Question 1)

Gender:		0.02905
Smoke:		0.12451
Home City:	0.12451
Stage of Life:	0.27549
BMI Category:	0.03904
Activeness:	0.4



# Question 2

In [170]:
# Read the k-nearest-neighbour exercise table into a DataFrame and display it.
KNN_CSV = "kNN-Dataset.csv"
data_knn = pd.read_csv(KNN_CSV)
data_knn

Unnamed: 0,Gender,Age,BMI,Home City,Illness
0,Male,33,28.8,Bristol,No
1,Female,45,23.8,London,No
2,Female,68,21.3,Edinburgh,No
3,Male,21,22.6,London,Yes
4,Male,71,18.3,Birmingham,Yes
5,Female,27,28.0,Birmingham,Yes


I will start by creating helper functions

- `cat_input_attrib_calc`:  works out the (x_i - x_j) for categorical attributes (discussed in section 5, 07:40)
- `normalise`:              min-max scales a single numeric value using the column's minimum and maximum; the difference (x_i - x_j) of the scaled values is taken in `parse_data`
- `parse_data`:             decides whether to use `cat_input_attrib_calc` or `normalise` based on the input type

In [171]:
def cat_input_attrib_calc(cat_x, cat_comparason):
    """Return the 0/1 mismatch between two categorical values.

    Gives 0 when both arguments compare equal and 1 otherwise.
    """
    if cat_x == cat_comparason:
        return 0
    return 1

def normalise(x, min=None, max=None):
    """Min-max scale ``x`` relative to the range ``[min, max]``.

    Parameters
    ----------
    x : number
        Value to scale.
    min, max : number
        Lower and upper bound of the range. Both are required; the ``None``
        defaults exist only so they can be passed as keywords in any order.
        (The names shadow the builtins but are kept because callers pass
        them by keyword.)

    Returns
    -------
    float
        ``(x - min) / (max - min)``, or ``0.0`` when the range has zero
        width.

    Raises
    ------
    TypeError
        If ``min`` or ``max`` is not supplied.
    """
    if min is None or max is None:
        raise TypeError("normalise() requires both `min` and `max`")

    span = max - min
    if span == 0:
        # a zero-width range carries no scale information; return 0.0
        # instead of dividing by zero
        return 0.0

    return (x - min) / span

def parse_data(data_knn, attrib, val, x_new):
    """Return the per-attribute difference between a dataset value and the query point.

    Numeric values are min-max scaled with the range of the ``attrib``
    column before being subtracted; any other value is compared as a
    category.

    Parameters
    ----------
    data_knn : pandas.DataFrame
        Table whose ``attrib`` column supplies the scaling range.
    attrib : str
        Attribute name: a column of ``data_knn`` and a key of ``x_new``.
    val :
        Value of ``attrib`` for one dataset row.
    x_new : dict
        Query point, mapping attribute names to values.

    Returns
    -------
    float or int
        Signed difference of the scaled values for numeric attributes;
        0 (same) or 1 (different) for categorical attributes.
    """
    # isinstance with numbers.Real also accepts NumPy scalars, which an exact
    # type comparison would wrongly treat as categories. bool is a subclass
    # of int, so it is excluded explicitly and stays categorical.
    if isinstance(val, numbers.Real) and not isinstance(val, bool):
        # The range comes from the dataset column only; a query value outside
        # that range scales to below 0 or above 1. The question's values lie
        # inside the range, so no adjustment is made here.
        col_min = data_knn[attrib].min()
        col_max = data_knn[attrib].max()

        scaled_val = normalise(val, min=col_min, max=col_max)
        scaled_new = normalise(x_new[attrib], min=col_min, max=col_max)
        return scaled_val - scaled_new

    return cat_input_attrib_calc(val, x_new[attrib])

In [172]:
# question 2 values
k = 3
x_new = {"Gender": "Female", "Age":26, "BMI":20, "Home City":"Birmingham"}

In [173]:
# Dictionary that stores, for each input attribute, the difference between
# every dataset row and x_new.
feature_names = ["Gender", "Age", "BMI", "Home City"]
out = {name: [] for name in feature_names}

for name in feature_names:
    # an attribute that is not a column of the table keeps its empty list
    if name in data_knn.columns:
        out[name] = [parse_data(data_knn, name, val, x_new) for val in data_knn[name]]

out

{'Gender': [1, 0, 0, 1, 1, 0],
 'Age': [0.13999999999999999, 0.38, 0.84, -0.1, 0.9, 0.01999999999999999],
 'BMI': [0.8380952380952382,
  0.361904761904762,
  0.12380952380952387,
  0.24761904761904774,
  -0.16190476190476183,
  0.7619047619047619],
 'Home City': [1, 1, 1, 1, 0, 0]}

In [174]:
def eu_distance(x_new, out):
    """Return the Euclidean distance of every dataset row to the query point.

    Parameters
    ----------
    x_new :
        Unused; kept so existing calls keep working. The query point is
        already folded into the differences stored in ``out``.
    out : dict
        Maps each attribute name to a list holding one difference per
        dataset row.

    Returns
    -------
    list of float
        One distance per row, rounded to five decimal places.

    Raises
    ------
    ValueError
        If the attributes in ``out`` do not all have the same number of rows.
    """
    columns = list(out.values())

    # take the row count from `out` itself rather than from a global table,
    # and refuse ragged input instead of silently truncating it
    if len({len(column) for column in columns}) > 1:
        raise ValueError("every attribute in `out` must hold one difference per row")

    distances = []

    # zip(*columns) yields, for each row, its differences across all attributes
    for row in zip(*columns):
        squared_sum = sum(diff ** 2 for diff in row)

        # square root of the summed squares, rounded for reporting
        distances.append(round(math.sqrt(squared_sum), 5))

    return distances

# Distance from every dataset row to the query point, in row order.
answer = eu_distance(x_new=x_new, out=out)

In [176]:
# Build one report line per computed distance so the report follows the data
# instead of assuming exactly six rows; rows are numbered from 1.
distance_lines = [
    f"distance(x_{row_number}, x_new):\t{distance}"
    for row_number, distance in enumerate(answer, start=1)
]

print("k Nearest Neighbour (Question 2)\n\n" + "\n".join(distance_lines) + "\n")

k Nearest Neighbour (Question 2)

distance(x_1, x_new):	1.64985
distance(x_2, x_new):	1.12933
distance(x_3, x_new):	1.31184
distance(x_4, x_new):	1.43921
distance(x_5, x_new):	1.35507
distance(x_6, x_new):	0.76217

