# BBM409 : Introduction to Machine Learning Lab. Assignment 2

#### PROBLEM DEFINITION :

    
In this assignment, we were asked to understand and become familiar with the decision tree algorithm. We experiment with a decision tree model (built with the ID3 algorithm) on the Diabetes Risk Prediction dataset.

    
In the first part of the experiment, we implemented a decision tree model for predicting whether a patient is a potential diabetic or not. For this experiment, we split our data into two parts: training and test.

    
In the second part of the experiment, we pruned the twigs of the tree that we created in the first part. This process prevents our decision tree from overfitting. For this experiment, we split our data into three parts: training, validation and test.

In [None]:
!pip install -r requirements.txt

First, we need to install the packages specified in the requirements.txt file.

## Imports

We imported the necessary packages and modules.

In [None]:
from math import log
import numpy as np
import pandas as pd
from numpy.lib.shape_base import column_stack

In [None]:
%matplotlib inline

### Encoder of the Age Attribute Method

In [None]:
def encode_age(column, max, min):
    """Bucket ages into five equal-width intervals and return each age's bucket index.

    The span from ``min`` to ``max`` is cut into five intervals whose edges are
    truncated to integers. Neighbouring intervals share their boundary value;
    an age that sits exactly on a shared boundary goes to the lower interval.

    Args:
        column: Iterable of numeric ages.
        max: Largest age; upper edge of the last interval.
        min: Smallest age; lower edge of the first interval.

    Returns:
        A list with one integer code (0-4) per age, in the order of ``column``.

    Raises:
        ValueError: If an age does not fall inside any interval. Skipping it
            silently would make the result shorter than ``column`` and shift
            every later code onto the wrong row.
    """
    # The parameter names shadow the builtins; they are kept so existing
    # callers keep working, and rebound to clearer local names here.
    lowest, highest = min, max
    width = (highest - lowest) / 5

    intervals = []
    edge = lowest
    for _ in range(5):
        next_edge = edge + width
        intervals.append((int(edge), int(next_edge)))
        edge = next_edge

    # Accumulated floating-point error (or a fractional maximum) can leave the
    # truncated final edge just below the real maximum; pin it so the largest
    # age always lands in the last bucket.
    last_low, last_high = intervals[-1]
    if last_high < highest:
        intervals[-1] = (last_low, highest)

    encoded = []
    for age in column:
        # Plain comparisons (rather than membership in a range object) also
        # accept fractional ages.
        for index, (low, high) in enumerate(intervals):
            if low <= age <= high:
                encoded.append(index)
                break
        else:
            raise ValueError(
                f"age {age!r} is outside the encoded span [{lowest!r}, {highest!r}]"
            )

    return encoded
        

### Encoder of the Features Method

In [None]:

def encode_features(data):
    """Encode every column of the raw DataFrame as small integers, in place.

    The "Age" column is replaced by the bucket indices produced by
    ``encode_age``. Every other column is treated as a two-valued categorical
    column and mapped to 1/0.

    The label mapped to 1 is chosen as follows:
      * a label containing "Yes" or "Positive", when one is present;
      * otherwise the lexicographically last label that is not "No" or
        "Negative" (e.g. "Male" for a Female/Male column);
      * if the column holds nothing but "No"/"Negative", no label is positive
        and the whole column becomes 0.

    Args:
        data: pandas DataFrame with an "Age" column; the remaining columns are
            expected to hold string labels (assumption - the substring test
            below fails on non-string values).

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        ValueError: If the DataFrame has no "Age" column.
    """
    # Every column except "Age" holds a two-valued categorical feature/label.
    binary_columns = data.columns.tolist()
    binary_columns.remove("Age")

    # Bucket the numeric "Age" column into interval indices.
    ages = data["Age"].tolist()
    data["Age"] = encode_age(ages, max(ages), min(ages))

    for col in binary_columns:
        labels = np.unique(data[col].tolist())  # sorted distinct labels

        positive = next(
            (label for label in labels if "Yes" in label or "Positive" in label),
            None,
        )
        if positive is None:
            # No explicit affirmative label: fall back to the last label, but
            # never promote a plain negative answer to 1 (a column that only
            # contains "No" must encode to all zeros).
            candidates = [label for label in labels if label not in ("No", "Negative")]
            positive = candidates[-1] if candidates else None

        if positive is None:
            data[col] = 0
        else:
            data[col] = data[col].eq(positive).astype("int64")

    return data

### Node Class

Node class that creates each leaf or node of the decision tree.

In [None]:
class Node:
    """A single node (internal node or leaf) of the decision tree.

    Attributes:
        value: Feature column index associated with this node.
        is_leaf: Flag marking the node as a leaf; False right after construction.
        children: List of child nodes; empty right after construction.
        out_class: Output class, meant to be set for leaves; None by default.
    """

    # Class-level defaults; __init__ overrides all of them except out_class.
    value = is_leaf = children = out_class = None

    def __init__(self, value=None):
        # A fresh list per instance so nodes never share their children.
        self.children = []
        self.is_leaf = False
        self.value = value


### DecisionTree Class

Decision tree class that implements the fundamental decision tree together with its specific methods.

In [None]:
class DecisionTree:
    """Decision tree classifier skeleton based on information gain (ID3).

    Only the entropy / information-gain machinery is implemented so far:
    ``fit`` stores the training data and reports the best root feature, while
    ``predict`` and ``calculate_gain`` are still placeholders.

    Sample sets are 2-D arrays whose last column is the class label
    (1 = positive, 0 = negative) and whose other columns are features.
    """

    def __init__(self):
        # Root of the tree; it is not populated by fit yet.
        self.head = Node()

    def calculate_gains(self, set):
        """Return the feature with the highest information gain on ``set``.

        The gain of a feature is the entropy of ``set`` minus the weighted
        entropy of the subsets obtained by splitting on that feature. Both the
        entropy and the weights are computed from ``set`` itself, so the
        method also works on subsets of the training data and does not
        require ``fit`` to have been called.

        Args:
            set: 2-D array of samples; last column is the 0/1 label.

        Returns:
            A ``(column_index, gain)`` tuple for the best feature. On ties the
            lowest column index wins.

        Raises:
            ValueError: If ``set`` has no feature columns.
        """
        samples = set  # the parameter name shadows the builtin; use an alias
        feature_count = samples.shape[1] - 1
        if feature_count < 1:
            raise ValueError("calculate_gains needs at least one feature column")

        parent_entropy, parent_pos, parent_neg = self.calculate_entropy(samples)
        parent_total = parent_pos + parent_neg

        best = None
        for column in range(feature_count):
            feature_values = samples[:, column]

            # Weighted entropy remaining after splitting on this feature.
            remainder = 0.0
            for value in np.unique(feature_values):
                subset = samples[feature_values == value]
                sub_entropy, sub_pos, sub_neg = self.calculate_entropy(subset)
                # No labelled samples at all -> every weight is zero.
                weight = (sub_pos + sub_neg) / parent_total if parent_total else 0.0
                remainder += sub_entropy * weight

            gain = parent_entropy - remainder
            # Strict comparison keeps the first (lowest-index) feature on ties.
            if best is None or gain > best[1]:
                best = (column, gain)

        return best

    def fit(self, X, y):
        """Store the training data and report the best feature for the root.

        Tree construction is not implemented yet; the method currently only
        records the dataset statistics and prints the ``(column, gain)`` pair
        of the most informative feature.

        Args:
            X: 2-D array of features, one row per sample.
            y: 0/1 labels, either as an ``(n, 1)`` column or a 1-D vector.
        """
        labels = np.asarray(y)
        if labels.ndim == 1:
            # Accept plain label vectors by turning them into a column.
            labels = labels.reshape(-1, 1)

        data = np.concatenate((X, labels), axis=1)
        dataset_entropy, num_pos, num_neg = self.calculate_entropy(data)

        self.X = X                              # features
        self.y = y                              # labels, as passed in
        self.data = data                        # features + label column
        self.num_pos = num_pos                  # number of positive samples
        self.num_neg = num_neg                  # number of negative samples
        self.dataset_entropy = dataset_entropy  # entropy of the whole dataset

        chosen_feature = self.calculate_gains(data)  # (column, gain)
        print(chosen_feature)

    def predict(self, X):
        """Placeholder: prediction is not implemented yet (returns None)."""
        pass

    def calculate_entropy(self, X):
        """Compute the binary entropy of the label column of ``X``.

        Args:
            X: 2-D array of samples; last column is the 0/1 label. Rows with
                any other label value are ignored.

        Returns:
            ``(entropy, num_pos, num_neg)``. The entropy is in bits; it is 0.0
            when there are no labelled samples.
        """
        label_column = X[:, -1]
        num_pos = X[label_column == 1].shape[0]
        num_neg = X[label_column == 0].shape[0]
        num_total = num_pos + num_neg

        if num_total == 0:
            # Nothing to measure; avoids dividing by zero on empty sets.
            return 0.0, num_pos, num_neg

        entropy = 0.0
        for count in (num_pos, num_neg):
            ratio = count / num_total
            if ratio:
                # A class with zero probability contributes nothing.
                entropy -= ratio * log(ratio, 2)

        return entropy, num_pos, num_neg

    def calculate_gain(self, X, feature=None):
        """Placeholder: single-feature gain is not implemented yet (returns None)."""
        pass

In [None]:
def main(csv_path="diabetes_data_upload.csv"):
    """Load the dataset, encode it and fit a decision tree on it.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the file name the
            experiment originally used. The last column of the file is taken
            as the class label and all other columns as features.
    """
    frame = encode_features(pd.read_csv(csv_path))
    samples = np.array(frame)

    X = samples[:, :-1]                # every column but the last: features
    y = samples[:, -1].reshape(-1, 1)  # last column: labels, as a column vector

    tree = DecisionTree()
    tree.fit(X, y)
    


# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()