In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt 
import pandas as pd
import common as cm

# Part 1: Information Gain

Important note: this exercise uses Pandas (for data manipulation and analysis) and Graphviz (for graph-drawing) libraries. 

This exercise consists of 3 parts. Complete the first part to get a mark of 3.0, the first two parts to get 4.0, complete all assignments to get 5.0. 

1.1 ) There are 10 objects (data) characterized with 5 binary attributes:

In [2]:
attributeNames = ["attr 1", "attr 2", "attr 3", "attr 4", "attr 5"]

data = pd.DataFrame(
    [
        [1, 0, 1, 1, 1],
        [1, 1, 0, 0, 1],
        [0, 1, 1, 1, 1],
        [1, 0, 1, 0, 1],
        [1, 0, 0, 1, 1],
        [0, 0, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 0, 0, 1, 1],
        [0, 1, 0, 0, 1],
        [0, 0, 0, 1, 1],
    ],
    columns=attributeNames,
)

1.2) Each object is assigned to either a class "0" or "1". The assignments are as follows (cl):

In [3]:
data["cl"] = [1, 1, 1, 0, 0, 1, 1, 1, 0, 0]

Hint: How one can read data (columns) in Pandas:

In [4]:
print(data["cl"], "\n")
print(list(data["cl"]), "\n")
print(set(data["cl"]), "\n")
print(data["attr 1"], "\n")

0    1
1    1
2    1
3    0
4    0
5    1
6    1
7    1
8    0
9    0
Name: cl, dtype: int64 

[1, 1, 1, 0, 0, 1, 1, 1, 0, 0] 

{0, 1} 

0    1
1    1
2    0
3    1
4    1
5    0
6    1
7    1
8    0
9    0
Name: attr 1, dtype: int64 



Hint: How to split data (Pandas DataFrame) base on column:

In [5]:
data[data['cl']==0]

Unnamed: 0,attr 1,attr 2,attr 3,attr 4,attr 5,cl
3,1,0,1,0,1,0
4,1,0,0,1,1,0
8,0,1,0,0,1,0
9,0,0,0,1,1,0


Hint: How to take values from column (Pandas Series):

In [6]:
for id, row in data['cl'].items():
    print(id,row)

0 1
1 1
2 1
3 0
4 0
5 1
6 1
7 1
8 0
9 0


1.3 )  Finish the below function for calculating entropy. $H(CL) = - \sum_{y \in CL}p(y)log_2p(y)$ It should return a value of entropy for an input vector CL. Assume that $log_2(0)$ is equal to 0.

In [7]:
def getEntropy(cl) -> float:
    entropy = 0.0
    count0 = cl.count(0)
    count1 = cl.count(1)
    entropy -= count0/len(cl) * np.log2(count0/len(cl))
    entropy -= count1/len(cl) * np.log2(count1/len(cl))

    return entropy

1.4 ) Calculate the entropy for the CL vector  (the result should be 0.97095...):

In [8]:
print(getEntropy(list(data["cl"])))

0.9709505944546686


1.5) Finish the below function for calculating a conditional entropy: $H(CL|X) = - \sum_{x \in X} \sum_{y \in CL} p(x,y) log_2 \frac{p(x,y)}{p(x)}$. Assume that $log_2(0)$ is equal to 0 and if $p(x) = 0$, $\frac{p(x,y)}{p(x)}$ is equal to 0 as well.

In [9]:
def getConditionalEntropy(cl, attr) -> float:
    entropy = 0.0
    print(cl)
    print(attr == 0)

    return entropy

1.6 ) Calculate conditional entropies for given attribiutes.

In [10]:
print(getConditionalEntropy(data["cl"], data["attr 1"])) ### the result should be 0.95097...
print(getConditionalEntropy(data["cl"], data["attr 5"])) ### the result should be 0.97095...

0    1
1    1
2    1
3    0
4    0
5    1
6    1
7    1
8    0
9    0
Name: cl, dtype: int64
0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8     True
9     True
Name: attr 1, dtype: bool
0.0
0    1
1    1
2    1
3    0
4    0
5    1
6    1
7    1
8    0
9    0
Name: cl, dtype: int64
0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: attr 5, dtype: bool
0.0


1.7 ) **Question: Which entropy is lesser and why?**

1.8) Finish the below function for calculating information gain (use getEntropy() and getConditionalEntropy() functions):

In [11]:
def getInformationGain(cl, attr):
    ### TODO
    ### return 0.0

IndentationError: expected an indented block (Temp/ipykernel_5420/2268696749.py, line 3)

In [None]:
print(getInformationGain(data["cl"], data["attr 1"]))
print(getInformationGain(data["cl"], data["attr 5"]))

1.9) **Question: Which IG is lesser and why?**

# Part 2: ID3 algorithm

Decision tree consists of decision nodes and leaves. Nodes split data while leaves classify objects. Consider the class "Node" provided below. It consists of 4 fields:
- attr - attribute ID (use the names in attributeNames vector)
- left - left branch, i.e., a reference to other node
- right - right branch, i.e., a reference to other node
- value - a decision. If node = None, then the node is not a leaf. If value is not None, then a node is considered a leaf. 

Method __call__ returns the decision if the node is a leaf (i.e., when value is not None). 
Otherwise, it calls either the left or the right branch of an input object, based on the attribute value (0 -> left children; 1 -> right children). In this way, we can traverse the decision tree in order to find the final decision.

In [None]:
class Node:
    def __init__(self, attr, left, right, value):
        self.attr = attr
        self.left = left
        self.right = right
        self.value = value

    def __call__(self, obj):
        if self.value is None:
            if obj[self.attr] == 0:
                return self.left(obj)
            else:
                return self.right(obj)
        else:
            return self.value
        
### EXAMPLE
def example(obj):
    root = Node(0, None, None, None) ###  IN ROOT SPLIT ON 1ST (0) ATTRIBUTE
    lChildren = Node(1, None, None, None) ### IN ROOT's LEFT CHILDREN SPLIT ON 2ND (1) ATTRIBUTE
    rChildren = Node(None, None, None, 2) ### IN ROOT's RIGHT CHILDREN -> DECISION = 2
    root.left = lChildren
    root.right = rChildren
    llChildren = Node(None, None, None, 3) ### IN ROOT's LEFT-LEFT CHILDREN -> DECISION = 3
    lrChildren = Node(None, None, None, 4) ### IN ROOT's LEFT-RIGHT CHILDREN -> DECISION = 4
    lChildren.left = llChildren
    lChildren.right = lrChildren
    print(root(obj))
    
example([0, 0]) ### ROOT : FIRST ATTRIBUTE = 0 SO WE GO TO LEFT CHILDREN.
### IT IS A LEAF WITH THE DECISION = 3
### THEN, IN THE CORRESPONDING CHILDREN, THE SECOND ATTRIBUTE = 0, SO WE GO TO LEFT-LEFT CHILDREN

example([0, 1]) 
example([1, 0])
example([1, 1])

2.1) Create an initial root. Set the value (decision) to 1. 

In [None]:
### TODO

2.2) Use a getErrorRate method in common.py auxiliary file to calculate the error rate. The decision is made based on the majority rule. In case of tie, the method takes 0 as the default class.

In [None]:
### TODO
## SHOULD BE 0.4

2.3) Use printGraph method (see the common.py file) to draw the decision tree and save it in a png file.

In [None]:
### TODO

2.4) Calculate information gain for all attribiutes.

In [None]:
def printInformationGain(data):
    for attribute_name in attributeNames:
        ### TODO
        ### print()
        
printInformationGain(data)

2.5) Choose the best attribute to split the data (HINT, it should be the third attribute :)). Construct two new nodes: one for $x_i$ = 0 decision and the second for $x_i$ = 1; connect them with the root (left and right branch). Remember to update the root. 

In [None]:
### TODO

2.6) Print the graph and calculate the error rate. What happened with the error rate?

In [None]:
### TODO

2.7) Split the 'data' (table) based on the selected attribiute, i.e., create two new tables.

In [None]:
### TODO
### left_data = 
### right_data = 

2.8) Let us start with the left node. Firstly, calculate information gain for this node.

In [None]:
### TODO

2.9) Choose the best attribute to split the data and then update the decision tree.

In [None]:
### TODO

2.10) Print the graph and calculate the error rate (HINT: should be 0.2 :). What happened with the error rate?

In [None]:
### TODO

2.11) Split data (remember that we split left_data, not data).

In [None]:
### TODO
### leftLeft_data = 
### leftRight_data = 

2.12) Repeat the whole process for the right node.

In [None]:
# TODO compute the information gain

In [None]:
# TODO update the decision tree

In [None]:
# TODO print the decision tree and calculate the error rate (HINT, should be 0.1:) )

In [None]:
# TODO split the data (right_data)
#rightLeft_data
#rightRight_data

2.13) Let's consider left-left node. Calculate information gain for it.

In [None]:
# TODO

2.14) Will adding a new node to the tree improve its effectiveness? Why? Why not?

2.15) Calculate information gain for the left-right node.

In [None]:
printInformationGain(leftRight_data)

In [None]:
### Select the attribute and update the tree

In [None]:
### Print the decision tree and compute the error rate

2.16) What happened with the error rate? Is it necessary to keep these two newly added leaves?

2.17) Finish creating the right side of the tree

In [None]:
### TODO

# Part 3: automated construction of decision trees

3.1 Complete the following function for automated construct of decision trees, so that it returns a decision tree for the given data and attribute list. Note that this is a recusive method, i.e., calls itself.

In [None]:
max_depth = 0

def createTree(data, attributeNames, depth=0):
    data = data.reset_index().drop("index", axis=1)
    ### TODO

3.2) Build a decision tree for a training dataset in the common.py auxiliary file, for diffrent values of max_depth.  Calculate & compare the error rates for training and validation datasets.

In [None]:
max_depth = 10

In [None]:
# Training dataset
train_attributeNames, train_data = cm.getTrainingDataSet()
### TODO

In [None]:
# Validation dataset
valid_attributesName, valid_data = cm.getValidationDataSet()
### TODO

3.3) Consider only the training data set and answer the following questions:
* What is the miximum depth of the tree (consider only the training data set)?
* The tree building process should stop when there is no improvement in error rate (why?). Check for which value of "max_dept" there is no improvement in error rate. 

In [None]:
for i in range(10):
    max_depth = i
    ### TODO