## Create a decision tree on the Tennis Dataset; your algorithm should be ID3

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Location of the play-tennis CSV. Set the PLAY_TENNIS_CSV environment variable
# to run the notebook on another machine; otherwise the original local path is used.
DATA_PATH = os.environ.get('PLAY_TENNIS_CSV', '/home/abdullah/Downloads/play_tennis.csv')
data = pd.read_csv(DATA_PATH)
data

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Strong,No
6,D7,Overcast,Cool,Normal,Strong,Yes
7,D8,Sunny,Mild,High,Weak,No
8,D9,Sunny,Cool,Normal,Weak,Yes
9,D10,Rain,Mild,Normal,Weak,Yes


In [3]:
# Drop the day column: it is a row identifier, not a predictive attribute.
# errors='ignore' keeps this cell re-runnable: on a second execution 'day' is
# already gone and a plain drop would raise KeyError.
data = data.drop(columns=['day'], errors='ignore')
data

Unnamed: 0,outlook,temp,humidity,wind,play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [4]:
# Create a decision tree on the Tennis Dataset. Your algorithm should be ID3
# and should be able to handle both continuous and discrete data.

# Compute the entropy of a column of values (e.g. the class label)
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Collection of values; anything ``np.unique`` accepts.

    Returns
    -------
    The sum over the distinct values of ``-p * log2(p)``, where ``p`` is the
    fraction of entries equal to that value.
    """
    _, freq = np.unique(target_col, return_counts=True)
    total = np.sum(freq)
    # One -p * log2(p) term per distinct value, evaluated for all values at once.
    terms = (-freq / total) * np.log2(freq / total)
    return np.sum(terms)

# Entropy of the class label followed by the entropy of every attribute column.
for column in ("play", "outlook", "temp", "humidity", "wind"):
    print(f"{column.capitalize()} Entropy:", entropy(data[column]))
    


Play Entropy: 0.9402859586706309
Outlook Entropy: 1.5774062828523454
Temp Entropy: 1.5566567074628228
Humidity Entropy: 1.0
Wind Entropy: 0.9852281360342515


In [5]:
# Compute the information gain of splitting on an attribute (e.g. wind: Weak vs. Strong)
def InfoGain(data,split_attribute_name,target_name="play"):
    """Return the information gain obtained by splitting ``data`` on one attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the attribute column and the target column.
    split_attribute_name : str
        Column whose distinct values define the partitions.
    target_name : str, default "play"
        Column holding the class labels.

    Returns
    -------
    Entropy of the target column minus the size-weighted entropy of the target
    within each partition.
    """
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    weights = counts / np.sum(counts)
    # Select each partition with a boolean mask. ``data.where(mask).dropna()``
    # would additionally discard rows that have a missing value in any other
    # column, so the partition entropies would no longer match the weights.
    Weighted_Entropy = np.sum([
        weight * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, weight in zip(vals, weights)
    ])
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

# Information gain of every candidate attribute with respect to 'play'.
for attribute in ("wind", "outlook", "temp", "humidity"):
    print(f"Information Gain for {attribute.capitalize()}", InfoGain(data, attribute, 'play'))




Information Gain for Wind 0.04812703040826927
Information Gain for Outlook 0.2467498197744391
Information Gain for Temp 0.029222565658954647
Information Gain for Humidity 0.15183550136234136


In [6]:
# Find the root node
def find_root(data, target_name=None):
    """Return the attribute column with the highest information gain.

    Parameters
    ----------
    data : pandas.DataFrame
        Table of attribute columns plus the target column.
    target_name : str, optional
        Column holding the class labels. Defaults to the last column of
        ``data``, which is where this notebook keeps the label.

    Returns
    -------
    Name of the best attribute; on ties the earliest column wins
    (``np.argmax`` returns the first maximum).

    Raises
    ------
    ValueError
        If ``data`` has no attribute column besides the target.
    """
    columns = list(data.keys())
    if target_name is None and columns:
        target_name = columns[-1]
    features = [column for column in columns if column != target_name]
    if not features:
        raise ValueError("find_root needs at least one attribute column besides the target")
    gains = [InfoGain(data, feature, target_name) for feature in features]
    return features[int(np.argmax(gains))]

# Report the attribute selected for the first split.
root_attribute = find_root(data)
print("Root Node is:", root_attribute)

Root Node is: outlook


In [7]:
# Now we will build the decision tree
def buildTree(data,tree=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Attribute columns followed by the class-label column (last column).
    tree : dict, optional
        Existing dictionary to add the new node to; a fresh one is created
        when omitted.

    Returns
    -------
    dict of the form ``{attribute: {value: subtree_or_label}}``. Attributes are
    treated as discrete: one branch is created per distinct value.
    """
    target = data.keys()[-1]
    node = find_root(data)
    if tree is None:
        tree = {}
    # setdefault also covers a caller-supplied dict that has no entry for this node yet.
    branches = tree.setdefault(node, {})
    for value in np.unique(data[node]):
        # Boolean-mask selection keeps every matching row (where().dropna() would
        # also drop rows with NaN in other columns). The split column is removed
        # so each level of recursion has strictly fewer attributes to consider.
        subtable = data[data[node] == value].drop(columns=[node])
        labels, counts = np.unique(subtable[target], return_counts=True)
        if len(labels) == 1:
            # Pure partition: store the single class label as a leaf.
            branches[value] = labels[0]
        elif subtable.shape[1] == 1:
            # Only the label column is left but the labels still disagree
            # (inconsistent rows): fall back to the majority class instead of
            # recursing forever.
            branches[value] = labels[np.argmax(counts)]
        else:
            branches[value] = buildTree(subtable)
    return tree

In [8]:
# Grow the ID3 tree on the full dataset and display the resulting nested dict.
decision_tree = buildTree(data)
decision_tree

{'outlook': {'Overcast': 'Yes',
  'Rain': {'wind': {'Strong': 'No', 'Weak': 'Yes'}},
  'Sunny': {'humidity': {'High': 'No', 'Normal': 'Yes'}}}}

## Naive Bayes is a better classifier than a decision tree and is used on bigger datasets. A decision tree is better for smaller datasets.