In [1]:
import pandas as pd
import numpy as np
from math import log
import operator as oper
import os

In [2]:
dataset = pd.read_csv('./tennis.csv', encoding='UTF-8')

In [3]:
dataset.tail()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,PlayTennis
9,Rainy,Mild,High,False,No
10,Sunny,Hot,Normal,True,Yes
11,Overcast,Mild,High,True,Yes
12,Overcast,Hot,Normal,False,Yes
13,Rainy,Mild,High,True,No


In [4]:
dataset_1 = pd.DataFrame(np.array([dataset['Temperature'], \
                    dataset['Humidity'], \
                    dataset['Windy'], \
                    dataset['PlayTennis'], \
                    dataset['Outlook']]).T)

In [5]:
dataset_2 = pd.DataFrame(np.array([dataset['Humidity'], \
                    dataset['Windy'], \
                    dataset['PlayTennis'], \
                    dataset['Outlook'], \
                    dataset['Temperature']]).T)

In [6]:
dataset_3 = pd.DataFrame(np.array([dataset['Windy'], \
                    dataset['PlayTennis'], \
                    dataset['Outlook'], \
                    dataset['Temperature'], \
                    dataset['Humidity']]).T)

In [7]:
dataset_4 = pd.DataFrame(np.array([dataset['PlayTennis'], \
                    dataset['Outlook'], \
                    dataset['Temperature'], \
                    dataset['Humidity'], \
                    dataset['Windy']]).T)

In [8]:
dataset = np.array(dataset)
dataset_1 = np.array(dataset_1)
dataset_2 = np.array(dataset_2)
dataset_3 = np.array(dataset_3)
dataset_4 = np.array(dataset_4)

In [9]:
dataset_4

array([['No', 'Sunny', 'Hot', 'High', False],
       ['No', 'Sunny', 'Cold', 'High', True],
       ['Yes', 'Overcast', 'Hot', 'High', False],
       ['Yes', 'Rainy', 'Mild', 'Normal', False],
       ['Yes', 'Rainy', 'Cool', 'Normal', False],
       ['No', 'Rainy', 'Cool', 'Normal', True],
       ['Yes', 'Overcast', 'Cool', 'Normal', True],
       ['No', 'Sunny', 'Mild', 'High', False],
       ['Yes', 'Sunny', 'Cool', 'Normal', False],
       ['No', 'Rainy', 'Mild', 'High', False],
       ['Yes', 'Sunny', 'Hot', 'Normal', True],
       ['Yes', 'Overcast', 'Mild', 'High', True],
       ['Yes', 'Overcast', 'Hot', 'Normal', False],
       ['No', 'Rainy', 'Mild', 'High', True]], dtype=object)

In [10]:
def create_label_dict(data):
    label_dict = dict()
    for feature in data:
        label = feature[-1]
        if label not in label_dict.keys():
            label_dict[label] = 0
        label_dict[label] += 1
    return label_dict

In [11]:
def entropy(data):
    label_dict = create_label_dict(data)
    
    entropy = 0
    for key in label_dict:
        prob = float(label_dict[key]/len(data))
        entropy -= prob*log(prob, 2)
    
    return entropy

In [12]:
def split_data(data, axis, value):
    new_data = list()
    for feature in data:
        if feature[axis] == value:
            new_feature = list(feature[:axis])
            new_feature.extend(feature[axis+1:])
            new_data.append(new_feature)
    return new_data

In [13]:
def choose_best_feature(data):
    initial_entropy = entropy(data)
    best_info_gain = 0.0
    best_feature = -1
    
    for i in range(len(data[0])-1):
        feature_list = [rows[i] for rows in data]
        unique_values = set(feature_list)
        new_entropy = 0.0
        
        for value in unique_values:
            new_data = split_data(data, i, value)
            prob = len(new_data)/float(len(data))
            new_entropy += prob*entropy(new_data)
        
        info_gain = initial_entropy - new_entropy
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = i
    
    return best_feature

In [14]:
def sort_dict(class_dict):
    return sorted(class_dict.iteritems, key=oper.itemgetter(1), reverse=True)[0][0]

In [15]:
def get_most(class_list):
    class_dict = dict()
    for vote in class_list:
        if vote not in class_dict.keys():
            class_dict[vote] = 0
        class_dict[vote] += 1
    
    return sort_dict(class_list)

In [16]:
def create_tree(data, labels):
    class_list = [rows[-1] for rows in data]
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    if len(data[0]) == 1:
        return get_most(class_list)

    best_feature = choose_best_feature(data)
    best_feature_label = labels[best_feature]
    tree = {best_feature_label: dict()}
    
    labels = np.delete(labels, best_feature)
     
    feature_values = [rows[best_feature] for rows in data]
    unique_values = set(feature_values)
    for value in unique_values:
        new_labels = labels[:]
        tree[best_feature_label][value] = create_tree(split_data(data, best_feature, value), new_labels)
    
    return tree

In [17]:
def pretty_print_tree(tree):
    stack = []
    rules = set()
    
    def rule(tree, key, stack, rules):
        if isinstance(tree[key], dict):
            new_str = 'IF ' if not stack else ' and '
            stack.append(new_str + str(key) + ' is ')
            for k in tree[key].keys():
                stack.append(str(k))
                rule(tree[key], k, stack, rules)
                stack.pop()
            stack.pop()
        else:
            stack.append(' THEN ' + str(tree[key]))
            rules.add(''.join(stack))
            stack.pop()
            
    rule(tree, list(tree.keys())[0], stack, rules)
    print(os.linesep.join(rules))

In [18]:
tree = create_tree(dataset, ['Outlook', 'Temperature', 'Humidity', 'Windy', 'PlayTennis'])
tree_1 = create_tree(dataset_1, ['Temperature', 'Humidity', 'Windy', 'PlayTennis', 'Outlook'])
tree_2 = create_tree(dataset_2, ['Humidity', 'Windy', 'PlayTennis', 'Outlook', 'Temperature'])
tree_3 = create_tree(dataset_3, ['Windy', 'PlayTennis', 'Outlook', 'Temperature', 'Humidity'])
tree_4 = create_tree(dataset_4, ['PlayTennis', 'Outlook', 'Temperature', 'Humidity', 'Windy'])

In [19]:
tree

{'Outlook': {'Overcast': 'Yes',
  'Rainy': {'Humidity': {'High': 'No',
    'Normal': {'Windy': {False: 'Yes', True: 'No'}}}},
  'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}

In [20]:
pretty_print_tree(tree)

IF Outlook is Sunny and Sunny is Humidity and Humidity is High THEN No
IF Outlook is Overcast THEN Yes
IF Outlook is Rainy and Rainy is Humidity and Humidity is Normal and Normal is Windy and Windy is False THEN Yes
IF Outlook is Sunny and Sunny is Humidity and Humidity is Normal THEN Yes
IF Outlook is Rainy and Rainy is Humidity and Humidity is Normal and Normal is Windy and Windy is True THEN No
IF Outlook is Rainy and Rainy is Humidity and Humidity is High THEN No


In [21]:
tree_1

{'Temperature': {'Cold': 'Sunny',
  'Cool': {'Windy': {False: {'Outlook': {'Rainy': 'Rainy', 'Sunny': 'Sunny'}},
    True: {'PlayTennis': {'No': 'Rainy', 'Yes': 'Overcast'}}}},
  'Hot': {'Windy': {False: {'PlayTennis': {'No': 'Sunny', 'Yes': 'Overcast'}},
    True: 'Sunny'}},
  'Mild': {'Windy': {False: {'Humidity': {'High': {'Outlook': {'Rainy': 'Rainy',
        'Sunny': 'Sunny'}},
      'Normal': 'Rainy'}},
    True: {'PlayTennis': {'No': 'Rainy', 'Yes': 'Overcast'}}}}}}

In [22]:
pretty_print_tree(tree_1)

IF Temperature is Mild and Mild is Windy and Windy is False and False is Humidity and Humidity is High and High is Outlook and Outlook is Sunny THEN Sunny
IF Temperature is Hot and Hot is Windy and Windy is False and False is PlayTennis and PlayTennis is No THEN Sunny
IF Temperature is Mild and Mild is Windy and Windy is False and False is Humidity and Humidity is High and High is Outlook and Outlook is Rainy THEN Rainy
IF Temperature is Cool and Cool is Windy and Windy is True and True is PlayTennis and PlayTennis is Yes THEN Overcast
IF Temperature is Hot and Hot is Windy and Windy is True THEN Sunny
IF Temperature is Cool and Cool is Windy and Windy is True and True is PlayTennis and PlayTennis is No THEN Rainy
IF Temperature is Cold THEN Sunny
IF Temperature is Mild and Mild is Windy and Windy is False and False is Humidity and Humidity is Normal THEN Rainy
IF Temperature is Hot and Hot is Windy and Windy is False and False is PlayTennis and PlayTennis is Yes THEN Overcast
IF Tempe

In [23]:
tree_2

{'Humidity': {'High': {'Windy': {False: {'Outlook': {'Overcast': 'Hot',
      'Rainy': 'Mild',
      'Sunny': {'Temperature': {'Hot': 'Hot', 'Mild': 'Mild'}}}},
    True: {'Outlook': {'Overcast': 'Mild',
      'Rainy': 'Mild',
      'Sunny': 'Cold'}}}},
  'Normal': {'Outlook': {'Overcast': {'Windy': {False: 'Hot', True: 'Cool'}},
    'Rainy': {'Windy': {False: {'Temperature': {'Cool': 'Cool',
        'Mild': 'Mild'}},
      True: 'Cool'}},
    'Sunny': {'Windy': {False: 'Cool', True: 'Hot'}}}}}}

In [24]:
pretty_print_tree(tree_2)

IF Humidity is High and High is Windy and Windy is False and False is Outlook and Outlook is Rainy THEN Mild
IF Humidity is High and High is Windy and Windy is False and False is Outlook and Outlook is Sunny and Sunny is Temperature and Temperature is Hot THEN Hot
IF Humidity is Normal and Normal is Outlook and Outlook is Sunny and Sunny is Windy and Windy is False THEN Cool
IF Humidity is High and High is Windy and Windy is False and False is Outlook and Outlook is Overcast THEN Hot
IF Humidity is Normal and Normal is Outlook and Outlook is Rainy and Rainy is Windy and Windy is False and False is Temperature and Temperature is Mild THEN Mild
IF Humidity is Normal and Normal is Outlook and Outlook is Overcast and Overcast is Windy and Windy is True THEN Cool
IF Humidity is Normal and Normal is Outlook and Outlook is Rainy and Rainy is Windy and Windy is False and False is Temperature and Temperature is Cool THEN Cool
IF Humidity is Normal and Normal is Outlook and Outlook is Rainy and 

In [25]:
tree_3

{'Temperature': {'Cold': 'High',
  'Cool': 'Normal',
  'Hot': {'Windy': {False: {'PlayTennis': {'No': 'High',
      'Yes': {'Humidity': {'High': 'High', 'Normal': 'Normal'}}}},
    True: 'Normal'}},
  'Mild': {'PlayTennis': {'No': 'High',
    'Yes': {'Windy': {False: 'Normal', True: 'High'}}}}}}

In [26]:
pretty_print_tree(tree_3)

IF Temperature is Cold THEN High
IF Temperature is Hot and Hot is Windy and Windy is False and False is PlayTennis and PlayTennis is No THEN High
IF Temperature is Mild and Mild is PlayTennis and PlayTennis is Yes and Yes is Windy and Windy is False THEN Normal
IF Temperature is Hot and Hot is Windy and Windy is False and False is PlayTennis and PlayTennis is Yes and Yes is Humidity and Humidity is High THEN High
IF Temperature is Cool THEN Normal
IF Temperature is Mild and Mild is PlayTennis and PlayTennis is No THEN High
IF Temperature is Hot and Hot is Windy and Windy is True THEN Normal
IF Temperature is Mild and Mild is PlayTennis and PlayTennis is Yes and Yes is Windy and Windy is True THEN High
IF Temperature is Hot and Hot is Windy and Windy is False and False is PlayTennis and PlayTennis is Yes and Yes is Humidity and Humidity is Normal THEN Normal


In [27]:
tree_4

{'Temperature': {'Cold': True,
  'Cool': {'Outlook': {'Overcast': True,
    'Rainy': {'PlayTennis': {'No': True, 'Yes': False}},
    'Sunny': False}},
  'Hot': {'Outlook': {'Overcast': False,
    'Sunny': {'PlayTennis': {'No': False, 'Yes': True}}}},
  'Mild': {'Outlook': {'Overcast': True,
    'Rainy': {'PlayTennis': {'No': {'Windy': {False: False, True: True}},
      'Yes': False}},
    'Sunny': False}}}}

In [28]:
pretty_print_tree(tree_4)

IF Temperature is Mild and Mild is Outlook and Outlook is Sunny THEN False
IF Temperature is Cool and Cool is Outlook and Outlook is Overcast THEN True
IF Temperature is Cool and Cool is Outlook and Outlook is Rainy and Rainy is PlayTennis and PlayTennis is No THEN True
IF Temperature is Mild and Mild is Outlook and Outlook is Rainy and Rainy is PlayTennis and PlayTennis is No and No is Windy and Windy is False THEN False
IF Temperature is Mild and Mild is Outlook and Outlook is Rainy and Rainy is PlayTennis and PlayTennis is No and No is Windy and Windy is True THEN True
IF Temperature is Cold THEN True
IF Temperature is Mild and Mild is Outlook and Outlook is Rainy and Rainy is PlayTennis and PlayTennis is Yes THEN False
IF Temperature is Cool and Cool is Outlook and Outlook is Sunny THEN False
IF Temperature is Mild and Mild is Outlook and Outlook is Overcast THEN True
IF Temperature is Hot and Hot is Outlook and Outlook is Sunny and Sunny is PlayTennis and PlayTennis is Yes THEN Tr