In [2]:
import numpy as np
import math
import csv

def read_data(filename):
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        headers = next(datareader)
        metadata = []
        traindata = []
        for name in headers:
            metadata.append(name)
        for row in datareader:
            traindata.append(row)
        return (metadata, traindata)

class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""

    def __str__(self):
        return self.attribute

def subtables(data, col, delete):
    dict = {}
    items = np.unique(data[:, col])
    count = np.zeros((items.shape[0], 1), dtype=np.int32)
    for x in range(items.shape[0]):
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                count[x] += 1
    for x in range(items.shape[0]):
        dict[items[x]] = np.empty((count[x].item(), data.shape[1]),
                                  dtype="|S32")
        pos = 0
        for y in range(data.shape[0]):
            if data[y, col] == items[x]:
                dict[items[x]][pos] = data[y]
                pos += 1
        if delete:
            dict[items[x]] = np.delete(dict[items[x]], col, 1)
    return items, dict

def entropy(S):
    items = np.unique(S)
    if items.size == 1:
        return 0
    counts = np.zeros((items.shape[0], 1))
    sums = 0
    for x in range(items.shape[0]):
        counts[x] = sum(S == items[x]) / (S.size * 1.0)
    for count in counts:
        sums += -1 * count.item() * math.log(count.item(), 2)
    return sums

def gain_ratio(data, col):
    items, dict = subtables(data, col, delete=False)
    total_size = data.shape[0]
    entropies = np.zeros((items.shape[0], 1))
    intrinsic = np.zeros((items.shape[0], 1))
    for x in range(items.shape[0]):
        ratio = dict[items[x]].shape[0]/(total_size * 1.0)
        entropies[x] = ratio * entropy(dict[items[x]][:, -1])
        intrinsic[x] = ratio * math.log(ratio, 2)
    total_entropy = entropy(data[:, -1])
    iv = -1 * sum(intrinsic)
    if iv == 0:
        return 0
    for x in range(entropies.shape[0]):
        total_entropy -= entropies[x]
    return total_entropy / iv

def create_node(data, metadata):
    if (np.unique(data[:, -1])).shape[0] == 1:
        node = Node("")
        node.answer = np.unique(data[:, -1])[0]
        return node
    gains = np.zeros((data.shape[1] - 1, 1))
    for col in range(data.shape[1] - 1):
        gains[col] = gain_ratio(data, col)
    split = np.argmax(gains)
    node = Node(metadata[split])
    metadata = np.delete(metadata, split, 0)
    items, dict = subtables(data, split, delete=True)
    for x in range(items.shape[0]):
        child = create_node(dict[items[x]], metadata)
        node.children.append((items[x], child))
    return node

def empty(size):
    s = ""
    for x in range(size):
        s += " "
    return s

def print_tree(node, level):
    if node.answer != "":
        print(empty(level), node.answer)
        return
    print(empty(level), node.attribute)
    for value, n in node.children:
        print(empty(level + 1), value)
        print_tree(n, level + 2)

if __name__ == "__main__":
    metadata, traindata = read_data("PlayTennis.csv")
    data = np.array(traindata)
    node = create_node(data, metadata)
    print_tree(node, 0)


 PlayTennis
  0
   b'Weak'
  1
   b'Strong'
  10
   b'Strong'
  11
   b'Strong'
  12
   b'Weak'
  13
   b'Strong'
  14
   b'Weak'
  15
   b'Strong'
  16
   b'Weak'
  17
   b'Strong'
  18
   b'Weak'
  19
   b'Strong'
  2
   b'Weak'
  3
   b'Weak'
  4
   b'Weak'
  5
   b'Strong'
  6
   b'Strong'
  7
   b'Weak'
  8
   b'Weak'
  9
   b'Weak'


In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.tree import DecisionTreeClassifier
# from sklearn import tree
# data = {
# 'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain',
# 'Sunny', 'Overcast', 'Overcast', 'Rain'],
# 'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild',
# 'Hot', 'Mild'],
# 'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal',
# 'Normal', 'Normal', 'High', 'Normal', 'High'],
# 'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak',
# 'Strong', 'Strong', 'Weak', 'Strong'],
# 'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
# }
# df = pd.DataFrame(data)
# # Mapping categorical data to numeric data
# df['Outlook'] = df['Outlook'].map({'Sunny': 0, 'Overcast': 1, 'Rain': 2})
# df['Temperature'] = df['Temperature'].map({'Hot': 0, 'Mild': 1, 'Cool': 2})
# df['Humidity'] = df['Humidity'].map({'High': 0, 'Normal': 1})
# df['Wind'] = df['Wind'].map({'Weak': 0, 'Strong': 1})
# df['PlayTennis'] = df['PlayTennis'].map({'No': 0, 'Yes': 1})
# # Splitting data into features (X) and target (y)
# X = df.drop('PlayTennis', axis=1)
# y = df['PlayTennis']
# # Building the decision tree using the ID3 algorithm (via DecisionTreeClassifier)
# clf = DecisionTreeClassifier(criterion='entropy')
# clf = clf.fit(X, y)
# # Visualizing the decision tree
# tree.plot_tree(clf, feature_names=X.columns, class_names=['No', 'Yes'], filled=True)
# # Predicting for a new sample
# # Example: Outlook=Sunny, Temperature=Cool, Humidity=High, Wind=Strong -> [0, 2, 0, 1]
# new_sample = np.array([[0, 2, 0, 1]])
# prediction = clf.predict(new_sample)
# # Output result
# result = 'Yes' if prediction[0] == 1 else 'No'
# print(f'Prediction for the new sample: PlayTennis = {result}')