In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the play-tennis data and drop the 'day' column so it is not used as a feature
dataset = pd.read_csv('play_tennis.csv').drop('day', axis=1)
# Every column before the last one is a feature; the last column is the target
features, target = dataset.columns[:-1], dataset.columns[-1]
print("Features = ", features)
print("Target Value = ",target)

Features =  Index(['outlook', 'temp', 'humidity', 'wind'], dtype='object')
Target Value =  play


In [3]:
# Calculate the entropy of a dataset for a target attribute
def entropy(target_col):
    """Return the base-2 Shannon entropy of the values in ``target_col``.

    Each distinct value with relative frequency ``p`` contributes
    ``-p * log2(p)``; the contributions of all distinct values are summed.
    An empty input yields 0.0.
    """
    _, class_counts = np.unique(target_col, return_counts=True)
    # Relative frequency of every distinct value
    probabilities = class_counts / np.sum(class_counts)
    return np.sum(-probabilities * np.log2(probabilities))



In [4]:
# Calculate the information gain (reduction in entropy) that would result by splitting the dataset on the chosen feature
def InfoGain(data,split_attribute_name,target_name="play"):
    """Return the information gain obtained by splitting ``data`` on one attribute.

    The gain is the entropy of the target column over all rows minus the
    weighted average of the target entropy inside each subset that shares
    one value of ``split_attribute_name``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain both named columns.
    split_attribute_name : str
        Column whose distinct values define the subsets.
    target_name : str, default "play"
        Column holding the class labels.

    Returns
    -------
    float
        ``entropy(target) - sum_v weight_v * entropy(target | attribute == v)``.
    """
    # Entropy of the target before splitting
    total_entropy = entropy(data[target_name])
    # Distinct attribute values and how many rows carry each of them
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    weights = counts / np.sum(counts)
    # Select each subset with a boolean mask. The previous where(...).dropna()
    # also discarded rows that had a missing value in any *other* column, so
    # the subset entropy was computed on fewer rows than the weight accounted for.
    weighted_entropy = np.sum([
        weight * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, weight in zip(vals, weights)
    ])
    return total_entropy - weighted_entropy

In [5]:
# Build an ID3 decision tree from the given dataset (every distinct value of a feature becomes its own branch)
def decision_tree(data, originaldata, features, target_attribute_name="play", parent_node_class = None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows belonging to the current node.
    originaldata : pandas.DataFrame
        The full training set; its most frequent target value is returned
        when ``data`` is empty.
    features : sequence of str
        Column names still available for splitting.
    target_attribute_name : str, default "play"
        Column holding the class labels.
    parent_node_class : object, default None
        Most frequent target value of the parent node; returned when no
        features are left to split on.

    Returns
    -------
    dict or label
        A leaf is a single target value. An inner node has the form
        ``{feature: {feature_value: subtree_or_leaf, ...}}``.
    """
    # An empty subset has to be handled before anything else: indexing the
    # (empty) array of unique labels below would raise IndexError.
    if len(data) == 0:
        all_labels, all_counts = np.unique(originaldata[target_attribute_name], return_counts=True)
        return all_labels[np.argmax(all_counts)]

    labels, label_counts = np.unique(data[target_attribute_name], return_counts=True)
    # Pure node: every row has the same target value
    if len(labels) <= 1:
        return labels[0]
    # No feature left to split on: fall back to the parent's majority class
    if len(features) == 0:
        return parent_node_class

    # Majority class of this node, handed to the children as their fallback
    majority_class = labels[np.argmax(label_counts)]
    # Pick the feature with the largest information gain
    gains = [InfoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]
    # The chosen feature is not available to the subtrees
    remaining_features = [feature for feature in features if feature != best_feature]

    tree = {best_feature: {}}
    # Grow one branch per distinct value of the chosen feature
    for value in np.unique(data[best_feature]):
        # A boolean mask keeps column dtypes and does not discard rows that
        # merely have a missing value in some other column.
        sub_data = data[data[best_feature] == value]
        # Recurse with the caller's original data (not a module-level global)
        tree[best_feature][value] = decision_tree(
            sub_data, originaldata, remaining_features, target_attribute_name, majority_class
        )
    return tree

In [6]:
# Grow the tree on the full dataset; the same frame is also passed as the "original" data
tree = decision_tree(data=dataset, originaldata=dataset, features=features)
# Show the resulting tree
print(tree)


{'temp': {40: 'Yes', 50: 'No', 60: 'Yes', 70: {'outlook': {'Rain': {'wind': {'Strong': 'No', 'Weak': 'Yes'}}, 'Sunny': 'No'}}, 75: 'Yes', 80: 'Yes', 90: {'outlook': {'Overcast': 'Yes', 'Sunny': 'No'}}, 100: 'No'}}


In [7]:
# Predict the target value for a query by walking the decision tree
def predict(query,tree,default = 1):
    """Return the tree's prediction for ``query``.

    Parameters
    ----------
    query : dict
        Maps feature names to the feature values of one sample.
    tree : dict
        Nested dictionary of the form ``{feature: {value: subtree_or_leaf}}``.
    default : object, default 1
        Returned when the tree has no branch for the query's value of the
        tested feature, or when none of the query's keys appears in ``tree``.

    Returns
    -------
    object
        The leaf reached by following the query's feature values, or
        ``default`` if no matching branch exists.
    """
    for key in query:
        if key not in tree:
            continue
        try:
            result = tree[key][query[key]]
        except (KeyError, TypeError):
            # Value not seen for this feature (or not usable as a key)
            return default
        if isinstance(result, dict):
            # Keep descending, carrying the caller's default along
            return predict(query, result, default)
        return result
    # None of the query's features is tested at this node
    return default
            


In [8]:
# Use the feature values of the first row (last column excluded) as the query
query = dataset.iloc[0, :-1].to_dict()
# Ask the tree for a prediction; 1.0 is the fallback value handed to predict
prediction = predict(query, tree, default=1.0)
print("Prediction for the given query is: ",prediction)
    

Prediction for the given query is:  No


In [10]:
# Second check: use the feature values of the third row (last column excluded) as the query
query = dataset.iloc[2, :-1].to_dict()
# Ask the tree for a prediction; 1.0 is the fallback value handed to predict
prediction = predict(query, tree, default=1.0)
print("Prediction for the given query is: ",prediction)


Prediction for the given query is:  Yes
