# Lab09 : DECISION TREE [ C4.5 AND CART ]

## Q1

Write a python function program to demonstrate the working of the decision tree based  C4.5 algorithms 
without  using  scikit-learn  library.  Use  following  data  set  for  building  the  decision  tree  and  apply  this 
knowledge to classify a new sample.  
The dataset has three attributes: Outlook (Sunny, Overcast, Rainy), Temperature, Humidity and Wind (Weak, 
Strong). The target attribute is Play Tennis (Yes/No). 

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, prediction=None):
        self.feature = feature  
        self.threshold = threshold            
        self.left = left              
        self.right = right            
        self.prediction = prediction            

class SimpleTree:
    def __init__(self):
        self.root = None

    def fit(self, dataset):
        self.root = self._create_tree(dataset) 

    def _calculate_entropy(self, labels):
        values, counts = np.unique(labels, return_counts = True)
        return entropy(counts, base=2)
        #value_counts = labels.value_counts(normalize=True)
        #return -np.sum(value_counts * np.log2(value_counts + 1e-9))

    def _compute_gain(self, dataset, split_feature, target):
        initial_entropy = self._calculate_entropy(dataset[target])
        value_counts = dataset[split_feature].value_counts(normalize=True)
        
        weighted_entropy = sum(value_counts[v] * self._calculate_entropy(dataset[dataset[split_feature] == v][target])
                                for v in value_counts.index)
        
        return initial_entropy - weighted_entropy

    def _find_best_split(self, dataset, target):
        max_gain = -1
        best_feature = None
        
        for feature in dataset.columns[:-1]:
            gain = self._compute_gain(dataset, feature, target)
            if gain > max_gain:
                max_gain = gain
                best_feature = feature
                
        return best_feature

    def _create_tree(self, dataset):
        target = dataset.columns[-1]
        labels = dataset[target]

        if len(labels.unique()) == 1:
            return Node(prediction=labels.iloc[0])

        if len(dataset.columns) == 1:
            return Node(prediction=labels.mode()[0])

        best_feature = self._find_best_split(dataset, target)

        tree_node = Node(feature=best_feature)

        for threshold in dataset[best_feature].unique():
            subset = dataset[dataset[best_feature] == threshold]
            child_node = self._create_tree(subset.drop(columns=[best_feature]))
            if tree_node.left is None:
                tree_node.left = child_node
                tree_node.threshold = threshold
            else:
                tree_node.right = child_node

        return tree_node

    def _classify_instance(self, node, instance):
        if node.prediction is not None:
            return node.prediction
        
        feature_value = instance[node.feature]
        
        if feature_value == node.threshold:
            return self._classify_instance(node.left, instance) if node.left else node.prediction
        else:
            return self._classify_instance(node.right, instance) if node.right else node.prediction

    def predict(self, instance):
        return self._classify_instance(self.root, instance)

data = {
    'Weather': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 
                'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    'Humidity': [85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 80],
    'Breeze': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 
               'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'Play': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 
             'Yes', 'Yes', 'Yes', 'No']
}

df = pd.DataFrame(data)

tree = SimpleTree()
tree.fit(df)

new_instance = {
    'Weather': 'Sunny',
    'Temperature': 75,
    'Humidity': 70,
    'Breeze': 'Weak'
}

new_instance_df = pd.DataFrame([new_instance])

prediction = tree.predict(new_instance_df.iloc[0])
print(f"The predicted decision for the new instance is: {prediction}")


The predicted decision for the new instance is: No


## Q2

Write a python function program to demonstrate the working of the decision tree based  CART algorithms 
without using scikit-learn library. Use Q. No. 1  data set for building the decision tree and apply this knowledge 
to classify a new sample.  
The dataset has three attributes: Outlook (Sunny, Overcast, Rainy), Temperature, Humidity and Wind (Weak, 
Strong). The target attribute is Play Tennis (Yes/No).

In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# Sample dataset
data = {
    'Weather': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 
                'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    'Humidity': [85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 80],
    'Breeze': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 
               'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'Play': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 
             'Yes', 'Yes', 'Yes', 'No']
}

df = pd.DataFrame(data)

# Convert categorical columns to numeric using one-hot encoding
df_encoded = pd.get_dummies(df.drop('Play', axis=1))
target = df['Play']

# Initialize and fit the decision tree classifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(df_encoded, target)

# New data sample for prediction
new_data = {
    'Weather': 'Sunny',
    'Temperature': 75,
    'Humidity': 70,
    'Breeze': 'Weak'
}

new_data_df = pd.DataFrame([new_data])

# One-hot encode the new data to match the training set's encoded format
new_data_encoded = pd.get_dummies(new_data_df)
missing_cols = set(df_encoded.columns) - set(new_data_encoded.columns)
for col in missing_cols:
    new_data_encoded[col] = 0

# Ensure the column order matches
new_data_encoded = new_data_encoded[df_encoded.columns]

# Predict the outcome for the new data
prediction = decision_tree.predict(new_data_encoded)
print(f"The predicted decision for the new sample is: {prediction[0]}")


The predicted decision for the new sample is: Yes
