# Installation of Pandas and Ucimlrepo

In [1]:
#!pip install pandas
#!pip install ucimlrepo
#%pip install matplotlib


In [2]:
import pandas as pd
from ucimlrepo import fetch_ucirepo 
import numpy as np
import math
import matplotlib.pyplot as plt
import string
from pprint import pprint
import random

# Breast Cancer Classification Tree

In [4]:
# Fetch dataset my version
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Data (as pandas dataframes) 
X = breast_cancer_wisconsin_diagnostic.data.features
y = breast_cancer_wisconsin_diagnostic.data.targets
z = breast_cancer_wisconsin_diagnostic.targets

# Place the target variable as the last column of your dataset
data = pd.DataFrame(data=X, columns=z)
data['Diagnosis'] = y

# Split the dataset function
def split_data(data, test_size):
    data = data.sample(frac=1)  # Shuffle the data
    train_data = data.iloc[:int(len(data)*(1-test_size))]
    test_data = data.iloc[int(len(data)*(1-test_size)):]
    return train_data, test_data

# Convert all feature columns to strings for consistency
data = data.astype(str)

# Split the data into 80% training and 20% testing
train_data, test_data = split_data(data, 0.2)

# Function to calculate entropy
def calculate_entropy(data):
    target_col = data['Diagnosis']
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = sum([-counts[i] / sum(counts) * np.log2(counts[i] / sum(counts)) for i in range(len(elements))])
    return entropy

# Function to calculate information gain
def calculate_information_gain(data, attribute):
    total_entropy = calculate_entropy(data)
    vals, counts = np.unique(data[attribute], return_counts=True)
    weighted_entropy = sum(
        (counts[i] / sum(counts)) * calculate_entropy(data.where(data[attribute] == vals[i]).dropna())
        for i in range(len(vals))
    )
    information_gain = total_entropy - weighted_entropy
    return information_gain

# ID3 Decision Tree Algorithm
def id3(data, original_data, features, target_attribute_name="Diagnosis", parent_node_class=None):
    # Check if all target values are the same
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    
    # If dataset is empty, return the mode of the target attribute in the original dataset
    elif len(data) == 0:
        return np.unique(original_data[target_attribute_name])[np.argmax(np.unique(original_data[target_attribute_name], return_counts=True)[1])]
    
    # If features are empty, return the mode of the target attribute in the dataset
    elif len(features) == 0:
        return parent_node_class
    
    # If none of the above, create the tree recursively
    else:
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])]
        item_values = [calculate_information_gain(data, feature) for feature in features]
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        
        # Create the tree structure with the best feature as the root
        tree = {best_feature: {}}
        features = [i for i in features if i != best_feature]
        
        # Grow tree branches for each value of the best feature
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = id3(sub_data, original_data, features, target_attribute_name, parent_node_class)
            tree[best_feature][value] = subtree
            
        return tree

# Train the ID3 decision tree on training data
features = train_data.columns[:-1]  # Exclude the target column
decision_tree = id3(train_data, train_data, features, target_attribute_name="Diagnosis")

# Pretty-print the generated decision tree
pprint(decision_tree)

# Function to predict using the decision tree
def predict(query, tree, default='M'):  # Set default to 'M' or 'B' as needed
    for key in list(query.keys()):
        if key in tree.keys():
            try:
                result = tree[key][query[key]]
            except KeyError:
                return default
            
            # If the result is a subtree, continue down the tree
            if isinstance(result, dict):
                return predict(query, result, default)
            else:
                return result
    return default  # Return the default if no key is found in the tree

# Test the decision tree on the test data
def test_decision_tree(test_data, tree):
    queries = test_data.iloc[:, :-1].to_dict(orient="records")
    predicted = pd.Series([predict(query, tree) for query in queries]).reset_index(drop=True)
    accuracy = np.sum(predicted == test_data['Diagnosis'].reset_index(drop=True)) / len(test_data)
    return accuracy

# Calculate accuracy
accuracy = test_decision_tree(test_data, decision_tree)
print("Accuracy of ID3 Decision Tree:", accuracy)

# Apply the decision tree to the test dataset
predictions = test_data.apply(lambda x: predict(x, decision_tree), axis=1)
print("\nPredictions:")
print(predictions)

# Optional: Compare predictions to actual diagnosis
print("\nActual Values:")
print(test_data['Diagnosis'])

# Function to calculate accuracy
def calculate_accuracy(predictions, actual):
    correct = sum(predictions[i] == actual[i] for i in range(len(predictions)))
    return correct / len(predictions)

# Print accuracy result
print("\nAccuracy:", calculate_accuracy(predictions.tolist(), test_data['Diagnosis'].tolist()))
print(f"\nTest Accuracy: {accuracy*100:.2f}%")


{'smoothness2': {'0.001713': 'B',
                 '0.002667': 'M',
                 '0.002826': 'M',
                 '0.002838': 'B',
                 '0.002866': 'M',
                 '0.003139': 'M',
                 '0.003245': 'B',
                 '0.003271': 'B',
                 '0.00328': 'M',
                 '0.00329': 'M',
                 '0.003308': 'B',
                 '0.003338': 'B',
                 '0.00335': 'M',
                 '0.003418': 'B',
                 '0.003443': 'B',
                 '0.003457': 'B',
                 '0.003492': 'B',
                 '0.003495': 'B',
                 '0.003535': 'B',
                 '0.003629': 'B',
                 '0.003632': 'B',
                 '0.003653': 'B',
                 '0.003659': 'M',
                 '0.003681': 'B',
                 '0.003704': 'B',
                 '0.003741': 'B',
                 '0.003818': 'B',
                 '0.003828': 'B',
                 '0.003872': 'M',
                 

# Mushroom Classification Tree

In [35]:
# Fetch mushroom dataset
mushroom = fetch_ucirepo(id=73)

# Extract features and target variable
X = mushroom.data.features  # Attributes
y = mushroom.data.targets   # Target variable
z = mushroom.targets        # Feature names

# Create a DataFrame and add the target column ('poisonous') as the last column
data = pd.DataFrame(data=X, columns=z)
data['poisonous'] = y

# Convert all feature columns to strings to ensure consistent data types
data = data.astype(str)

# Function to split the dataset into training and testing sets
def split_data(data, test_size):
    data = data.sample(frac=1)  # Shuffle the data
    train_data = data.iloc[:int(len(data)*(1-test_size))]  # 80% training data
    test_data = data.iloc[int(len(data)*(1-test_size)):]   # 20% testing data
    return train_data, test_data

# Split the data into 80% training and 20% testing
train_data, test_data = split_data(data, 0.2)

# Function to calculate entropy
def calculate_entropy(data):
    target_col = data['poisonous']
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = sum([-counts[i]/sum(counts) * np.log2(counts[i]/sum(counts)) for i in range(len(elements))])
    return entropy

# Function to calculate information gain
def calculate_information_gain(data, attribute):
    total_entropy = calculate_entropy(data)
    vals, counts = np.unique(data[attribute], return_counts=True)
    weighted_entropy = sum((counts[i]/sum(counts)) * calculate_entropy(data.where(data[attribute] == vals[i]).dropna()) for i in range(len(vals)))
    information_gain = total_entropy - weighted_entropy
    return information_gain

# ID3 Decision Tree Algorithm
def id3(data, original_data, features, target_attribute_name="poisonous", parent_node_class=None):
    # Check if all target values are the same
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    
    # If dataset is empty, return the mode of the target attribute in the original dataset
    elif len(data) == 0:
        return np.unique(original_data[target_attribute_name])[np.argmax(np.unique(original_data[target_attribute_name], return_counts=True)[1])]
    
    # If features are empty, return the mode of the target attribute in the dataset
    elif len(features) == 0:
        return parent_node_class
    
    # If none of the above, create the tree recursively
    else:
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])]
        item_values = [calculate_information_gain(data, feature) for feature in features]
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        
        # Create the tree structure with the best feature as the root
        tree = {best_feature: {}}
        features = [i for i in features if i != best_feature]
        
        # Grow tree branches for each value of the best feature
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = id3(sub_data, original_data, features, target_attribute_name, parent_node_class)
            tree[best_feature][value] = subtree
            
        return tree

# Train the ID3 decision tree on training data
features = train_data.columns[:-1]  # Exclude the target column
decision_tree = id3(train_data, train_data, features)

# Pretty-print the generated decision tree
pprint(decision_tree)

# Function to predict using the decision tree
def predict(query, tree, default='poisonous'):
    for key in list(query.keys()):
        if key in tree.keys():
            try:
                result = tree[key][query[key]]
            except:
                return default
            
            # If the result is not a subtree, return the result
            if isinstance(result, dict):
                return predict(query, result)
            else:
                return result

# Test the decision tree on the test data
def test_decision_tree(test_data, tree):
    queries = test_data.iloc[:, :-1].to_dict(orient="records")
    predicted = pd.Series([predict(query, tree) for query in queries])
    predicted.index = test_data.index  # Align indices
    accuracy = np.sum(predicted == test_data['poisonous']) / len(test_data)
    return accuracy

# Calculate accuracy
accuracy = test_decision_tree(test_data, decision_tree)
print("Accuracy of ID3 Decision Tree:", accuracy)

# Apply the decision tree to the test dataset
predictions = test_data.apply(lambda x: predict(x, decision_tree), axis=1)
print("\nPredictions:")
print(predictions)

# Optional: Compare predictions to actual diagnosis
print("\nActual Values:")
print(test_data['poisonous'])

def accuracy(predictions, actual):
    correct = 0
    for i in range(len(predictions)):
        if predictions[i] == actual[i]:
            correct += 1
    return correct / len(predictions)

print("\nAccuracy:")
print(accuracy(predictions.tolist(), test_data['poisonous'].tolist()))
print(f"\nTest Accuracy: {accuracy(predictions.tolist(), test_data['poisonous'].tolist())*100:.2f}%")


{'odor': {'a': 'e',
          'c': 'p',
          'f': 'p',
          'l': 'e',
          'm': 'p',
          'n': {'spore-print-color': {'b': 'e',
                                      'h': 'e',
                                      'k': 'e',
                                      'n': 'e',
                                      'o': 'e',
                                      'r': 'p',
                                      'w': {'habitat': {'d': {'gill-size': {'b': 'e',
                                                                            'n': 'p'}},
                                                        'g': 'e',
                                                        'l': {'cap-color': {'c': 'e',
                                                                            'n': 'e',
                                                                            'w': 'p',
                                                                            'y': 'p'}},
                          

# Letter Recognition Regression Tree

In [10]:
# Fetch the Letter Recognition dataset
letter_recognition = fetch_ucirepo(id=59)

# Data (as pandas dataframes)
X = letter_recognition.data.features  # Features (attributes)
y = letter_recognition.data.targets   # Target variable (lettr)
z = letter_recognition.data.feature_names  # Attribute names

# Place the target variable ('lettr') column as the last column in your data
data = pd.DataFrame(data=X, columns=z)
data['lettr'] = y  # Add the target variable to the dataframe

# Function to split the dataset into training and testing sets
def split_data(data, test_size):
    data = data.sample(frac=1)  # Shuffle the data
    train_data = data.iloc[:int(len(data) * (1 - test_size))]
    test_data = data.iloc[int(len(data) * (1 - test_size)):]
    return train_data, test_data

# Split the data into 80% training and 20% testing
train_data, test_data = split_data(data, 0.2)
X_train, y_train = train_data.iloc[:, :-1], train_data[['lettr']]
X_test, y_test = test_data.iloc[:, :-1], test_data[['lettr']]

# Select a random subset of 5 features
all_features = X_train.columns
selected_features = random.sample(list(all_features), 3)
print("Selected Features:", selected_features)

# Function to calculate variance for categorical target
def calculate_variance(data):
    target_col = data['lettr']
    variance = np.var(target_col.astype('category').cat.codes)
    return variance

# Function to calculate variance reduction for a split
def calculate_variance_reduction(data, attribute):
    total_variance = calculate_variance(data)
    vals, counts = np.unique(data[attribute], return_counts=True)
    weighted_variance = sum(
        (counts[i] / sum(counts)) * calculate_variance(data.where(data[attribute] == vals[i]).dropna())
        for i in range(len(vals))
    )
    variance_reduction = total_variance - weighted_variance
    return variance_reduction

# ID3 Regression Tree Algorithm
def id3_regression(data, original_data, features, target_attribute_name="lettr", parent_node_value=None):
    # If all target values have the same value, return that value
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    
    # If dataset is empty, return the mode of the target attribute in the original dataset
    elif len(data) == 0:
        return original_data[target_attribute_name].mode()[0]
    
    # If features are empty, return the mode of the target attribute in the dataset
    elif len(features) == 0:
        return parent_node_value
    
    # If none of the above, create the tree recursively
    else:
        parent_node_value = data[target_attribute_name].mode()[0]
        item_values = [calculate_variance_reduction(data, feature) for feature in features]
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        
        # Create the tree structure with the best feature as the root
        tree = {best_feature: {}}
        features = [i for i in features if i != best_feature]
        
        # Grow tree branches for each value of the best feature
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = id3_regression(sub_data, original_data, features, target_attribute_name, parent_node_value)
            tree[best_feature][value] = subtree
            
        return tree

# Train the ID3 regression tree on the selected 5 features
regression_tree = id3_regression(pd.concat([X_train[selected_features], y_train], axis=1), pd.concat([X_train[selected_features], y_train], axis=1), selected_features)

# Pretty-print the generated regression tree
pprint(regression_tree)

# Function to predict using the regression tree
def predict_regression(query, tree, default='A'):
    for key in list(query.keys()):
        if key in tree.keys():
            try:
                result = tree[key][query[key]]
            except KeyError:
                return default
            
            # If the result is a subtree, continue down the tree
            if isinstance(result, dict):
                return predict_regression(query, result, default)
            else:
                return result
    return default  # Return the default if no key is found in the tree

# Test the regression tree on the test data
def test_regression_tree(test_data, tree):
    queries = test_data[selected_features].to_dict(orient="records")
    predicted = pd.Series([predict_regression(query, tree) for query in queries], index=test_data.index)
    return predicted

# Predict the values for the test set
predictions = test_regression_tree(X_test, regression_tree)

# Print actual vs predicted values
print("\nActual vs Predicted:")
print(pd.DataFrame({'Actual': y_test['lettr'], 'Predicted': predictions}))

# Calculate accuracy
accuracy = np.sum(predictions == y_test['lettr']) / len(y_test)
print("\nAccuracy of ID3 Regression Tree:", accuracy)
print(f"\nTest Accuracy: {accuracy*100:.2f}%")


Selected Features: ['xybar', 'yegvx', 'x-bar']
{'x-bar': {np.int64(0): {'xybar': {np.float64(0.0): 'L',
                                   np.float64(1.0): 'L',
                                   np.float64(11.0): 'F',
                                   np.float64(12.0): 'F',
                                   np.float64(13.0): 'F'}},
           np.int64(1): {'xybar': {np.float64(0.0): 'L',
                                   np.float64(1.0): 'L',
                                   np.float64(9.0): 'W',
                                   np.float64(10.0): 'W',
                                   np.float64(11.0): {'yegvx': {np.float64(5.0): 'W',
                                                                np.float64(6.0): 'F',
                                                                np.float64(7.0): 'F'}},
                                   np.float64(12.0): {'yegvx': {np.float64(5.0): 'F',
                                                                np.float64(6.0): 'F',
  

# Adaboost and Random Forest Letter Recognition

In [11]:
import numpy as np
import pandas as pd
import random
from pprint import pprint

# Parameters for AdaBoost and Random Forest
NUM_TREES = 4  # Number of trees for ensemble
NUM_FEATURES = 4  # Number of random features per tree

# Fetch the Letter Recognition dataset
letter_recognition = fetch_ucirepo(id=59)

# Data (as pandas dataframes)
X = letter_recognition.data.features  # Features (attributes)
y = letter_recognition.data.targets   # Target variable (lettr)
z = letter_recognition.data.feature_names  # Attribute names

# Place the target variable ('lettr') column as the last column in your data
data = pd.DataFrame(data=X, columns=z)
data['lettr'] = y  # Add the target variable to the dataframe

# Function to split the dataset into training and testing sets
def split_data(data, test_size):
    data = data.sample(frac=1)  # Shuffle the data
    train_data = data.iloc[:int(len(data) * (1 - test_size))]
    test_data = data.iloc[int(len(data) * test_size):]
    return train_data, test_data

# Split the data into 80% training and 20% testing
train_data, test_data = split_data(data, 0.2)
X_train, y_train = train_data.iloc[:, :-1], train_data[['lettr']]
X_test, y_test = test_data.iloc[:, :-1], test_data[['lettr']]

all_features = X_train.columns
selected_features = random.sample(list(all_features), 3)


# Function to calculate variance for categorical target
def calculate_variance(data):
    target_col = data['lettr']
    variance = np.var(target_col.astype('category').cat.codes)
    return variance

# Function to calculate variance reduction for a split
def calculate_variance_reduction(data, attribute):
    total_variance = calculate_variance(data)
    vals, counts = np.unique(data[attribute], return_counts=True)
    weighted_variance = sum(
        (counts[i] / sum(counts)) * calculate_variance(data.where(data[attribute] == vals[i]).dropna())
        for i in range(len(vals))
    )
    variance_reduction = total_variance - weighted_variance
    return variance_reduction

# ID3 Regression Tree Algorithm
def id3_regression(data, original_data, features, target_attribute_name="lettr", parent_node_value=None):
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    elif len(data) == 0:
        return original_data[target_attribute_name].mode()[0]
    elif len(features) == 0:
        return parent_node_value
    else:
        parent_node_value = data[target_attribute_name].mode()[0]
        item_values = [calculate_variance_reduction(data, feature) for feature in features]
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        
        tree = {best_feature: {}}
        features = [i for i in features if i != best_feature]
        
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = id3_regression(sub_data, original_data, features, target_attribute_name, parent_node_value)
            tree[best_feature][value] = subtree
            
        return tree

# Function to predict using the regression tree
def predict_regression(query, tree, default='A'):
    for key in list(query.keys()):
        if key in tree.keys():
            try:
                result = tree[key][query[key]]
            except KeyError:
                return default
            if isinstance(result, dict):
                return predict_regression(query, result, default)
            else:
                return result
    return default

# Test the regression tree on the test data
def test_regression_tree(test_data, tree):
    queries = test_data.to_dict(orient="records")
    predicted = pd.Series([predict_regression(query, tree) for query in queries], index=test_data.index)
    return predicted

# AdaBoost Implementation with Index Alignment Fix
def adaboost(train_data, num_trees=NUM_TREES):
    weights = pd.Series(np.ones(len(train_data)) / len(train_data), index=train_data.index)
    classifiers = []
    alphas = []
    
    for i in range(num_trees):
        # Sample according to weights
        train_data_weighted = train_data.sample(frac=1, weights=weights, replace=True).reset_index(drop=True)
        features = random.sample(list(X_train.columns), NUM_FEATURES)
        
        # Train weak learner
        tree = id3_regression(train_data_weighted, train_data_weighted, features)
        
        # Calculate errors and alpha
        predictions = test_regression_tree(train_data_weighted[features], tree)
        misclassified = (predictions != train_data_weighted['lettr']).astype(int)
        
        # Calculate error rate with weighted misclassifications
        error = np.sum(weights * misclassified) / np.sum(weights)
        
        # If error is 0 or 1, stop to avoid division by zero or extreme values in alpha calculation
        if error == 0 or error == 1:
            break
            
        alpha = 0.5 * np.log((1 - error) / (error + 1e-10))
        alphas.append(alpha)
        classifiers.append(tree)
        
        # Update weights with the correct alignment
        weights = weights * np.exp(alpha * misclassified)
        weights /= weights.sum()  # Normalize weights to sum to 1
    
    return classifiers, alphas

# Function to predict using AdaBoost
def adaboost_predict(classifiers, alphas, X):
    results = pd.DataFrame()
    for i, tree in enumerate(classifiers):
        predictions = test_regression_tree(X, tree)
        results[i] = predictions.map(lambda x: alphas[i] if x == 'A' else -alphas[i])
    return results.sum(axis=1).map(lambda x: 'A' if x > 0 else 'B')  # Example of majority voting

# Random Forest Implementation
def random_forest(train_data, num_trees=NUM_TREES):
    classifiers = []
    
    for i in range(num_trees):
        # Bootstrap sample and feature subset
        sample_data = train_data.sample(frac=0.8, replace=True)
        features = random.sample(list(X_train.columns), NUM_FEATURES)
        
        # Train tree
        tree = id3_regression(sample_data, sample_data, features)
        classifiers.append(tree)
    
    return classifiers

# Function to predict using Random Forest
def random_forest_predict(classifiers, X):
    results = pd.DataFrame()
    for i, tree in enumerate(classifiers):
        results[i] = test_regression_tree(X, tree)
    return results.mode(axis=1)[0]  # Majority voting

# Train AdaBoost and Random Forest models


classifiers_adaboost, alphas = adaboost(train_data)
classifiers_rf = random_forest(train_data)

# Test predictions
print("\nAdaBoost Predictions:")
adaboost_preds = adaboost_predict(classifiers_adaboost, alphas, X_test[selected_features])
print(adaboost_preds)

print("\nRandom Forest Predictions:")
rf_preds = random_forest_predict(classifiers_rf, X_test[selected_features])
print(rf_preds)

# Calculate AdaBoost accuracy
adaboost_accuracy = np.sum(adaboost_preds == y_test['lettr']) / len(y_test)
print("\nAdaBoost Accuracy:", adaboost_accuracy)

print("\nActual vs Predicted:")
print(pd.DataFrame({'Actual': y_test['lettr'], 'AdaBoost Predicted': adaboost_preds, 'Random Forest Predicted': rf_preds}))

# Calculate Random Forest accuracy
rf_accuracy = np.sum(rf_preds == y_test['lettr']) / len(y_test)
print("\nRandom Forest Accuracy:", rf_accuracy)
print(f"\nTest Accuracy: {accuracy*100:.2f}%")


AdaBoost Predictions:
717      A
3512     A
7666     A
2434     A
1464     A
        ..
19518    A
12025    A
16719    A
12370    A
10269    A
Length: 16000, dtype: object

Random Forest Predictions:
717      A
3512     A
7666     A
2434     A
1464     A
        ..
19518    A
12025    A
16719    A
12370    A
10269    A
Name: 0, Length: 16000, dtype: object

AdaBoost Accuracy: 0.0399375

Actual vs Predicted:
      Actual AdaBoost Predicted Random Forest Predicted
717        J                  A                       A
3512       B                  A                       A
7666       T                  A                       A
2434       F                  A                       A
1464       Z                  A                       A
...      ...                ...                     ...
19518      G                  A                       A
12025      C                  A                       A
16719      E                  A                       A
12370      T                

# Letter Recognition Classification Tree

In [36]:
# Fetch the Letter Recognition dataset
letter_recognition = fetch_ucirepo(id=59)

# Data (as pandas DataFrames)
X = letter_recognition.data.features  # Features
y = letter_recognition.data.targets   # Target variable (lettr)
z = letter_recognition.data.feature_names  # Attribute names

# Combine features and target into a single DataFrame
data = pd.DataFrame(data=X, columns=z)
data['lettr'] = y  # Add the target variable to the DataFrame

# Function to split the dataset into training and testing sets
def split_data(data, test_size):
    data = data.sample(frac=1, random_state=42)  # Shuffle with a fixed seed for reproducibility
    train_data = data.iloc[:int(len(data) * (1 - test_size))]
    test_data = data.iloc[int(len(data) * test_size):]
    return train_data, test_data

# Split the data into 80% training and 20% testing
train_data, test_data = split_data(data, 0.2)
X_train, y_train = train_data.iloc[:, :-1], train_data[['lettr']]
X_test, y_test = test_data.iloc[:, :-1], test_data[['lettr']]

# Select a random subset of 5 features
all_features = X_train.columns
selected_features = random.sample(list(all_features), 5)
print("Selected Features:", selected_features)

# Calculate entropy
def entropy(data, target_col):
    values, counts = np.unique(data[target_col], return_counts=True)
    entropy = -sum((counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(values)))
    return entropy

# Calculate information gain
def info_gain(data, split_attribute, target_col):
    total_entropy = entropy(data, target_col)
    values, counts = np.unique(data[split_attribute], return_counts=True)
    weighted_entropy = sum((counts[i] / np.sum(counts)) * entropy(data[data[split_attribute] == values[i]], target_col)
                           for i in range(len(values)))
    return total_entropy - weighted_entropy

# ID3 algorithm to build the classification tree
def id3(data, original_data, features, target_col, parent_node=None):
    unique_targets = np.unique(data[target_col])

    # If all target values are the same, return that value
    if len(unique_targets) == 1:
        return unique_targets[0]

    # If the dataset is empty, return the mode target in the original dataset
    elif len(data) == 0:
        return np.unique(original_data[target_col])[np.argmax(np.unique(original_data[target_col], return_counts=True)[1])]

    # If there are no more features to split on, return the parent node
    elif len(features) == 0:
        return parent_node

    # Otherwise, select the best feature to split on
    else:
        # Set the default value for the parent node (majority class)
        parent_node = np.unique(data[target_col])[np.argmax(np.unique(data[target_col], return_counts=True)[1])]
        
        # Select the feature with the highest information gain
        item_gains = [info_gain(data, feature, target_col) for feature in features]
        best_feature_index = np.argmax(item_gains)
        best_feature = features[best_feature_index]
        
        # Create the tree structure
        tree = {best_feature: {}}
        
        # Remove the best feature from the list of features
        features = [f for f in features if f != best_feature]
        
        # Grow a branch under each value of the best feature
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = id3(sub_data, original_data, features, target_col, parent_node)
            tree[best_feature][value] = subtree
        
        return tree

# Build the tree using the ID3 algorithm with the selected subset of features
target_col = 'lettr'
tree = id3(train_data, train_data, selected_features, target_col)
print("\nID3 Classification Tree for Letter Recognition Dataset:")
pprint(tree)

# Prediction function to classify new data based on the tree
def predict(tree, instance):
    if not isinstance(tree, dict):
        return tree
    feature = next(iter(tree))
    value = instance[feature]
    if value in tree[feature]:
        return predict(tree[feature][value], instance)
    else:
        return None  # Handle unseen feature values gracefully

# Evaluate the model on the test data
def evaluate(tree, test_data, target_col):
    predictions = test_data.apply(lambda x: predict(tree, x), axis=1)
    actual_values = test_data[target_col]
    accuracy = (predictions == actual_values).mean() * 100
    print("\nPredicted vs Actual:")
    for pred, actual in zip(predictions, actual_values):
        print(f"Predicted: {pred}, Actual: {actual}")
    print(f"\nAccuracy: {accuracy:.2f}%")
    return accuracy

# Test and evaluate the tree
accuracy = evaluate(tree, test_data, target_col)
print(f"\nTest Accuracy: {accuracy:.2f}%")


Selected Features: ['y-box', 'high', 'y2bar', 'xegvy', 'onpix']

ID3 Classification Tree for Letter Recognition Dataset:
{'xegvy': {np.int64(1): {'y-box': {np.float64(9.0): 'Q',
                                   np.float64(10.0): 'M',
                                   np.float64(11.0): 'M',
                                   np.float64(12.0): 'M',
                                   np.float64(13.0): 'M',
                                   np.float64(14.0): 'M',
                                   np.float64(15.0): 'M'}},
           np.int64(2): {'y-box': {np.float64(7.0): 'K',
                                   np.float64(9.0): 'M',
                                   np.float64(10.0): 'M',
                                   np.float64(11.0): 'A',
                                   np.float64(12.0): 'M',
                                   np.float64(13.0): {'y2bar': {np.float64(1.0): 'A',
                                                                np.float64(2.0): 'M'}},
          

# Ecoli Regression Tree

In [32]:
import numpy as np
import pandas as pd
from pprint import pprint

# Fetch the E. coli dataset
ecoli = fetch_ucirepo(id=39)

# Data (as pandas dataframes)
X = ecoli.data.features  # Features (attributes)
y = ecoli.data.targets    # Target variable (class)
z = ecoli.targets         # Attribute names

# Place the target variable (class) column as the last column in your data
data = pd.DataFrame(data=X, columns=z)
data['class'] = y  # Add the target variable to the dataframe

# Function to discretize continuous variables
def discretize(data, bins=5):
    for col in data.columns:
        if data[col].dtype != 'object':  # If the column is numeric
            data[col] = pd.cut(data[col], bins, labels=False)
    return data

# Discretize continuous features
data = discretize(data)

# Split the dataset into training and testing sets
def split_data(data, test_size):
    data = data.sample(frac=1, random_state=42)  # Shuffle the data
    train_data = data.iloc[:int(len(data) * (1 - test_size))]
    test_data = data.iloc[int(len(data) * (1 - test_size)):]
    return train_data, test_data

# Split the data into 80% training and 20% testing
train_data, test_data = split_data(data, 0.2)
X_train, y_train = train_data.iloc[:, :-1], train_data[['class']]
X_test, y_test = test_data.iloc[:, :-1], test_data[['class']]

# Function to calculate entropy for a dataset
def entropy(data, target_attribute='class'):
    values, counts = np.unique(data[target_attribute], return_counts=True)
    entropy = sum((-counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(values)))
    return entropy

# Function to calculate information gain for a split
def information_gain(data, attribute, target_attribute='class'):
    total_entropy = entropy(data, target_attribute)
    values, counts = np.unique(data[attribute], return_counts=True)
    
    weighted_entropy = sum((counts[i] / np.sum(counts)) * entropy(data[data[attribute] == values[i]], target_attribute) for i in range(len(values)))
    return total_entropy - weighted_entropy

# ID3 Classification Tree Algorithm
def id3_classification(data, original_data, features, target_attribute="class", parent_node_value=None, max_depth=10, depth=0):
    # If all target values have the same value, return that value
    if len(np.unique(data[target_attribute])) <= 1:
        return np.unique(data[target_attribute])[0]
    
    # If dataset is empty, return the most common value in the original dataset
    elif len(data) == 0:
        return np.unique(original_data[target_attribute])[np.argmax(np.unique(original_data[target_attribute], return_counts=True)[1])]
    
    # If no more features or maximum depth is reached, return the most common value
    elif len(features) == 0 or depth == max_depth:
        return np.unique(data[target_attribute])[np.argmax(np.unique(data[target_attribute], return_counts=True)[1])]
    
    else:
        parent_node_value = np.unique(data[target_attribute])[np.argmax(np.unique(data[target_attribute], return_counts=True)[1])]
        # Find the feature with the highest information gain
        info_gains = [information_gain(data, feature, target_attribute) for feature in features]
        best_feature_index = np.argmax(info_gains)
        best_feature = features[best_feature_index]
        
        # Create the tree structure with the best feature as the root
        tree = {best_feature: {}}
        remaining_features = [i for i in features if i != best_feature]
        
        # Grow tree branches for each value of the best feature
        for value in np.unique(data[best_feature]):
            subset = data[data[best_feature] == value]
            subtree = id3_classification(subset, original_data, remaining_features, target_attribute, parent_node_value, max_depth, depth + 1)
            tree[best_feature][value] = subtree
            
        return tree

# Train the ID3 classification tree on the training data
features = X_train.columns.tolist()  # Use all columns as features
classification_tree = id3_classification(train_data, train_data, features)

# Pretty-print the generated classification tree
pprint(classification_tree)

# Function to predict using the classification tree
def predict_classification(query, tree, default=None):
    # Check if the tree is a leaf node (i.e., a class label)
    if not isinstance(tree, dict):
        return tree  # Leaf node with predicted class label
    
    # Traverse the tree using the query
    for key in list(query.keys()):
        if key in tree:
            result = tree[key].get(query[key], default)
            # If result is a subtree, continue down the tree
            if isinstance(result, dict):
                return predict_classification(query, result, default)
            else:
                return result
    return default  # Return the default if no key is found in the tree

# Test the classification tree on the test data
def test_classification_tree(test_data, tree):
    queries = test_data.to_dict(orient="records")
    predictions = pd.Series([predict_classification(query, tree, default=train_data['class'].mode()[0]) for query in queries], index=test_data.index)
    return predictions

# Predict the values for the test set
predictions = test_classification_tree(X_test, classification_tree)

# Print actual vs predicted values
print("\nActual vs Predicted:")
print(pd.DataFrame({'Actual': y_test['class'], 'Predicted': predictions}))

# Calculate accuracy
accuracy = np.mean(predictions == y_test['class'])
print("\nAccuracy of ID3 Classification Tree:", accuracy)

print(f"\nTest Accuracy: {accuracy*100:.2f}%")


{'alm1': {np.int64(0): 'cp',
          np.int64(1): {'gvh': {np.int64(0): 'cp',
                                np.int64(1): {'mcg': {np.int64(0): 'cp',
                                                      np.int64(1): {'aac': {np.int64(1): 'cp',
                                                                            np.int64(2): {'alm2': {np.int64(1): 'cp',
                                                                                                   np.int64(2): {'lip': {np.int64(0): {'chg': {np.int64(0): 'cp'}}}}}},
                                                                            np.int64(3): {'alm2': {np.int64(1): 'pp',
                                                                                                   np.int64(2): 'cp'}}}},
                                                      np.int64(2): 'cp',
                                                      np.int64(3): 'cp'}},
                                np.int64(2): {'aac': {np.int64(1): {'mcg': {np

# Ecoli Classification Tree

In [29]:
import pandas as pd
import numpy as np
from pprint import pprint
from ucimlrepo import fetch_ucirepo

# Fetch the E. coli dataset
ecoli = fetch_ucirepo(id=39)

# Data (as pandas DataFrames)
X = ecoli.data.features  # Features
y = ecoli.data.targets    # Target variable (class)
z = ecoli.attributes      # Attribute names

# Combine features and target into a single DataFrame
data = pd.DataFrame(data=X, columns=z)
data['class'] = y  # Add the target variable to the DataFrame

# Function to split the dataset into training and testing sets
def split_data(data, test_size):
    data = data.sample(frac=1, random_state=42)  # Shuffle with a fixed seed for reproducibility
    train_data = data.iloc[:int(len(data) * (1 - test_size))]
    test_data = data.iloc[int(len(data) * test_size):]
    return train_data, test_data

# Split the data into 80% training and 20% testing
train_data, test_data = split_data(data, 0.2)

# Calculate entropy
def entropy(data, target_col):
    values, counts = np.unique(data[target_col], return_counts=True)
    entropy = -sum((counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(values)))
    return entropy

# Calculate information gain
def info_gain(data, split_attribute, target_col):
    total_entropy = entropy(data, target_col)
    values, counts = np.unique(data[split_attribute], return_counts=True)
    weighted_entropy = sum((counts[i] / np.sum(counts)) * entropy(data[data[split_attribute] == values[i]], target_col)
                           for i in range(len(values)))
    return total_entropy - weighted_entropy

# ID3 algorithm to build the classification tree
def id3(data, original_data, features, target_col, parent_node=None):
    unique_targets = np.unique(data[target_col])

    # If all target values are the same, return that value
    if len(unique_targets) == 1:
        return unique_targets[0]

    # If the dataset is empty, return the mode target in the original dataset
    elif len(data) == 0:
        return np.unique(original_data[target_col])[np.argmax(np.unique(original_data[target_col], return_counts=True)[1])]

    # If there are no more features to split on, return the parent node
    elif len(features) == 0:
        return parent_node

    # Otherwise, select the best feature to split on
    else:
        # Set the default value for the parent node (majority class)
        parent_node = np.unique(data[target_col])[np.argmax(np.unique(data[target_col], return_counts=True)[1])]
        
        # Select the feature with the highest information gain
        item_gains = [info_gain(data, feature, target_col) for feature in features]
        best_feature_index = np.argmax(item_gains)
        best_feature = features[best_feature_index]
        
        # Create the tree structure
        tree = {best_feature: {}}
        
        # Remove the best feature from the list of features
        features = [f for f in features if f != best_feature]
        
        # Grow a branch under each value of the best feature
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = id3(sub_data, original_data, features, target_col, parent_node)
            tree[best_feature][value] = subtree
        
        return tree

# Build the tree using the ID3 algorithm
features = train_data.columns[:-1]  # All columns except the target
target_col = 'class'
tree = id3(train_data, train_data, features, target_col)
print("\nID3 Classification Tree for E. coli Dataset:")
pprint(tree)

# Prediction function to classify new data based on the tree
def predict(tree, instance):
    if not isinstance(tree, dict):
        return tree
    feature = next(iter(tree))
    value = instance[feature]
    if value in tree[feature]:
        return predict(tree[feature][value], instance)
    else:
        return None  # Handle unseen feature values gracefully

# Evaluate the model on the test data
def evaluate(tree, test_data, target_col):
    predictions = test_data.apply(lambda x: predict(tree, x), axis=1)
    actual_values = test_data[target_col]
    accuracy = (predictions == actual_values).mean() * 100
    print("\nPredicted vs Actual:")
    for pred, actual in zip(predictions, actual_values):
        print(f"Predicted: {pred}, Actual: {actual}")
    print(f"\nAccuracy: {accuracy:.2f}%")
    return accuracy

# Test and evaluate the tree
accuracy = evaluate(tree, test_data, target_col)
print(f"\nTest Accuracy: {accuracy:.2f}%")



ID3 Classification Tree for E. coli Dataset:
{'alm1': {np.float64(0.03): 'cp',
          np.float64(0.06): 'cp',
          np.float64(0.11): 'cp',
          np.float64(0.14): 'cp',
          np.float64(0.16): 'cp',
          np.float64(0.17): 'cp',
          np.float64(0.18): 'cp',
          np.float64(0.2): 'cp',
          np.float64(0.21): 'cp',
          np.float64(0.22): 'cp',
          np.float64(0.23): 'cp',
          np.float64(0.24): 'cp',
          np.float64(0.25): 'cp',
          np.float64(0.26): 'cp',
          np.float64(0.27): 'cp',
          np.float64(0.28): {'gvh': {np.float64(0.3): 'cp',
                                     np.float64(0.34): 'cp',
                                     np.float64(0.36): 'cp',
                                     np.float64(0.39): 'pp',
                                     np.float64(0.4): 'cp',
                                     np.float64(0.42): 'cp',
                                     np.float64(0.43): 'cp',
                    

# Robot Execution Failures


# Split test and train data Robot Data Set

In [19]:
# Read and concatenate the robotic failure datasets
data = pd.concat([pd.read_csv('LP1.csv'), pd.read_csv('LP2.csv'), pd.read_csv('LP3.csv'), pd.read_csv('LP4.csv'), pd.read_csv('LP5.csv')], ignore_index=True)

# Drop the 'ok' column if it exists
if 'ok' in data.columns:
    data = data.drop(columns=['ok'])

# Forward-fill missing values in the 'Target' column and fill remaining NaNs with 'normal'
data['normal'] = data['normal'].ffill().fillna('normal')

# Rename 'normal' to 'Target' and the first six columns as requested
data.columns = ['Target', 'Fx', 'Fy', 'Fz', 'Tx', 'Ty', 'Tz'] + list(data.columns[7:])

# Remove rows with any missing values in Fx, Fy, Fz, Tx, Ty, or Tz columns
data.dropna(subset=['Fx', 'Fy', 'Fz', 'Tx', 'Ty', 'Tz'], inplace=True)

# Move the 'Target' column to the last position
columns = [col for col in data.columns if col != 'Target'] + ['Target']
data = data[columns]

# Save the cleaned dataset to a new CSV file
data.to_csv('LP_Sorted_Dataset.csv', index=False)

# Display the first few rows of the processed data to verify
print(data)

# 3) Create a function to split the dataset into training and testing sets
def split_data(data, test_size):
    data = data.sample(frac=1)  # Shuffle the data
    train_data = data.iloc[:int(len(data)*(1-test_size))]
    test_data = data.iloc[int(len(data)*(1-test_size)):]
    return train_data, test_data

# Split the data into 80% training and 20% testing
train_data, test_data = split_data(data, 0.2)
print("Training Data:")
print(train_data.head())
print(train_data.shape)

print("\nTesting Data:")
print(test_data.head())
print(test_data.shape)





       Fx   Fy    Fz   Tx   Ty   Tz             Target
0    -1.0 -1.0  63.0 -3.0 -1.0  0.0             normal
1     0.0  0.0  62.0 -3.0 -1.0  0.0             normal
2    -1.0 -1.0  61.0 -3.0  0.0  0.0             normal
3    -1.0 -1.0  63.0 -2.0 -1.0  0.0             normal
4    -1.0 -1.0  63.0 -3.0 -1.0  0.0             normal
...   ...  ...   ...  ...  ...  ...                ...
8314  1.0  1.0   8.0  0.0  3.0 -4.0  collision_in_tool
8315  0.0  0.0   5.0  0.0  3.0 -4.0  collision_in_tool
8316 -1.0  1.0  -3.0 -3.0 -2.0 -3.0  collision_in_tool
8317  0.0 -1.0  -5.0 -1.0  1.0 -3.0  collision_in_tool
8318 -1.0  1.0   4.0  0.0 -1.0 -3.0  collision_in_tool

[6945 rows x 7 columns]
Training Data:
        Fx    Fy     Fz    Tx     Ty   Tz             Target
5733  81.0  18.0  104.0 -44.0  110.0 -7.0  collision_in_tool
4806   7.0   3.0 -160.0  10.0    4.0  4.0        obstruction
1822  -5.0  -9.0   86.0   9.0  -22.0 -2.0             normal
6348  -2.0   1.0   19.0   4.0   -6.0  0.0             no

# Regression Tree Robot Data Set

In [27]:
# Load the data and clean it as specified
data = pd.concat([pd.read_csv('LP1.csv'), pd.read_csv('LP2.csv'), pd.read_csv('LP3.csv'), pd.read_csv('LP4.csv'), pd.read_csv('LP5.csv')], ignore_index=True)

# Drop 'ok' column if it exists and clean target values
if 'ok' in data.columns:
    data = data.drop(columns=['ok'])

data['normal'] = data['normal'].ffill().fillna('normal')
data.columns = ['Target', 'Fx', 'Fy', 'Fz', 'Tx', 'Ty', 'Tz'] + list(data.columns[7:])
data.dropna(subset=['Fx', 'Fy', 'Fz', 'Tx', 'Ty', 'Tz'], inplace=True)
data = data[[col for col in data.columns if col != 'Target'] + ['Target']]

# Split data into train and test sets
def split_data(data, test_size):
    data = data.sample(frac=1, random_state=42)  # Shuffle with fixed seed
    train_data = data.iloc[:int(len(data) * (1 - test_size))]
    test_data = data.iloc[int(len(data) * (1 - test_size)):]
    return train_data, test_data

train_data, test_data = split_data(data, 0.2)

# Calculate entropy
def entropy(data, target_col):
    values, counts = np.unique(data[target_col], return_counts=True)
    entropy = -sum((counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(values)))
    return entropy

# Calculate information gain
def info_gain(data, split_attribute, target_col):
    total_entropy = entropy(data, target_col)
    values, counts = np.unique(data[split_attribute], return_counts=True)
    weighted_entropy = sum((counts[i] / np.sum(counts)) * entropy(data[data[split_attribute] == values[i]], target_col)
                           for i in range(len(values)))
    return total_entropy - weighted_entropy

# ID3 algorithm to build tree
def id3(data, original_data, features, target_col, parent_node=None):
    unique_targets = np.unique(data[target_col])

    if len(unique_targets) == 1:
        return unique_targets[0]

    elif len(data) == 0:
        most_common_target = np.unique(original_data[target_col])[np.argmax(np.unique(original_data[target_col], return_counts=True)[1])]
        return most_common_target

    elif len(features) == 0:
        return parent_node

    else:
        parent_node = np.unique(data[target_col])[np.argmax(np.unique(data[target_col], return_counts=True)[1])]
        item_gains = [info_gain(data, feature, target_col) for feature in features]
        best_feature_index = np.argmax(item_gains)
        best_feature = features[best_feature_index]

        tree = {best_feature: {}}
        features = [f for f in features if f != best_feature]

        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = id3(sub_data, original_data, features, target_col, parent_node)
            tree[best_feature][value] = subtree

        return tree

# Build the tree
features = train_data.columns[:-1]  # All columns except the target
target_col = 'Target'
tree = id3(train_data, train_data, features, target_col)
print("\nID3 Decision Tree:")
pprint(tree)

# Prediction function for new data
def predict(tree, instance):
    if not isinstance(tree, dict):
        return tree
    feature = next(iter(tree))
    value = instance[feature]
    if value in tree[feature]:
        return predict(tree[feature][value], instance)
    else:
        return np.nan  # if a value in test set not in training splits

# Make predictions and calculate accuracy
predictions = test_data.apply(lambda x: predict(tree, x), axis=1)
actual_values = test_data['Target']

# Calculate accuracy
accuracy = (predictions == actual_values).mean() * 100
print("\nPredicted vs Actual:")
for pred, actual in zip(predictions, actual_values):
    print(f"Predicted: {pred}, Actual: {actual}")

print(f"\nAccuracy: {accuracy:.2f}%")



ID3 Decision Tree:
{'Fz': {np.float64(-3617.0): {'Fx': {np.float64(353.0): {'Fy': {np.float64(-169.0): {'Tx': {np.float64(324.0): {'Ty': {np.float64(-265.0): {'Tz': {np.float64(47.0): 'bottom_obstruction'}}}}}}}}}},
        np.float64(-3557.0): 'bottom_obstruction',
        np.float64(-3451.0): {'Fx': {np.float64(-883.0): {'Fy': {np.float64(-260.0): {'Tx': {np.float64(-175.0): {'Ty': {np.float64(-434.0): {'Tz': {np.float64(21.0): 'bottom_obstruction'}}}}}}}}}},
        np.float64(-3348.0): {'Fx': {np.float64(351.0): {'Fy': {np.float64(-368.0): {'Tx': {np.float64(282.0): {'Ty': {np.float64(-62.0): {'Tz': {np.float64(-31.0): 'bottom_obstruction'}}}}}}}}}},
        np.float64(-3292.0): {'Fx': {np.float64(337.0): {'Fy': {np.float64(-364.0): {'Tx': {np.float64(290.0): {'Ty': {np.float64(-78.0): {'Tz': {np.float64(-34.0): 'bottom_obstruction'}}}}}}}}}},
        np.float64(-3281.0): 'obstruction',
        np.float64(-3277.0): {'Fx': {np.float64(460.0): {'Fy': {np.float64(-65.0): {'Tx': {np.f

# Classification Tree Robot Data Set

In [28]:
# Load and clean the dataset
data = pd.concat([pd.read_csv('LP1.csv'), pd.read_csv('LP2.csv'), pd.read_csv('LP3.csv'), pd.read_csv('LP4.csv'), pd.read_csv('LP5.csv')], ignore_index=True)

# Drop 'ok' column if it exists
if 'ok' in data.columns:
    data = data.drop(columns=['ok'])

# Forward-fill missing values in the 'Target' column and fill remaining NaNs with 'normal'
data['normal'] = data['normal'].ffill().fillna('normal')
data.columns = ['Target', 'Fx', 'Fy', 'Fz', 'Tx', 'Ty', 'Tz'] + list(data.columns[7:])
data.dropna(subset=['Fx', 'Fy', 'Fz', 'Tx', 'Ty', 'Tz'], inplace=True)
data = data[[col for col in data.columns if col != 'Target'] + ['Target']]

# Split data into training and testing sets
def split_data(data, test_size):
    data = data.sample(frac=1, random_state=42)  # Shuffle with fixed seed
    train_data = data.iloc[:int(len(data) * (1 - test_size))]
    test_data = data.iloc[int(len(data) * test_size):]
    return train_data, test_data

train_data, test_data = split_data(data, 0.2)

# Calculate entropy
def entropy(data, target_col):
    values, counts = np.unique(data[target_col], return_counts=True)
    entropy = -sum((counts[i] / np.sum(counts)) * np.log2(counts[i] / np.sum(counts)) for i in range(len(values)))
    return entropy

# Calculate information gain
def info_gain(data, split_attribute, target_col):
    total_entropy = entropy(data, target_col)
    values, counts = np.unique(data[split_attribute], return_counts=True)
    weighted_entropy = sum((counts[i] / np.sum(counts)) * entropy(data[data[split_attribute] == values[i]], target_col)
                           for i in range(len(values)))
    return total_entropy - weighted_entropy

# ID3 algorithm to build the classification tree
def id3(data, original_data, features, target_col, parent_node=None):
    unique_targets = np.unique(data[target_col])
    
    # If all targets are the same, return this value
    if len(unique_targets) == 1:
        return unique_targets[0]

    # If the dataset is empty, return the mode target in the original dataset
    elif len(data) == 0:
        return np.unique(original_data[target_col])[np.argmax(np.unique(original_data[target_col], return_counts=True)[1])]

    # If there are no more features to split on, return the parent node
    elif len(features) == 0:
        return parent_node

    # Otherwise, select the best feature to split on
    else:
        # Set the default value for the parent node (majority class)
        parent_node = np.unique(data[target_col])[np.argmax(np.unique(data[target_col], return_counts=True)[1])]
        
        # Select the feature which best splits the dataset
        item_gains = [info_gain(data, feature, target_col) for feature in features]
        best_feature_index = np.argmax(item_gains)
        best_feature = features[best_feature_index]
        
        # Create the tree structure
        tree = {best_feature: {}}
        
        # Remove the best feature from the list of features
        features = [f for f in features if f != best_feature]
        
        # Grow a branch under each value of the best feature
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = id3(sub_data, original_data, features, target_col, parent_node)
            tree[best_feature][value] = subtree
        
        return tree

# Build the tree using the ID3 algorithm
features = train_data.columns[:-1]  # All columns except the target
target_col = 'Target'
tree = id3(train_data, train_data, features, target_col)
print("\nID3 Classification Tree:")
pprint(tree)

# Prediction function to classify new data based on the tree
def predict(tree, instance):
    if not isinstance(tree, dict):
        return tree
    feature = next(iter(tree))
    value = instance[feature]
    if value in tree[feature]:
        return predict(tree[feature][value], instance)
    else:
        return None  # Handle unseen feature values gracefully

# Evaluate the model on the test data
def evaluate(tree, test_data, target_col):
    predictions = test_data.apply(lambda x: predict(tree, x), axis=1)
    actual_values = test_data[target_col]
    accuracy = (predictions == actual_values).mean() * 100
    print("\nPredicted vs Actual:")
    for pred, actual in zip(predictions, actual_values):
        print(f"Predicted: {pred}, Actual: {actual}")
    print(f"\nAccuracy: {accuracy:.2f}%")
    return accuracy

# Test and evaluate the tree
accuracy = evaluate(tree, test_data, target_col)
print(f"\nTest Accuracy: {accuracy:.2f}%")



ID3 Classification Tree:
{'Fz': {np.float64(-3617.0): {'Fx': {np.float64(353.0): {'Fy': {np.float64(-169.0): {'Tx': {np.float64(324.0): {'Ty': {np.float64(-265.0): {'Tz': {np.float64(47.0): 'bottom_obstruction'}}}}}}}}}},
        np.float64(-3557.0): 'bottom_obstruction',
        np.float64(-3451.0): {'Fx': {np.float64(-883.0): {'Fy': {np.float64(-260.0): {'Tx': {np.float64(-175.0): {'Ty': {np.float64(-434.0): {'Tz': {np.float64(21.0): 'bottom_obstruction'}}}}}}}}}},
        np.float64(-3348.0): {'Fx': {np.float64(351.0): {'Fy': {np.float64(-368.0): {'Tx': {np.float64(282.0): {'Ty': {np.float64(-62.0): {'Tz': {np.float64(-31.0): 'bottom_obstruction'}}}}}}}}}},
        np.float64(-3292.0): {'Fx': {np.float64(337.0): {'Fy': {np.float64(-364.0): {'Tx': {np.float64(290.0): {'Ty': {np.float64(-78.0): {'Tz': {np.float64(-34.0): 'bottom_obstruction'}}}}}}}}}},
        np.float64(-3281.0): 'obstruction',
        np.float64(-3277.0): {'Fx': {np.float64(460.0): {'Fy': {np.float64(-65.0): {'Tx':