In [50]:
# Part 1: Decision Trees with Post-pruning
# Aspects are adapted from SENG 474 Laboratory 1, Authors unknown, Summer 2020
import sklearn as ak
import matplotlib.pyplot as plt
import numpy as np

from sklearn import tree, metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, learning_curve, validation_curve
from sklearn.datasets import load_files, load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree._tree import TREE_LEAF

# Reduced-error pruning: prunes every node whose removal lowers, or leaves unchanged, the tree's RMSE on the validation data
def reduced_error_prune(tree, x_validate, y_validate, root):
    """Reduced-error prune the subtree rooted at ``root``, bottom-up.

    Nodes are visited in post-order (children before their parent), so a
    node is only tested for pruning after both of its subtrees have been
    processed. The tree is modified in place.

    Args:
        tree: Fitted estimator exposing ``tree_.children_left``,
            ``tree_.children_right`` and ``predict``.
        x_validate: Samples forwarded to ``prune_index`` to score each
            candidate prune.
        y_validate: Expected targets for ``x_validate``.
        root: Index of the node at which the traversal starts.

    Returns:
        The same ``tree`` object that was passed in.
    """
    root_left = tree.tree_.children_left[root]
    root_right = tree.tree_.children_right[root]

    # A node without children is already a leaf. Testing it would cost two
    # full predictions on the validation data and could never change it.
    if root_left == TREE_LEAF and root_right == TREE_LEAF:
        return tree

    # Post-order traversal: process both subtrees before the node itself.
    if root_left != TREE_LEAF:
        reduced_error_prune(tree, x_validate, y_validate, root_left)
    if root_right != TREE_LEAF:
        reduced_error_prune(tree, x_validate, y_validate, root_right)

    # Test whether this node itself can be collapsed into a leaf.
    return prune_index(tree, x_validate, y_validate, root)


# Helper function for reduced-error pruning: tentatively turns a node into a leaf and keeps the change only if the validation RMSE does not increase
def prune_index(tree, x_validate, y_validate, root):
    """Try to collapse node ``root`` into a leaf and keep the change if it helps.

    The RMSE of the tree's predictions on the validation data is measured
    before and after detaching the children of ``root``. The prune is kept
    when the new RMSE is lower than or equal to the old one; otherwise the
    original children are put back. The tree is modified in place.

    Args:
        tree: Fitted estimator exposing ``tree_.children_left``,
            ``tree_.children_right`` and ``predict``.
        x_validate: Samples used to score the tree.
        y_validate: Expected targets for ``x_validate``.
        root: Index of the node to test.

    Returns:
        The same ``tree`` object that was passed in.
    """
    structure = tree.tree_

    def validation_rmse():
        # RMSE of the tree in its current state on the validation data.
        predictions = tree.predict(x_validate)
        return np.sqrt(metrics.mean_squared_error(y_validate, predictions))

    # Remember the children so the prune can be undone.
    saved_left = structure.children_left[root]
    saved_right = structure.children_right[root]
    rmse_before = validation_rmse()

    # Detach both children, which makes ``root`` behave as a leaf.
    structure.children_left[root] = TREE_LEAF
    structure.children_right[root] = TREE_LEAF
    rmse_after = validation_rmse()

    # Keep the prune when the error is lower or unchanged.
    if rmse_after <= rmse_before:
        return tree

    # The prune made things worse: restore the original children.
    structure.children_left[root] = saved_left
    structure.children_right[root] = saved_right
    return tree

# Plots validation curve data of a tree
def validation_curve_plot(tree, data, target, train_sizes, criterion, pruned):
    """Plot training and cross-validation accuracy against ``max_depth``.

    ``validation_curve`` is run for ``max_depth`` values 1, 3, ..., 23. For
    both the training and the validation scores, the mean across folds is
    drawn as a line with a shaded band of one standard deviation either side.
    Everything is drawn on the current matplotlib figure.

    Args:
        tree: Estimator passed to ``validation_curve``.
        data: Feature samples.
        target: Targets for ``data``.
        train_sizes: Accepted but not used by this function.
        criterion: ``'gini'`` labels the curves "Gini"; any other value
            labels them "Entropy".
        pruned: Values equal to ``True`` label the curves "Pruned"; anything
            else labels them "Unpruned".
    """
    # Equality with True is kept as-is: only values equal to True select 'Pruned'.
    pruned_label = 'Pruned' if pruned == True else 'Unpruned'

    depths = np.arange(1, 25, 2)
    train_scores, test_scores = validation_curve(
        tree,
        data,
        target,
        param_name="max_depth",
        param_range=depths,
        scoring="accuracy",
    )

    criterion_label = 'Gini' if criterion == 'gini' else 'Entropy'
    suffix = criterion_label + ' ' + pruned_label

    # (legend prefix, mean per depth, standard deviation per depth)
    curves = [
        ('CV score: ', test_scores.mean(axis=1), np.std(test_scores, axis=1)),
        ('Training score: ', train_scores.mean(axis=1), np.std(train_scores, axis=1)),
    ]

    # Shaded bands first, then the mean lines, validation before training.
    for _, mean, spread in curves:
        plt.fill_between(depths, mean - spread, mean + spread, alpha=0.1)
    for prefix, mean, _ in curves:
        plt.plot(depths, mean, label=prefix + suffix)

    plt.ylabel('Accuracy', fontsize=14)
    plt.xlabel('Max Depth', fontsize=14)
    plt.legend()

# Plots learning curve data of a tree
def learning_curve_plot(tree, data, target, train_sizes, criterion, pruned):
    """Plot training and cross-validation accuracy against training set size.

    ``learning_curve`` is run with the requested ``train_sizes``. For both
    the training and the validation scores, the mean across folds is drawn
    as a line with a shaded band of one standard deviation either side.
    Everything is drawn on the current matplotlib figure.

    Args:
        tree: Estimator passed to ``learning_curve``.
        data: Feature samples.
        target: Targets for ``data``.
        train_sizes: Training set sizes forwarded to ``learning_curve``.
        criterion: ``'gini'`` labels the curves "Gini"; any other value
            labels them "Entropy".
        pruned: Values equal to ``True`` label the curves "Pruned"; anything
            else labels them "Unpruned".
    """
    # Equality with True is kept as-is: only values equal to True select 'Pruned'.
    pruned_label = 'Pruned' if pruned == True else 'Unpruned'

    # The x axis uses the sizes reported back by learning_curve.
    sizes_used, train_scores, validation_scores = learning_curve(
        tree,
        X=data,
        y=target,
        train_sizes=train_sizes,
        scoring='accuracy',
    )

    criterion_label = 'Gini' if criterion == 'gini' else 'Entropy'
    suffix = criterion_label + ' ' + pruned_label

    # (legend prefix, mean per size, standard deviation per size)
    curves = [
        ('CV score: ', validation_scores.mean(axis=1), np.std(validation_scores, axis=1)),
        ('Training score: ', train_scores.mean(axis=1), np.std(train_scores, axis=1)),
    ]

    # Shaded bands first, then the mean lines, validation before training.
    for _, mean, spread in curves:
        plt.fill_between(sizes_used, mean - spread, mean + spread, alpha=0.1)
    for prefix, mean, _ in curves:
        plt.plot(sizes_used, mean, label=prefix + suffix)

    plt.ylabel('Accuracy', fontsize=14)
    plt.xlabel('Training set size', fontsize=14)
    plt.legend()

# Prints the Mean Absolute Error, MSE, RMSE, Confusion Matrix, Classification Report (including F1 scores), and Accuracy Score for the tree's test-set predictions
def print_info(y_test, y_pred):
    """Print evaluation metrics comparing predictions with the true targets.

    Printed in order: mean absolute error, mean squared error, root mean
    squared error, confusion matrix, classification report and accuracy
    score, followed by a blank separator.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.
    """
    # Each entry is (label, function computing the value). The values are
    # computed lazily so every metric is evaluated just before it is printed.
    report_lines = (
        ('\nMean Absolute Error: ', lambda: metrics.mean_absolute_error(y_test, y_pred)),
        ('Mean Squared Error: ', lambda: metrics.mean_squared_error(y_test, y_pred)),
        ('Root Mean Squared Error: ', lambda: np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
        ('Confusion Matrix:\n', lambda: confusion_matrix(y_test, y_pred)),
        ('Classification Report:\n', lambda: classification_report(y_test, y_pred)),
        ('Accuracy Score: ', lambda: accuracy_score(y_test, y_pred)),
    )
    for label, compute in report_lines:
        print(label, compute())
    print("\n")


if __name__ == "__main__":
    # Prepare the Cleveland Heart Disease dataset: every column except the
    # last is used as a feature, the last column is used as the target.
    cleveland_hd = np.loadtxt('cleaned_processed.cleveland.data', delimiter=',')
    size = cleveland_hd.shape[1]
    feature_names = [
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
        'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
    ]
    class_names = ['no heart disease', 'heart disease']
    data = [row[:size - 1] for row in cleveland_hd]
    target = [row[size - 1] for row in cleveland_hd]

    # Hold out 20% of the samples for testing, with a fixed seed.
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=0
    )
    amount_x = len(x_train)
    amount_y = len(y_train)

    # Learning-curve sizes: one sample, then 10%, 25%, 50%, 75% and 100% of
    # the training split.
    train_sizes = (
        [1]
        + [int(amount_x * fraction) for fraction in (0.1, 0.25, 0.5, 0.75)]
        + [amount_x]
    )

    # Alternative dataset (disabled): Breast Cancer data
    # data, target = load_breast_cancer(return_X_y=True)
    # breast_cancer = load_breast_cancer()
    # feature_names = breast_cancer.feature_names
    # class_names = breast_cancer.target_names
    # x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=0)
    # amount_x = len(x_train)
    # amount_y = len(y_train)
    # train_sizes = [1, int(amount_x*0.1), int(amount_x*0.25), int(amount_x*0.5), int(amount_x*0.75), amount_x]


    # Unmodified decision tree split on the Gini index (experiment disabled)
    print("Gini Index:")
    # decision_tree = tree.DecisionTreeClassifier(criterion='gini')
    # g_clf = decision_tree.fit(x_train, y_train)
    # y_pred = g_clf.predict(x_test)
    # learning_curve_plot(g_clf, data, target, train_sizes, 'gini', False)
    # validation_curve_plot(decision_tree, data, target, train_sizes, 'gini', False)
    # print_info(y_test, y_pred)
    # visual = tree.plot_tree(g_clf, filled=True, class_names=class_names, feature_names=feature_names)

    # Post-pruned decision tree split on the Gini index (experiment disabled)
    # g_clf = reduced_error_prune(g_clf, x_test, y_test, 0)
    # y_pred_new = g_clf.predict(x_test)
    # learning_curve_plot(g_clf, data, target, train_sizes, 'gini', True)
    # print_info(y_test, y_pred_new)
    # visual_new = tree.plot_tree(g_clf, filled=True, class_names=class_names, feature_names=feature_names)

    # Unmodified decision tree split on information gain (entropy) (experiment disabled)
    print("\n")
    print("Information Gain (Entropy):")
    # decision_tree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3)
    # g_clf = decision_tree.fit(x_train, y_train)
    # y_pred = g_clf.predict(x_test)
    # learning_curve_plot(g_clf, data, target, train_sizes, 'entropy', False)
    # validation_curve_plot(decision_tree, data, target, train_sizes, 'entropy', False)
    # print_info(y_test, y_pred)
    # visual = tree.plot_tree(g_clf, filled=True, class_names=class_names, feature_names=feature_names)

    # Post-pruned decision tree split on information gain (entropy) (experiment disabled)
    # g_clf = reduced_error_prune(g_clf, x_test, y_test, 0)
    # y_pred_new = g_clf.predict(x_test)
    # learning_curve_plot(g_clf, data, target, train_sizes, 'entropy', True)
    # print_info(y_test, y_pred_new)
    # visual_new = tree.plot_tree(g_clf, filled=True, class_names=class_names, feature_names=feature_names)

    # Plot title options (disabled)
    # title = 'Validation Curve For Unconstrained Decision Tree Split With Gini Index'
    # title = 'Validation Curve For Unconstrained Decision Tree Split With Entropy'
    # plt.title(title, fontsize = 18, y = 1.03)

Gini Index:


Information Gain (Entropy):
