# In [103]:
# Part 2: Random forests (No Pruning)
# Aspects are adapted from SENG 474 Laboratory 1, Authors unknown, Summer 2020
import sklearn as ak
import matplotlib.pyplot as plt
import numpy as np

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.datasets import load_files, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Reused from decision Trees, plots learning curves of Forests
def learning_curve_plot(tree, data, target, train_sizes, size):
    """Draw training and cross-validation accuracy curves for ``tree``.

    The curves are added to the current pyplot figure; no figure is created
    or shown here, so successive calls overlay their curves on the same axes.

    Args:
        tree: Estimator handed to scikit-learn's ``learning_curve``.
        data: Feature rows passed as ``X``.
        target: Labels passed as ``y``.
        train_sizes: Training-set sizes at which the estimator is scored.
        size: Number of trees; only used to build the legend labels.
    """
    # cv=None leaves the choice of splitter to scikit-learn's default.
    sizes_used, fit_scores, cv_scores = learning_curve(
        tree,
        X=data,
        y=target,
        train_sizes=train_sizes,
        cv=None,
        scoring="accuracy",
    )

    # Collapse axis 1 (one column per CV split, per the learning_curve docs)
    # into a mean and a standard deviation for every training size.
    fit_mean, fit_std = fit_scores.mean(axis=1), np.std(fit_scores, axis=1)
    cv_mean, cv_std = cv_scores.mean(axis=1), np.std(cv_scores, axis=1)

    # Shaded +/- one standard deviation bands: validation first, then training.
    for centre, spread in ((cv_mean, cv_std), (fit_mean, fit_std)):
        plt.fill_between(sizes_used, centre - spread, centre + spread, alpha=0.1)

    # Mean curves: training first, then cross-validation.
    for label_prefix, curve in (
        ('Training: # Trees = ', fit_mean),
        ('CV: # Trees = ', cv_mean),
    ):
        plt.plot(sizes_used, curve, label=label_prefix + str(size))

    plt.ylabel('Accuracy', fontsize=14)
    plt.xlabel('Training Size', fontsize=14)
    plt.legend()

# Reused from decision Trees, prints various statistical information regarding test data
def print_info(clf, y_test, y_pred=None):
    """Fit ``clf`` on the training split and print metrics for the test split.

    This function reads the module-level names ``x_train``, ``y_train`` and
    ``x_test`` (created by ``train_test_split`` in the script's main section),
    so it must be called after that split has been made.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``.
        y_test: True labels corresponding to the module-level ``x_test``.
        y_pred: Ignored. Kept only so existing three-argument calls keep
            working; predictions are always recomputed from the freshly
            fitted ``clf``.
    """
    # Fit on the training split only. Fitting on the full dataset would
    # include the very rows held out in x_test and inflate every metric below.
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("\n")
    print('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_pred))
    # Compute the MSE once and reuse it for the RMSE line.
    mse = metrics.mean_squared_error(y_test, y_pred)
    print('Mean Squared Error: ', mse)
    print('Root Mean Squared Error: ', np.sqrt(mse))
    print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
    print('Classification Report:\n', classification_report(y_test, y_pred))
    print('Accuracy Score: ', accuracy_score(y_test, y_pred))
    print("\n")

if __name__ == "__main__":
    # Preparing Dataset Cleveland Heart Disease
    # The file is read as a comma-separated numeric table; the last column is
    # used as the label and every column before it as a feature.
    cleveland_hd = np.loadtxt('cleaned_processed.cleveland.data', delimiter=',')
    size = cleveland_hd.shape[1]
    feature_names = [
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
        'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
    ]
    class_names = ['no heart disease', 'heart disease']

    # Column slicing wrapped in list() gives one feature row / one label per record.
    data = list(cleveland_hd[:, :size - 1])
    target = list(cleveland_hd[:, size - 1])

    # Hold out 20% of the records; random_state pins the shuffle.
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=0
    )
    amount_x, amount_y = len(x_train), len(y_train)

    # Learning-curve ticks: a single sample, 10/25/50/75% of the training
    # split, and the whole training split.
    train_sizes = [
        1,
        *(int(amount_x * fraction) for fraction in (0.1, 0.25, 0.5, 0.75)),
        amount_x,
    ]

    # Preparing Dataset Breast Cancer Data
    # data, target = load_breast_cancer(return_X_y=True)
    # breast_cancer = load_breast_cancer()
    # feature_names = breast_cancer.feature_names
    # class_names = breast_cancer.target_names
    # x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=0)
    # amount_x = len(x_train)
    # amount_y = len(y_train)
    # train_sizes = [1, int(amount_x*0.1), int(amount_x*0.25), int(amount_x*0.5), int(amount_x*0.75), amount_x]

    # Random Forests sampled with replacement (bootstrap = True), forest size = 5-200, 2 features
    # clf = RandomForestClassifier(n_estimators=200, random_state=0, max_features=2, max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 200)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=100, random_state=0, max_features=2, max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 100)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=50, random_state=0, max_features=2, max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 50)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=25, random_state=0, max_features=2, max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 25)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=10, random_state=0, max_features=2, max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 10)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=5, random_state=0, max_features=2, max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 5)
    # print_info(clf, y_test, y_pred)

    # Random Forests sampled with replacement (bootstrap = True), forest size = 5-200, log2(d) features
    # clf = RandomForestClassifier(n_estimators=200, random_state=0, max_features='log2', max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 200)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=100, random_state=0, max_features='log2', max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 100)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=50, random_state=0, max_features='log2', max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 50)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=25, random_state=0, max_features='log2', max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 25)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=10, random_state=0, max_features='log2', max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 10)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=5, random_state=0, max_features='log2', max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 5)
    # print_info(clf, y_test, y_pred)
  
  
    # Random Forests sampled with replacement (bootstrap = True), forest size = 5-200, sqrt(d) features
    # clf = RandomForestClassifier(n_estimators=200, random_state=0, max_features='sqrt', max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 200)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=100, random_state=0, max_features='sqrt', max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 100)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=50, random_state=0, max_features='sqrt', max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 50)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=25, random_state=0, max_features='sqrt', max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 25)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=10, random_state=0, max_features='sqrt', max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 10)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=5, random_state=0, max_features='sqrt', max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 5)
    # print_info(clf, y_test, y_pred)

    # Random Forests sampled with replacement (bootstrap = True), forest size = 5-200, all features
    # clf = RandomForestClassifier(n_estimators=200, random_state=0, max_features=None, max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 200)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=100, random_state=0, max_features=None, max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 100)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=50, random_state=0, max_features=None, max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 50)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=25, random_state=0, max_features=None, max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 25)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=10, random_state=0, max_features=None, max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 10)
    # print_info(clf, y_test, y_pred)
    # clf = RandomForestClassifier(n_estimators=5, random_state=0, max_features=None, max_depth=3)
    # learning_curve_plot(clf, data, target, train_sizes, 5)
    # print_info(clf, y_test, y_pred)