# Final Project - Random Forests
## Hudson Arney & Ian Golvach

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.stats
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from mlxtend.plotting import plot_decision_regions
from matplotlib.colors import ListedColormap

In [None]:
# Load the ads data set from a CSV file in the working directory and preview its first rows.
df = pd.read_csv('Social_Network_Ads.csv')
df.head()

In [None]:
# Show the first rows of df.
df.head()

In [None]:
# Scatter of Age against EstimatedSalary, colouring every point by its
# 'Purchased' value through a two-colour map.
colors = ['#0000ff', '#00ff00']
cmap = ListedColormap(colors)
plt.scatter(data=df, x='Age', y='EstimatedSalary', c='Purchased', cmap=cmap)
plt.title('Age vs Estimated Salary')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')

# Proxy handles: one round marker per colour so the legend can name the classes.
legend_labels = ['Not Purchased', 'Purchased']
legend_markers = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=shade, markersize=10)
    for shade in colors
]
plt.legend(legend_markers, legend_labels, loc='upper left')

In [None]:
# Display pandas' summary statistics for df.
df.describe()

In [None]:
# Print pandas' structural summary of df (columns, dtypes, non-null counts).
df.info()

___
## Scratch implementation of RF
Learning sourced from: https://en.wikipedia.org/wiki/Random_forest

Based on the wikipedia article, we will need to modify a standard decision tree model so that
 - Along with the Gini impurity split criterion, we select a random subset of the features to consider when making each split.
 - Several decision trees are trained with a 'bagging' approach: each tree is trained on a random sample of the data drawn with replacement.
 
Following this, it makes the most sense to first implement a class that handles the sampling and training, as well as the prediction by plurality vote, and then a second class that serves as the model for the underlying decision trees. That way we can also test the bagging concept with other models.
___

In [None]:
class bagging_trees:
    """
    Bootstrap-aggregating ("bagging") ensemble.

    Trains B independent copies of a model, each on a bootstrap sample (rows
    drawn at random with replacement) of the training data, and combines their
    predictions: plurality vote for classification, mean for regression.

    model: zero-argument callable (typically a class) that returns an unfitted
        object exposing fit(X, y) and predict(X)
    B: number of sub-models to train
    is_classifier: True to combine predictions by plurality vote, False to average them
    """
    def __init__(self, model, B=5, is_classifier=True):
        self.B = B
        self.model = model
        self.fitted_models = []
        self.is_classifier = is_classifier

    def fit(self, X, y):
        """
        Train B sub-models, each on its own bootstrap sample of (X, y).

        X: array-like with one row per observation
        y: array-like with one target per row of X

        Any sub-models from an earlier fit are discarded.
        Raises ValueError if X and y differ in their number of rows or are empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y do not have the same number of rows")
        n_rows = X.shape[0]
        if n_rows == 0:
            raise ValueError("Cannot fit on an empty data set")

        # Start from scratch so refitting does not leave stale sub-models behind.
        self.fitted_models = []
        for model_index in range(self.B):
            self.__progress(model_index)
            # Bootstrap sample: n_rows row indices drawn uniformly with replacement.
            rows = np.random.randint(n_rows, size=n_rows)
            new_model = self.model()
            new_model.fit(X[rows], y[rows])
            self.fitted_models.append(new_model)
        self.__progress(self.B+1)

    def predict(self, X_pred):
        """
        Combine the sub-models' predictions for X_pred.

        Returns the plurality vote per sample when is_classifier is True,
        otherwise the mean prediction per sample. Sub-models whose predictions
        are not numeric are left out of the combination.

        Raises ValueError if nothing has been fitted or no sub-model produced
        numeric predictions.
        """
        if not self.fitted_models:
            raise ValueError("No fitted sub-models; call fit() before predict()")

        predictions = []
        for fitted in self.fitted_models:
            prediction = np.asarray(fitted.predict(X_pred))
            if np.issubdtype(prediction.dtype, np.number):  # keep numeric predictions only
                predictions.append(prediction)
        if not predictions:
            raise ValueError("No numeric predictions found")

        stacked = np.array(predictions)  # one row per sub-model, one column per sample
        if self.is_classifier:
            return np.ravel(scipy.stats.mode(stacked, axis=0, keepdims=False)[0])
        return np.mean(stacked, axis=0)

    def __progress(self, current):
        """Print fitting status: a spinner line while fitting, a final line once current == B + 1."""
        if current - 1 == self.B:
            print(f"[=] Sub-model {self.B}/{self.B} fitted. Fitting complete!")
            return
        # Cycle through | / - \ so consecutive updates look like a spinner.
        spinner = '|/-\\'[current % 4]
        # '\r' returns to the line start so the next update overwrites this one.
        print(f"[{spinner}] Sub-model {current+1}/{self.B} fitting...", end='\r')

        
        

This code is just a proof of concept that the class actually works, which is why train/test splitting was not performed.

In [None]:
# Bag scikit-learn decision trees, leaving B and is_classifier at their defaults.
bag_trees = bagging_trees(DecisionTreeClassifier)

In [None]:
# Feature matrix (Age, EstimatedSalary) and target vector (Purchased) as NumPy arrays.
X = np.array(df[['Age','EstimatedSalary']])
y= np.array(df['Purchased'])

In [None]:
# Train the bagged sub-models on the full data set (no train/test split here).
bag_trees.fit(X, y)

In [None]:
# Ensemble predictions for the same rows the ensemble was trained on.
bag_trees.predict(X)

Now we implement a decision tree that picks features at random.

In [None]:
# Scratch check: element-wise division of y by 2; the result is not stored.
y/2

In [None]:
# Distinct values in y together with how often each one occurs.
np.unique(y, return_counts=True)

In [None]:
# Scratch check of list behaviour: sort() orders the list in place and
# remove(2) deletes the element equal to 2 (by value, not by position).
test_array = [1,3,4,2,5]
test_array.sort()
test_array.remove(2)
test_array

___

In [None]:
class rdt_node:
    # NOTE(review): the original note here read `type(X)==np.ndarray` —
    # presumably the tree expects NumPy feature arrays; confirm.
    """
    A single decision point of a random_decision_tree.

    Each node compares one feature against a boundary and holds up to two
    children; a child is either another node or a leaf value.

    var_num: index of the variable compared at this node
    boundary: threshold on that variable; <= goes left, anything else goes right
    left: the left child node, or the leaf value on the left
    right: the right child node, or the leaf value on the right
    """
    def __init__(self, var_num=0, boundary=0, left=None, right=None):
        # What this node tests ...
        self.var_num, self.boundary = var_num, boundary
        # ... and where each outcome of the test leads.
        self.left, self.right = left, right


class random_decision_tree:
    """
    Decision tree classifier that follows random forest conventions.

    Every split is the one with the lowest weighted Gini impurity, but only a
    random subset of the features is examined at each node.

    depth_limit: deepest level (root = 0) that may still be split; the children
        of a split made at this depth are always leaves
    feature_subset_func: maps the total number of features to how many features
        are randomly drawn as split candidates at each node
    """
    def __init__(self, depth_limit=5, feature_subset_func=lambda x: math.floor(pow(x,1/2))):
        self.depth_limit = depth_limit
        self.root = rdt_node()
        self.num_features = -1  # -1 marks a tree that has not been fitted
        self.feature_subset_func = feature_subset_func

    def fit(self, X_train, y_train):
        """
        Grow the tree from training data, replacing any previously grown tree.

        X_train: 2-D array-like, one row per observation and one column per feature
        y_train: 1-D array-like of class labels, one per row of X_train

        Raises ValueError if the inputs are empty or their shapes do not match.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if X_train.ndim != 2:
            raise ValueError("X_train must be 2-dimensional (observations x features)")
        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError("X_train and y_train do not have the same number of rows")
        if X_train.shape[0] == 0 or X_train.shape[1] == 0:
            raise ValueError("Cannot fit a tree on an empty training set")

        # Remove duplicate observations; the label of the first occurrence is kept.
        unique_indices = np.unique(X_train, axis=0, return_index=True)[1]
        X_unique = X_train[unique_indices]
        y_unique = y_train[unique_indices]

        self.num_features = X_unique.shape[1]
        tree = self.__grow(X_unique, y_unique, 0)
        if not isinstance(tree, rdt_node):
            # Nothing to split (e.g. a single class): keep root a node so it can
            # still be inspected, with the same leaf value on both sides.
            tree = rdt_node(left=tree, right=tree)
        self.root = tree

    def __grow(self, X_train, y_train, depth):
        """Return the subtree for these observations: an rdt_node, or a class label for a leaf."""
        # A pure node needs no further splitting.
        if len(np.unique(y_train)) == 1:
            return y_train[0]

        split = self.__find_gini(X_train, y_train, self.__draw_features(X_train))
        if split is None:
            # No feature can separate these observations: fall back to the majority class.
            return self.__majority(y_train)

        boundary, var_num = split
        goes_left = X_train[:, var_num] <= boundary
        node = rdt_node(var_num, boundary)
        if depth >= self.depth_limit:
            node.left = self.__majority(y_train[goes_left])
            node.right = self.__majority(y_train[~goes_left])
        else:
            node.left = self.__grow(X_train[goes_left], y_train[goes_left], depth + 1)
            node.right = self.__grow(X_train[~goes_left], y_train[~goes_left], depth + 1)
        return node

    def __draw_features(self, X_train):
        """Randomly choose the feature indices examined for one split."""
        # Only features that still vary among these observations can split them, so
        # the draw is restricted to those; otherwise an unlucky draw could turn a
        # splittable node into a leaf.
        usable = np.flatnonzero((X_train != X_train[0]).any(axis=0))
        if usable.size == 0:
            return []
        wanted = int(self.feature_subset_func(self.num_features))
        wanted = max(1, min(wanted, usable.size))
        return np.random.choice(usable, size=wanted, replace=False).tolist()

    def __find_gini(self, X_train, y_train, features):
        """
        Find the split with the lowest weighted Gini impurity among `features`.

        Candidate boundaries are the midpoints between consecutive distinct
        values of each feature. Returns (boundary, feature_index), or None when
        there is no usable candidate. Ties keep the first candidate found.
        """
        best_split = None
        best_gini = None
        for feature_index in features:
            unique_values = np.unique(X_train[:, feature_index])
            # A feature with a single distinct value yields no midpoints here.
            for boundary in (unique_values[:-1] + unique_values[1:]) / 2:
                gini = self.__gini_index_for_boundary(X_train, y_train, boundary, feature_index)
                if gini is None:
                    continue
                if best_gini is None or gini < best_gini:
                    best_gini = gini
                    best_split = (boundary, feature_index)
        return best_split

    def __gini_index_for_boundary(self, X_train, y_train, boundary, feature_num):
        """
        Weighted Gini impurity of splitting feature_num at boundary (classification assumed).

        Returns None when one side of the split would be empty.
        """
        goes_left = X_train[:, feature_num] <= boundary
        n_total = y_train.shape[0]
        n_left = int(np.count_nonzero(goes_left))
        n_right = n_total - n_left
        if n_left == 0 or n_right == 0:
            return None
        # Weighted average of the two sides' impurities.
        return ((n_left / n_total) * self.__gini(y_train[goes_left])
                + (n_right / n_total) * self.__gini(y_train[~goes_left]))

    @staticmethod
    def __gini(labels):
        """Gini impurity, sum of p * (1 - p) over the class proportions p of labels."""
        counts = np.unique(labels, return_counts=True)[1]
        proportions = counts / labels.shape[0]
        return float(np.sum(proportions * (1 - proportions)))

    @staticmethod
    def __majority(labels):
        """Most frequent label; ties resolve to the smallest label."""
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def predict(self, X_pred):
        """
        Predict a class label for every row of X_pred.

        Raises ValueError if the tree has not been fitted.
        """
        if self.num_features < 0:
            raise ValueError("This random_decision_tree has not been fitted; call fit() first")
        return np.array([self.__predict(x) for x in np.asarray(X_pred)])

    def __predict(self, x_pred):
        """Walk from the root to a leaf for one feature vector and return the leaf value."""
        walker = self.root
        while isinstance(walker, rdt_node):
            if x_pred[walker.var_num] <= walker.boundary:
                walker = walker.left
            else:
                walker = walker.right
        return walker
    

In [None]:
# Fit a single tree with default settings and report its accuracy on the
# same data it was trained on (no train/test split).
rdt = random_decision_tree()
rdt.fit(X, y)
predicted = rdt.predict(X)
print(accuracy_score(y,predicted))

In [None]:
# Index of the variable that the root node of the fitted tree compares.
rdt.root.var_num

___
Put it all together

In [None]:
# Random forest: bag 100 random_decision_tree models, then score the ensemble
# on the same data it was trained on.
random_forest = bagging_trees(random_decision_tree, 100)
random_forest.fit(X, y)
prediction = random_forest.predict(X)
print(accuracy_score(y,prediction))

In [None]:
# Compare the shapes of the inputs with the shape of the single-tree predictions.
print(X.shape)
print(y.shape)
print(predicted.shape)

___
### Plot the Decision Boundaries of the Random Forests

In [None]:
# Two-colour map (light blue, red) for the plots that follow.
colors = ['#87cefa', '#ff0000']
cmap = ListedColormap(colors)

In [None]:
# Scatter of the two feature columns, coloured by the ensemble's predicted label.
plt.scatter(X[:, 0], X[:, 1], c=prediction, cmap=cmap, label='Predicted Labels')
plt.title('Predicted Labels')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')

# Proxy handles: one round marker per class colour for the legend.
legend_labels = ['Not Purchased', 'Purchased']
legend_markers = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=shade, markersize=10)
    for shade in ('#87cefa', '#ff0000')
]
plt.legend(legend_markers, legend_labels, loc='upper left')

In [None]:
# Shade the plane by the class the random forest predicts, then overlay the data.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

# Use a fixed number of grid points per axis so the grid size does not depend
# on the feature scales. A fixed step would create an enormous grid for a
# feature spanning a wide numeric range (presumably the case for
# EstimatedSalary), and every grid point is scored in a Python loop.
grid_points = 100
xx, yy = np.meshgrid(np.linspace(x_min, x_max, grid_points),
                     np.linspace(y_min, y_max, grid_points))

# Score the grid with the forest itself (not the single tree `rdt`) so the
# figure matches its title.
Z = random_forest.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot the decision boundary
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap, s=20, edgecolor='k')

plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.title('Decision Boundary of Random Forest Classifier')
plt.show()