# Final Project - Random Forests
## Hudson Arney & Ian Golvach

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Load the dataset from the CSV file into a DataFrame and preview its first rows.
df = pd.read_csv('Social_Network_Ads.csv')
df.head()

In [None]:
# Preview the first rows of the DataFrame.
# NOTE(review): this repeats the preview already shown at the end of the loading cell.
df.head()

In [None]:
# Plot Age against EstimatedSalary, colouring each point by its 'Purchased' value,
# and title the axes that seaborn drew on.
sns.scatterplot(
    x='Age',
    y='EstimatedSalary',
    hue='Purchased',
    data=df,
).set_title('Age vs Estimated Salary')

In [None]:
# Show pandas' summary statistics for the DataFrame.
df.describe()

In [None]:
# Print the DataFrame's structural summary (columns, dtypes, non-null counts).
df.info()

## Scratch implementation of RF
Learning sourced from: https://en.wikipedia.org/wiki/Random_forest

Based on the Wikipedia article, we will need to modify a standard decision tree model so that
 - Along with the Gini coefficient split, we select a random subset of the features to consider when making splits
 - Several decision trees are trained with a 'bagging' approach, training each tree on a random sample (with replacement) of the data.
 
Following this, it makes the most sense to first implement a class that handles the sampling and training, as well as predicting via plurality vote, and then a separate model that will serve as the underlying decision tree (that way we can test the bagging concept with other models).

In [None]:
import numpy as np
import scipy.stats

class bagging_tree:
    """Bootstrap-aggregating ("bagging") wrapper around an arbitrary base model.

    ``B`` fresh instances of ``model`` are each trained on a bootstrap sample
    of the training data (rows drawn uniformly with replacement, same size as
    the original data).  Their predictions are combined by plurality vote for
    classification or by averaging for regression.

    Parameters
    ----------
    model : callable
        Zero-argument factory (typically a class) returning an object that
        provides ``fit(X, y)`` and ``predict(X)``.
    B : int, default 5
        Number of bootstrap samples, and therefore of fitted models.
    is_classifier : bool, default True
        If true, ``predict`` returns the most common prediction per sample;
        otherwise it returns the mean prediction per sample.
    """

    def __init__(self, model, B=5, is_classifier=True):
        self.B = B
        self.model = model
        self.fitted_models = []
        self.is_classifier = is_classifier

    def fit(self, X, y):
        """Train ``B`` models, each on its own bootstrap sample of ``(X, y)``.

        Any models from a previous call are discarded, so refitting replaces
        the ensemble rather than extending it.

        Parameters
        ----------
        X : array-like
            Training inputs, indexed by sample along the first axis.
        y : array-like
            Training targets, one per row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in their number of rows, or are empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        if n_samples != y.shape[0]:
            raise ValueError("X and y do not have the same number of rows")
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty dataset")

        # Build the new ensemble locally so that a failure part-way through
        # does not leave a half-replaced list on the instance.
        fitted = []
        for _ in range(self.B):
            # One vectorised draw of row indices, with replacement.
            sample_idx = np.random.randint(n_samples, size=n_samples)
            member = self.model()
            member.fit(X[sample_idx], y[sample_idx])
            fitted.append(member)
        self.fitted_models = fitted

    def predict(self, X_pred):
        """Combine the predictions of every fitted model for ``X_pred``.

        Parameters
        ----------
        X_pred : array-like
            Inputs, passed unchanged to each fitted model's ``predict``.

        Returns
        -------
        numpy.ndarray
            Per-sample plurality vote if ``is_classifier`` is true (ties are
            resolved by ``scipy.stats.mode``), otherwise the per-sample mean.

        Raises
        ------
        RuntimeError
            If no models have been fitted yet.
        """
        if not self.fitted_models:
            raise RuntimeError("predict() called before fit()")

        # After the transpose each row holds the predictions of all models for
        # one sample (assumes every model returns a 1-D array of predictions).
        votes = np.array(
            [member.predict(X_pred) for member in self.fitted_models]
        ).T
        if self.is_classifier:
            # np.ravel flattens the result whether or not this SciPy version
            # keeps the reduced axis.
            return np.ravel(scipy.stats.mode(votes, axis=1)[0])
        return np.mean(votes, axis=1)
        
        

This code is just a proof of concept that the class actually works, which is why train/test splitting was not performed.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Build the bagged ensemble around scikit-learn's decision tree classifier,
# spelling out the constructor defaults so the configuration is visible here.
bag_tree = bagging_tree(
    model=DecisionTreeClassifier,
    B=5,
    is_classifier=True,
)

In [None]:
# Feature matrix: the 'Age' and 'EstimatedSalary' columns as a NumPy array.
X = df[['Age', 'EstimatedSalary']].to_numpy(copy=True)
# Target vector: the 'Purchased' column as a NumPy array.
y = df['Purchased'].to_numpy(copy=True)

In [None]:
# Train the bagged ensemble on the full dataset (proof of concept, no train/test split).
bag_tree.fit(X, y)

In [None]:
# Predict on the same data the ensemble was trained on, as a smoke test of the class.
bag_tree.predict(X)