### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from random import seed
from random import randrange

---
### Load Dataset

In [2]:
# Read the two CSV files into DataFrames: X_und is used as the feature table
# and y_und as the target table by the train/test split that follows.
X_und, y_und = (
    pd.read_csv(csv_name)
    for csv_name in ("cleaned_rain_x.csv", "cleaned_rain_y.csv")
)

---
### Split into testing and training data

In [3]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# partition identical across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_und,
    y_und,
    test_size=0.25,
    random_state=42,
)

---
### Create 'Total' Train DataFrame

In [5]:
# Put the training features and the training target side by side (column-wise
# concat), then remove every column named 'Unnamed: 0' -- presumably the index
# column that was written into the CSV files.
x_train_total = (
    pd.concat([x_train, y_train], axis=1)
    .drop(columns=['Unnamed: 0'])
)
x_train_total.head()

Unnamed: 0,MinTemp,MaxTemp,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,RainToday,year,...,WindGustDirW,WindDir9amN,WindDir9amE,WindDir9amS,WindDir9amW,WindDir3pmN,WindDir3pmE,WindDir3pmS,WindDir3pmW,RainTomorrow
22845,7.2,13.5,46.0,17.0,31.0,76.0,61.0,1021.8,1,2015,...,1,0,0,0,1,0,0,1,1,0
42446,21.2,28.8,48.0,19.0,24.0,78.0,97.0,1009.6,0,2014,...,1,1,1,0,0,0,0,0,1,1
5989,16.8,28.0,39.0,19.0,22.0,74.0,60.0,1012.4,0,2015,...,0,0,0,1,1,0,1,1,0,0
39347,12.5,24.4,50.0,6.0,7.0,78.0,43.0,1014.6,0,2014,...,0,0,0,1,1,0,1,1,0,1
31998,6.0,17.2,33.0,9.0,22.0,95.0,55.0,1029.4,1,2009,...,0,1,0,0,1,0,1,1,0,1


---
# Evaluators
---

In [6]:
def TF(y_actu, y_pred):
    """Count the four confusion-matrix cells for binary 0/1 labels.

    Parameters
    ----------
    y_actu : array-like supporting element-wise ``==``
        Ground-truth labels; 1 is treated as positive and 0 as negative.
    y_pred : array-like supporting element-wise ``==``
        Predicted labels, same length as ``y_actu``.

    Returns
    -------
    tuple
        ``(TP, FP, TN, FN)`` -- note the order.  Entries whose value is
        neither 0 nor 1 are not counted in any cell.
    """
    predicted_pos, predicted_neg = y_pred == 1, y_pred == 0
    actual_pos, actual_neg = y_actu == 1, y_actu == 0

    def tally(pred_mask, actu_mask):
        # Number of positions where both masks are true.
        return np.sum(np.logical_and(pred_mask, actu_mask))

    true_pos = tally(predicted_pos, actual_pos)
    true_neg = tally(predicted_neg, actual_neg)
    false_pos = tally(predicted_pos, actual_neg)
    false_neg = tally(predicted_neg, actual_pos)
    return true_pos, false_pos, true_neg, false_neg

In [7]:
def eval(models, test_x, test_y):
    """Score one or more fitted models on a held-out set.

    NOTE: this function shadows Python's built-in ``eval`` inside the notebook;
    the name is kept so existing calls keep working.

    Parameters
    ----------
    models : iterable of sequences
        Each entry is ``[model, name]``: ``model`` must expose
        ``predict(test_x)`` returning 0/1 labels, ``name`` becomes the column
        label in the result.
    test_x : object
        Passed unchanged to every ``model.predict``.
    test_y : array-like
        True 0/1 labels, aligned with the predictions.

    Returns
    -------
    pandas.DataFrame
        One column per model with rows "Accuracy", "Precision", "Recall" and
        "F1".  Empty when ``models`` is empty.  A metric whose denominator is
        zero is reported as NaN.
    """

    def _ratio(numerator, denominator):
        # Undefined metrics become NaN explicitly instead of relying on a
        # warning-emitting 0/0 (or a ZeroDivisionError for plain ints).
        return numerator / denominator if denominator else float("nan")

    # Accumulate across models; rebuilding the dict inside the loop would keep
    # only the last model's scores.
    scoring = {}
    for m in models:
        model = m[0]
        TP, FP, TN, FN = TF(test_y, np.array(model.predict(test_x)))
        acc = _ratio(TP + TN, TP + TN + FP + FN)
        pre = _ratio(TP, TP + FP)
        re = _ratio(TP, FN + TP)
        f1 = _ratio(2 * (pre * re), pre + re)
        scoring[m[1]] = {"Accuracy": acc, "Precision": pre, "Recall": re, "F1": f1}
    return pd.DataFrame(scoring)

---
# Random Forest Implementation
---

In [31]:
class RandomForest():
    """Random-forest classifier written from scratch on top of a DataFrame.

    Every tree is grown on a bootstrap sample of the training rows, and every
    split looks only at a random subset of the feature columns; the split that
    minimises the weighted Gini index is kept.  Prediction is a majority vote
    over all trees.  Training runs inside ``__init__``.

    Internally each training row is a plain list ``[feature values..., label]``
    (label last), with the features in ``self.feature_names`` order.  Tree
    nodes are dicts ``{'index', 'value', 'left', 'right'}`` where ``index`` is
    a position in ``self.feature_names`` and a child is either another node
    dict or a class label.
    """

    ## Helper Functions ##

    def subsample(self):
        """Return ``self.n_sample`` training rows drawn with replacement."""
        return [self.rows[randrange(self.length)] for _ in range(self.n_sample)]

    def test_split(self, index, value, sample):
        """Partition ``sample`` into rows with ``row[index] < value`` and the rest."""
        left, right = [], []
        for row in sample:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right

    def gini_index(self, groups, classes):
        """Size-weighted Gini impurity of ``groups`` (0.0 means every group is pure).

        ``classes`` must be a flat iterable of the individual class labels.
        """
        n_instances = float(sum(len(group) for group in groups))
        gini = 0.0
        for group in groups:
            size = float(len(group))
            if size == 0:
                # An empty group contributes nothing (and would divide by zero).
                continue
            # Build the label list once per group rather than once per class.
            labels = [row[-1] for row in group]
            score = 0.0
            for class_val in classes:
                p = labels.count(class_val) / size
                score += p * p
            gini += (1.0 - score) * (size / n_instances)
        return gini

    def get_split(self, sample):
        """Find the best (feature, threshold) split of ``sample``.

        Only ``self.n_features`` randomly chosen feature positions are
        searched.  Returns ``{'index', 'value', 'groups'}`` where ``groups`` is
        the ``(left, right)`` partition produced by the winning split.
        """
        # Flat list of labels -- wrapping the set in a list would make every
        # class count zero and every Gini score 1.0.
        class_vals = list(set(row[-1] for row in sample))
        b_index, b_value, b_score, b_groups = None, None, float('inf'), None

        # Pick distinct feature positions.  The bound is capped at the number
        # of feature columns so the loop always terminates.
        n_columns = len(sample[0]) - 1
        wanted = min(self.n_features, n_columns)
        features = []
        while len(features) < wanted:
            index = randrange(n_columns)
            if index not in features:
                features.append(index)

        for index in features:
            # Repeated values give identical partitions, so each distinct
            # threshold is tried once, in order of first appearance.
            for value in dict.fromkeys(row[index] for row in sample):
                groups = self.test_split(index, value, sample)
                gini = self.gini_index(groups, class_vals)
                if gini < b_score:
                    b_index, b_value, b_score, b_groups = index, value, gini, groups
        return {'index': b_index, 'value': b_value, 'groups': b_groups}

    def to_terminal(self, group):
        """Return the most frequent label among the rows of ``group``."""
        outcomes = [row[-1] for row in group]
        return max(set(outcomes), key=outcomes.count)

    def split(self, root, depth):
        """Grow the tree below ``root`` in place, honouring max_depth/min_size."""
        left, right = root['groups']
        del(root['groups'])

        # The split did not separate anything: both children share one leaf.
        if not left or not right:
            root['left'] = root['right'] = self.to_terminal(left + right)
            return

        # Depth limit reached: close both sides with leaves.
        if depth >= self.max_depth:
            root['left'], root['right'] = self.to_terminal(left), self.to_terminal(right)
            return

        if len(left) <= self.min_size:
            root['left'] = self.to_terminal(left)
        else:
            root['left'] = self.get_split(left)
            self.split(root['left'], depth+1)

        if len(right) <= self.min_size:
            root['right'] = self.to_terminal(right)
        else:
            root['right'] = self.get_split(right)
            self.split(root['right'], depth+1)

    def build_tree(self, sample):
        """Build one decision tree from ``sample`` and return its root node."""
        root = self.get_split(sample)
        self.split(root, 1)
        return root

    def predict_one(self, node, row):
        """Walk ``row`` (feature values in ``self.feature_names`` order) down one tree."""
        if row[node['index']] < node['value']:
            if isinstance(node['left'], dict):
                return self.predict_one(node['left'], row)
            else:
                return node['left']
        else:
            if isinstance(node['right'], dict):
                return self.predict_one(node['right'], row)
            else:
                return node['right']

    def bagging_predict(self, trees, row):
        """Majority vote of ``trees`` for one row.

        ``row`` is an ``(index_label, values)`` pair, the shape yielded by
        ``DataFrame.iterrows()``.  When ``values`` is a Series its entries are
        looked up by feature name, so the column order of the source frame
        does not matter; otherwise it must already be in
        ``self.feature_names`` order.
        """
        values = row[1]
        if isinstance(values, pd.Series):
            values = [values[name] for name in self.feature_names]
        predictions = [self.predict_one(tree, values) for tree in trees]
        return max(set(predictions), key=predictions.count)

    ######################


    def __init__(self, train_data=None, y="RainTomorrow",
                 max_depth=3, min_size=1, n_trees=1,
                 sample_size=1.0, n_features=None, random_state=None):
        """Store the configuration and train the forest immediately.

        Parameters
        ----------
        train_data : pandas.DataFrame, optional
            Features plus the target column ``y`` (in any position).  When
            omitted, the notebook-level ``x_train_total`` is used, looked up at
            call time rather than frozen when the class is defined.
        y : str
            Name of the target column.
        max_depth : int
            Maximum depth of each tree.
        min_size : int
            A group with at most this many rows becomes a leaf.
        n_trees : int
            Number of trees in the forest.
        sample_size : float
            Bootstrap sample size as a fraction of the training rows.
        n_features : int, optional
            Number of features examined per split; defaults to the integer
            square root of the feature count (at least 1).
        random_state : int, optional
            When given, ``random.seed`` is called with it before training.
            This reseeds the module-level generator of ``random``.

        Raises
        ------
        ValueError
            If ``train_data`` has no rows, lacks column ``y``, or has no
            feature columns.
        """
        if train_data is None:
            train_data = x_train_total
        if len(train_data) == 0:
            raise ValueError("train_data must contain at least one row")

        self.data = train_data
        self.length = len(self.data)
        self.cols = list(train_data.columns.values)
        if y not in self.cols:
            raise ValueError(f"target column {y!r} not found in train_data")
        self.y = y
        # Splits are stored by position in this list, and predict() selects the
        # same columns by name, so train and test frames cannot get misaligned.
        self.feature_names = [col for col in self.cols if col != y]
        if not self.feature_names:
            raise ValueError("train_data must contain at least one feature column")

        # Plain-list rows avoid integer indexing on label-indexed Series and
        # are much cheaper to index in the split search.
        feature_rows = train_data[self.feature_names].to_numpy().tolist()
        labels = train_data[y].tolist()
        self.rows = [values + [label] for values, label in zip(feature_rows, labels)]

        self.max_depth = max_depth
        self.min_size = min_size
        self.n_trees = n_trees
        # The bootstrap size depends on sample_size only, not on n_trees.
        self.n_sample = max(1, round(self.length * sample_size))
        total_features = len(self.feature_names)
        if n_features is None:
            n_features = int(total_features ** 0.5)
        self.n_features = max(1, min(int(n_features), total_features))
        self.random_state = random_state
        self.train()

    def train(self):
        """(Re)build ``self.trees``, one tree per bootstrap sample."""
        if self.random_state is not None:
            seed(self.random_state)
        self.trees = [self.build_tree(self.subsample()) for _ in range(self.n_trees)]

    def predict(self, test=None):
        """Predict a label for every row of ``test``.

        ``test`` must contain every column in ``self.feature_names``; extra
        columns and a different column order are fine.  When omitted, the
        notebook-level ``x_test`` is used, looked up at call time.  Returns a
        list with one label per row.
        """
        if test is None:
            test = x_test
        feature_rows = test[self.feature_names].to_numpy().tolist()
        return [self.bagging_predict(self.trees, pair)
                for pair in zip(test.index, feature_rows)]

---
# RESULTS
---

In [None]:
# Train on the first 100 training rows only (presumably to keep the run time
# manageable), then score the forest on the held-out test set.
RF = RandomForest(
    train_data=x_train_total.iloc[:100],
)
eval(
    [[RF, "Random Forest"]],
    x_test,
    y_test['RainTomorrow'],
)