---
# Data Processing
---

### Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OrdinalEncoder
from random import seed
from random import randrange

In [3]:
# Upload
X_und = pd.read_csv("cleaned_rain_x.csv")
y_und = pd.read_csv("cleaned_rain_y.csv")

---
### Split into testing and training data

In [4]:
x_train, x_test, y_train, y_test = train_test_split(X_und, y_und, test_size=0.25, random_state=42)

---
# Machine Learning Models
---

## Logistic Regression Implementation

In [75]:
# Add bias column
def add_const(self, dataframe):
    n, k = dataframe.shape
    ones = np.ones((n, 1))
    return np.concatenate([ones, dataframe], axis = 1)

def prediction(self, values):
    if values >= 0.99:
        return 1
    else:
        return 0

In [81]:
class GDLogReg:
    def __init__(self):
        self.theta = None

    def predict(self, X):
        X = add_const(X)
        predicted_values = X @ self.theta
        
        return predicted_values

    def fit(self, X, y, l_rate=.01, epochs=100, epsilon=.0001, log=False):

        # Add bias column
        X = add_const(X)
        
        n, d = X.shape

        # 1. initialize theta at random
        self.theta = np.zeros(d)

        # 2. repeat until stopping conditions
        for epoch in range(epochs):
            theta_old = self.theta
            
            # Apply sigmoid function
            h_theta = (1/ (1 + np.exp(-(X @ self.theta))))
            residuals = h_theta - y 
            gradient = residuals @ X

            # 3. Update theta
            self.theta = theta_old - l_rate * gradient
            difference = np.linalg.norm(self.theta - theta_old)

            if log:
                print(f'iter: {epoch}\tdif: {difference}')

            # Check for convergence
            if epsilon > difference:
                break 

        return self

lr_gd = GDLogReg()
lr_gd.fit(x_train, y_train, l_rate=.01, epsilon=1e-6, log = False)
pd.Series(lr_gd.theta, index = ['constant'] + list(x_train))

  h_theta = (1/ (1 + np.exp(-(X @ self.theta))))


constant            140.7950
MinTemp           13264.2810
MaxTemp          -23030.3070
WindGustSpeed     76666.5300
WindSpeed9am      18793.2050
WindSpeed3pm      20391.6950
Humidity9am      129631.7050
Humidity3pm      223517.8200
Pressure9am      104485.9265
RainToday          3003.7700
year             282751.3950
month               941.0800
day                3389.3000
WindGustDirN        611.8350
WindGustDirE       -923.7700
WindGustDirS       -425.2600
WindGustDirW        870.4600
WindDir9amN        1029.8750
WindDir9amE        -908.8450
WindDir9amS        -711.8200
WindDir9amW         699.1350
WindDir3pmN         755.1350
WindDir3pmE        -731.8600
WindDir3pmS        -609.8200
WindDir3pmW         695.2400
dtype: float64

In [82]:
def get_indexed(dictionary):
    
    return pd.DataFrame(dictionary).set_index(['l_rate', 'epochs'])

    l_rates, epochs = [.01, .1, .5], [10, 50, 100]
    cross_entropy, metrics = [], []

    for l_rate in l_rates:
        for epoc in epochs:
            lr_gd = GDLogReg().fit(x_train, y_train, l_rate, epoc)

            train_pred = lr_gd.predict(x_train)
            test_pred = lr_gd.predict(x_test)

        
        # Cross entropy loss objective
        params_cross_entropy = {
            'l_rate': l_rate,
            'epochs': epoc
        }
        
        params_cross_entropy.update(
                                    {
                                        'cross_entropy_train': log_loss(y_train, train_pred),
                                        'cross_entropy_test': log_loss(y_test, test_pred)
                                    })
        cross_entropy.append(params_cross_entropy)
        
        # Metrics for 100 iterations
        params_metrics = {
            'l_rate': l_rate,
            'epochs': epoc
        }
        
        if epoc == 100:
            
            params_metrics.update(
                                    {
                                     'accuracy_train': accuracy_score(y_train, train_pred),
                                     'accuracy_test': accuracy_score(y_test, test_pred),
                                     'precision_train': precision_score(y_train, train_pred),
                                     'precision_test': precision_score(y_test, test_pred),
                                     'recall_train': recall_score(y_train, train_pred),
                                     'recall_test': recall_score(y_test, test_pred),
                                     'f1_train': f1_score(y_train, train_pred),
                                     'f1_test': f1_score(y_test, test_pred)
                                    })
            metrics.append(params_metrics)

In [83]:
def get_indexed(dictionary):
    
    return pd.DataFrame(dictionary).set_index(['l_rate', 'epochs'])

In [84]:
get_indexed(cross_entropy)

Unnamed: 0_level_0,Unnamed: 1_level_0,cross_entropy_train,cross_entropy_test
l_rate,epochs,Unnamed: 2_level_1,Unnamed: 3_level_1
0.01,100,17.315379,17.133016
0.1,100,17.315379,17.133016
0.5,100,17.315379,17.133016


In [85]:
get_indexed(metrics)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy_train,accuracy_test,precision_train,precision_test,recall_train,recall_test,f1_train,f1_test
l_rate,epochs,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.01,100,0.49868,0.50396,0.49868,0.50396,1.0,1.0,0.665492,0.670177
0.1,100,0.49868,0.50396,0.49868,0.50396,1.0,1.0,0.665492,0.670177
0.5,100,0.49868,0.50396,0.49868,0.50396,1.0,1.0,0.665492,0.670177


## Naive Bayes Implementation

In [5]:
x_train_total = pd.concat([x_train, y_train], axis=1)
x_train_total.drop(columns=['Unnamed: 0'], axis=1, inplace=True)
x_train_total.head()

Unnamed: 0,MinTemp,MaxTemp,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,RainToday,year,...,WindGustDirW,WindDir9amN,WindDir9amE,WindDir9amS,WindDir9amW,WindDir3pmN,WindDir3pmE,WindDir3pmS,WindDir3pmW,RainTomorrow
22845,7.2,13.5,46.0,17.0,31.0,76.0,61.0,1021.8,1,2015,...,1,0,0,0,1,0,0,1,1,0
42446,21.2,28.8,48.0,19.0,24.0,78.0,97.0,1009.6,0,2014,...,1,1,1,0,0,0,0,0,1,1
5989,16.8,28.0,39.0,19.0,22.0,74.0,60.0,1012.4,0,2015,...,0,0,0,1,1,0,1,1,0,0
39347,12.5,24.4,50.0,6.0,7.0,78.0,43.0,1014.6,0,2014,...,0,0,0,1,1,0,1,1,0,1
31998,6.0,17.2,33.0,9.0,22.0,95.0,55.0,1029.4,1,2009,...,0,1,0,0,1,0,1,1,0,1


In [38]:
class NaiveBayes():

    def __init__(self, train_data=x_train_total, y="RainTomorrow"):
        self.data = train_data
        self.length = len(self.data)
        self.cols = list(train_data.columns.values)
        self.y=y
        self.cols.remove(self.y)
        self.probs = {}
        self.conds = {}
        self.train()
        return

    def train(self):
        # make probs for e and p!
        self.probs = self.data.groupby(self.y).size().div(self.length)

        # make a dict for each classification
        for classification in self.data[self.y].unique():
            self.conds[classification] = {}

        # get conditional probabilities for the rest of the data
        for mushroom_quality in self.cols: # https://stackoverflow.com/questions/37818063/how-to-calculate-conditional-probability-of-values-in-dataframe-pandas-python

            class_probs = self.data.groupby(mushroom_quality).size().div(self.length)
            conditionals = self.data.groupby([self.y, mushroom_quality]).size().div(self.length).div(class_probs, axis=0, level=mushroom_quality)

            for classification in self.data[self.y].unique():
                self.conds[classification][mushroom_quality] = {}
                for quality_type in conditionals[classification].keys():
                    self.conds[classification][mushroom_quality][quality_type] = conditionals[classification][quality_type]

        return
  
    def predict(self, input=x_test):
        output = []
        for index, row in input.iterrows():
            guess = {}
            for edability in self.data[self.y].unique():
                guess[edability] = 1
                for mushroom_quality in self.cols:
                    try:
                        quality_in_class = self.conds[edability][mushroom_quality][row[mushroom_quality]]
                    except:
                        quality_in_class = (1 / self.length + len(self.cols))
          
                    guess[edability] += np.log(quality_in_class)
                guess[edability] += np.log(self.probs[edability])
            output.append(max(guess, key=guess.get))
        return output

In [39]:
NB = NaiveBayes()

In [43]:
guesses = NB.predict()
print(guesses[1:8])
print(list(y_test.iloc[1:8]))

[1, 0, 1, 0, 0, 0, 1]
['Unnamed: 0', 'RainTomorrow']


In [41]:
def TF(y_actu, y_pred):
    TP = np.sum(np.logical_and(y_pred == 1, y_actu == 1))
    TN = np.sum(np.logical_and(y_pred == 0, y_actu == 0))
    FP = np.sum(np.logical_and(y_pred == 1, y_actu == 0))
    FN = np.sum(np.logical_and(y_pred == 0, y_actu == 1))
    return TP, FP, TN, FN

In [42]:
TP, FP, TN, FN = TF(y_test, np.array(guesses))
TP, FP, TN, FN

ValueError: operands could not be broadcast together with shapes (12753,) (12753,2) 

In [35]:
y_test

Unnamed: 0.1,Unnamed: 0,RainTomorrow
19893,19893,0
36271,36271,1
43781,43781,1
27999,27999,1
21025,21025,0
...,...,...
16507,16507,0
35878,35878,1
4939,4939,0
12341,12341,0


In [47]:
acc = (TP + TN) / (TP + TN + FP + FN)
pre = TP / (TP + FP)
re = TP / (FN + TP)
f1 = (2 * (pre * re)) / (pre + re)
NB_scoring = {"NB hand-coded":{"Accuracy":acc, "Precision":pre,"Recall":re,"F1":f1}}

In [48]:
pd.DataFrame(NB_scoring)

Unnamed: 0,NB hand-coded
Accuracy,0.741081
F1,0.733452
Precision,0.76212
Recall,0.706862


In [32]:
class random_forest_classifier():
    
    ## Helper Functions ##
    
    def subsample(self):
        sample = []
        while len(sample) < self.n_sample:
            index = randrange(self.length)
            sample.append(self.data.iloc[index])
        return sample
    
    def test_split(self, index, value, sample):
        left, right = [], []
        for row in sample:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right
    
    def gini_index(self, groups, classes):
        n_instances = float(sum([len(group) for group in groups]))
        gini = 0.0
        for group in groups:
            size = float(len(group))
            if size == 0:
                continue
            score = 0.0
            for class_val in classes:
                p = [row[-1] for row in group].count(class_val) / size
                score += p * p
            gini += (1.0 - score) * (size / n_instances)
        return gini
    
    def get_split(self, sample):
        class_vals = [set(row[-1] for row in sample)]
        b_index, b_value, b_score, b_groups = 999, 999, 999, None
        features = []
        while len(features) < self.length:
            index = randrange(len(sample[0])-1)
            if index not in features:
                features.append(index)
        for index in features:
            for row in sample:
                groups = self.test_split(index, row[index], sample)
                gini = self.gini_index(groups, class_vals)
                if gini < b_score:
                    b_index, b_value, b_score, b_groups = index, row[index], gini, groups
        return {'index':b_index, 'value':b_value, 'groups':b_groups}
    
    def to_terminal(self, group):
        outcomes = [row[-1] for row in group]
        return max(set(outcomes), key=outcomes.count)
    
    def split(self, root, depth):
        left, right = root['groups']
        del(root['groups'])

        if not left or not right:
            root['left'] = root['right'] = self.to_terminal(left + right)
            return

        if depth >= self.max_depth:
            root['left'], root['right'] = self.to_terminal(left), self.to_terminal(right)
            return

        if len(left) <= self.min_size:
            root['left'] = self.to_terminal(left)
        else:
            root['left'] = self.get_split(left)
            self.split(root['left'], depth+1)

        if len(right) <= self.min_size:
            root['right'] = self.to_terminal(right)
        else:
            root['right'] = self.get_split(right)
            self.split(root['right'], depth+1)
    
    def build_tree(self, sample):
        root = self.get_split(sample)
        self.split(root, 1)
        return root
    
    def predict_one(self, node, row):
        print(row)
        print(node)
        if row[node['index']] < node['value']:
            if isinstance(node['left'], dict):
                return self.predict_one(node['left'], row)
            else:
                return node['left']
        else:
            if isinstance(node['right'], dict):
                return self.predict_one(node['right'], row)
            else:
                return node['right']
    
    def bagging_predict(self, trees, row):
        predictions = [self.predict_one(tree, row) for tree in trees]
        return max(set(predictions), key=predictions.count)
 
    ######################
    

    def __init__(self, train_data=x_train_total, y="RainTomorrow",
                 max_depth=3, min_size=1, n_trees=1):
        self.data = train_data
        self.length = len(self.data)
        self.cols = list(train_data.columns.values)
        self.y = y
        self.max_depth = max_depth
        self.min_size = min_size
        self.n_trees = n_trees
        self.n_sample = round(self.length * n_trees)
        self.train()
        return

    def train(self):
        self.trees = []
        for i in range(self.n_trees):
            sample = self.subsample()
            tree = self.build_tree(sample)
            self.trees.append(tree)
  
    def predict(self, test=x_test):
        predictions = [self.bagging_predict(self.trees, row) for row in test.iterrows()]
        return predictions

In [33]:
RFC = random_forest_classifier(train_data=x_train_total.iloc[0:10])

In [34]:
p = RFC.predict()
p

(19893, Unnamed: 0       19893.0
MinTemp             13.3
MaxTemp             27.3
WindGustSpeed       39.0
WindSpeed9am        19.0
WindSpeed3pm        11.0
Humidity9am         65.0
Humidity3pm         52.0
Pressure9am       1022.3
RainToday            0.0
year              2011.0
month                2.0
day                 14.0
WindGustDirN         0.0
WindGustDirE         1.0
WindGustDirS         1.0
WindGustDirW         0.0
WindDir9amN          0.0
WindDir9amE          1.0
WindDir9amS          1.0
WindDir9amW          0.0
WindDir3pmN          0.0
WindDir3pmE          1.0
WindDir3pmS          0.0
WindDir3pmW          0.0
Name: 19893, dtype: float64)
{'index': 13, 'value': 0.0, 'left': 1.0, 'right': 1.0}


IndexError: tuple index out of range

In [6]:
x_train_total.iloc[0:1000]

Unnamed: 0,MinTemp,MaxTemp,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,RainToday,year,...,WindGustDirW,WindDir9amN,WindDir9amE,WindDir9amS,WindDir9amW,WindDir3pmN,WindDir3pmE,WindDir3pmS,WindDir3pmW,RainTomorrow
22845,7.2,13.5,46.0,17.0,31.0,76.0,61.0,1021.8,1,2015,...,1,0,0,0,1,0,0,1,1,0
42446,21.2,28.8,48.0,19.0,24.0,78.0,97.0,1009.6,0,2014,...,1,1,1,0,0,0,0,0,1,1
5989,16.8,28.0,39.0,19.0,22.0,74.0,60.0,1012.4,0,2015,...,0,0,0,1,1,0,1,1,0,0
39347,12.5,24.4,50.0,6.0,7.0,78.0,43.0,1014.6,0,2014,...,0,0,0,1,1,0,1,1,0,1
31998,6.0,17.2,33.0,9.0,22.0,95.0,55.0,1029.4,1,2009,...,0,1,0,0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47092,8.7,14.9,54.0,20.0,26.0,88.0,44.0,1012.5,1,2016,...,1,0,0,1,0,0,0,0,1,1
5030,3.9,21.0,37.0,4.0,20.0,44.0,23.0,1021.7,0,2009,...,0,0,0,1,1,0,1,1,0,0
26012,13.1,29.6,39.0,6.0,7.0,67.0,50.0,1006.6,0,2016,...,1,0,1,1,0,0,1,1,0,1
41356,24.0,31.3,35.0,13.0,24.0,75.0,71.0,1009.1,1,2012,...,0,0,0,1,0,1,1,0,0,1
