### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

---
### Upload Dataset

In [2]:
X_und = pd.read_csv("cleaned_rain_x.csv")
y_und = pd.read_csv("cleaned_rain_y.csv")

---
### Split into testing and training data

In [3]:
x_train, x_test, y_train, y_test = train_test_split(X_und, y_und, test_size=0.25, random_state=42)

---
### Create 'Total' Train DataFrame

In [4]:
x_train_total = pd.concat([x_train, y_train], axis=1)
x_train_total.drop(columns=['Unnamed: 0'], axis=1, inplace=True)
x_train_total.head()

Unnamed: 0,MinTemp,MaxTemp,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,RainToday,year,...,WindGustDirW,WindDir9amN,WindDir9amE,WindDir9amS,WindDir9amW,WindDir3pmN,WindDir3pmE,WindDir3pmS,WindDir3pmW,RainTomorrow
22845,7.2,13.5,46.0,17.0,31.0,76.0,61.0,1021.8,1,2015,...,1,0,0,0,1,0,0,1,1,0
42446,21.2,28.8,48.0,19.0,24.0,78.0,97.0,1009.6,0,2014,...,1,1,1,0,0,0,0,0,1,1
5989,16.8,28.0,39.0,19.0,22.0,74.0,60.0,1012.4,0,2015,...,0,0,0,1,1,0,1,1,0,0
39347,12.5,24.4,50.0,6.0,7.0,78.0,43.0,1014.6,0,2014,...,0,0,0,1,1,0,1,1,0,1
31998,6.0,17.2,33.0,9.0,22.0,95.0,55.0,1029.4,1,2009,...,0,1,0,0,1,0,1,1,0,1


---
# Evaluators
---

In [5]:
def TF(y_actu, y_pred):
    TP = np.sum(np.logical_and(y_pred == 1, y_actu == 1))
    TN = np.sum(np.logical_and(y_pred == 0, y_actu == 0))
    FP = np.sum(np.logical_and(y_pred == 1, y_actu == 0))
    FN = np.sum(np.logical_and(y_pred == 0, y_actu == 1))
    return TP, FP, TN, FN

In [6]:
def eval(models, test_x, test_y):
    for m in models:
        model = m[0]
        TP, FP, TN, FN = TF(test_y, np.array(model.predict(test_x)))
        acc = (TP + TN) / (TP + TN + FP + FN)
        pre = TP / (TP + FP)
        re = TP / (FN + TP)
        f1 = (2 * (pre * re)) / (pre + re)
        scoring = {m[1]:{"Accuracy":acc, "Precision":pre,"Recall":re,"F1":f1}}
    return pd.DataFrame(scoring)

---
# Naive Bayes Implementation
---

In [7]:
class NaiveBayes():

    def __init__(self, train_data=x_train_total, y="RainTomorrow"):
        self.data = train_data
        self.length = len(self.data)
        self.cols = list(train_data.columns.values)
        self.y=y
        self.cols.remove(self.y)
        self.probs = {}
        self.conds = {}
        self.train()
        return

    def train(self):
        # make probs for e and p!
        self.probs = self.data.groupby(self.y).size().div(self.length)

        # make a dict for each classification
        for classification in self.data[self.y].unique():
            self.conds[classification] = {}

        # get conditional probabilities for the rest of the data
        for mushroom_quality in self.cols: # https://stackoverflow.com/questions/37818063/how-to-calculate-conditional-probability-of-values-in-dataframe-pandas-python

            class_probs = self.data.groupby(mushroom_quality).size().div(self.length)
            conditionals = self.data.groupby([self.y, mushroom_quality]).size().div(self.length).div(class_probs, axis=0, level=mushroom_quality)

            for classification in self.data[self.y].unique():
                self.conds[classification][mushroom_quality] = {}
                for quality_type in conditionals[classification].keys():
                    self.conds[classification][mushroom_quality][quality_type] = conditionals[classification][quality_type]

        return
  
    def predict(self, input=x_test):
        output = []
        for index, row in input.iterrows():
            guess = {}
            for edability in self.data[self.y].unique():
                guess[edability] = 1
                for mushroom_quality in self.cols:
                    try:
                        quality_in_class = self.conds[edability][mushroom_quality][row[mushroom_quality]]
                    except:
                        quality_in_class = (1 / self.length + len(self.cols))
          
                    guess[edability] += np.log(quality_in_class)
                guess[edability] += np.log(self.probs[edability])
            output.append(max(guess, key=guess.get))
        return output

---
# RESULTS
---

In [75]:
NB = NaiveBayes()
eval([[NB, "Naive Bayes"]], x_test, y_test['RainTomorrow'])

Unnamed: 0,Naive Bayes
Accuracy,0.741081
F1,0.733452
Precision,0.76212
Recall,0.706862
