# Cloudy With A Chance of Burst!☁️💥
## Gassian NB Classifier

In [3]:
##imports
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

### Load the Dataset and Preprocess

In [4]:
dataset = pd.read_csv('cloudpredictionsystemproject.csv')
dataset = dataset.drop(['Date', 'Evaporation', 'Sunshine'], axis=1)

# Fill numeric column NAs with mean
numeric_columns = dataset.select_dtypes(include=np.number).columns
dataset[numeric_columns] = dataset[numeric_columns].fillna(round(dataset[numeric_columns].mean(),2))

# Fill non-numeric column NAs with forward fill
non_numeric_columns = dataset.select_dtypes(exclude=np.number).columns
dataset[non_numeric_columns] = dataset[non_numeric_columns].fillna(method='ffill')

dataset

Unnamed: 0,Location,MinimumTemperature,MaximumTemperature,Rainfall,WindGustDirection,WindGustSpeed,WindDirection9am,WindDirection3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temperature9am,Temperature3pm,CloudBurst Today,CloudBurstTomorrow
0,Albury,13.4,22.90,0.6,W,44.00,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,8.00,4.51,16.9,21.8,No,No
1,Albury,7.4,25.10,0.0,WNW,44.00,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,4.45,4.51,17.2,24.3,No,No
2,Albury,12.9,25.70,0.0,WSW,46.00,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,4.45,2.00,21.0,23.2,No,No
3,Albury,9.2,28.00,0.0,NE,24.00,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,4.45,4.51,18.1,26.5,No,No
4,Albury,17.5,32.30,1.0,W,41.00,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,7.00,8.00,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,Uluru,2.8,23.40,0.0,E,31.00,SE,ENE,13.0,11.0,51.0,24.0,1024.6,1020.3,4.45,4.51,10.1,22.4,No,No
145456,Uluru,3.6,25.30,0.0,NNW,22.00,SE,N,13.0,9.0,56.0,21.0,1023.5,1019.1,4.45,4.51,10.9,24.5,No,No
145457,Uluru,5.4,26.90,0.0,N,37.00,SE,WNW,9.0,9.0,53.0,24.0,1021.0,1016.8,4.45,4.51,12.5,26.1,No,No
145458,Uluru,7.8,27.00,0.0,SE,28.00,SSE,N,13.0,7.0,51.0,24.0,1019.4,1016.5,3.00,2.00,15.1,26.0,No,No


In [5]:
#convert to numpy
dataset = dataset.to_numpy()

### Shuffle dataset and split into train and test

In [6]:
np.random.shuffle(dataset)

train_size = int(0.7 * len(dataset))

X_train = dataset[:train_size,:-1]
Y_train = dataset[:,-1]

X_test = dataset[train_size:,:-1]
Y_test = dataset[train_size:,-1]

### Calculate Prior Probabilities

In [7]:
# Cloudburst
prior_yes = np.sum(Y_train == "Yes")/len(Y_train)

# No cloudburst
prior_no = np.sum(Y_train == "No")/len(Y_train)

prior_yes,  prior_no

(0.2252096796370136, 0.7747903203629863)

### Calculate Normal Vector given the parameters

In [8]:
total_yes = np.sum(Y_train == "Yes")
total_no = np.sum(Y_train == "No")

def gaussian(mean, std, x):
    return (1/(np.sqrt(2*np.pi)*std**2)) * np.exp(-0.5* ((x-mean))/(std)**2)

def conditional_probability(row):
    yes = []
    no = []
    for i in range(len(row)):
        if type(row[i]) == float:
            valyes=[]
            valno=[]
            for j in range(len(X_train)):
                if Y_train[j] == "Yes":
                    valyes.append(X_train[:,i][j])
                else:
                    valno.append(X_train[:,i][j])
            meanyes = np.mean(valyes)
            stdyes = np.std(valyes)
            meanno = np.mean(valno)
            stdno = np.std(valno)
            yes.append(gaussian(meanyes, stdyes, row[i]))
            no.append(gaussian(meanno, stdno, row[i]))
        else:            
            countyes = 0
            countno = 0
            for j in range(len(X_train)):
                if Y_train[j] == "Yes" and X_train[:,i][j] == row[i]:
                    countyes += 1
                elif Y_train[j] == "No" and X_train[:,i][j] == row[i]:
                    countno += 1
            yes.append(countyes/total_yes)
            no.append(countno/total_no)

    return yes, no

In [11]:
def bayes(row):
    cond_yes, cond_no = conditional_probability(row)
    prob_yes = np.array(cond_yes) * prior_yes
    prob_no = np.array(cond_no) * prior_no
    return ["Yes" if prob_yes[i] > prob_no[i] else "No" for i in range(len(prob_yes))]

print(bayes(X_test[0]), Y_test[0])

['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No'] No


### Testing