<a href="https://colab.research.google.com/github/bdugick89/Data-Science-Bootcamp/blob/main/4c1_worked_example_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load Pandas, plotting and [Sklearn](https://scikit-learn.org/stable/) packages.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing

## Naive Bayes by hand


### Copy the data from the lecture example

In [2]:
# Table pasted from the lecture example: one token per line, the two column
# names first, then alternating Weather / Play values.
lecture_table = """
Weather
Play
Rainy
No
Sunny
Yes
Overcast
Yes
Overcast
No
Sunny
Yes
Rainy
No
Rainy
No
Sunny
Yes
Overcast
Yes
Overcast
Yes
Rainy
No
Rainy
Yes

"""

# split() with no arguments breaks on any run of whitespace, so the blank
# lines at the start and end of the literal produce no empty tokens.
mixed = lecture_table.split()
mixed


['Weather',
 'Play',
 'Rainy',
 'No',
 'Sunny',
 'Yes',
 'Overcast',
 'Yes',
 'Overcast',
 'No',
 'Sunny',
 'Yes',
 'Rainy',
 'No',
 'Rainy',
 'No',
 'Sunny',
 'Yes',
 'Overcast',
 'Yes',
 'Overcast',
 'Yes',
 'Rainy',
 'No',
 'Rainy',
 'Yes']

In [3]:
# Skip the two header tokens; from there on the tokens alternate
# Weather (even offsets) and Play (odd offsets).
weather, play = mixed[2::2], mixed[3::2]
print( weather, play, sep="\n" )

['Rainy', 'Sunny', 'Overcast', 'Overcast', 'Sunny', 'Rainy', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy', 'Rainy']
['No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes']


In [4]:
# Start from an empty frame; the columns are attached one at a time below.
tennis_df = pd.DataFrame(data=None)
tennis_df

In [5]:
# Attach the Weather observations as the first column (mutates tennis_df in place).
tennis_df["Weather"] = weather
tennis_df

Unnamed: 0,Weather
0,Rainy
1,Sunny
2,Overcast
3,Overcast
4,Sunny
5,Rainy
6,Rainy
7,Sunny
8,Overcast
9,Overcast


In [6]:
# Attach the matching Play outcomes as the second column (mutates tennis_df in place).
tennis_df["Play"] = play
tennis_df

Unnamed: 0,Weather,Play
0,Rainy,No
1,Sunny,Yes
2,Overcast,Yes
3,Overcast,No
4,Sunny,Yes
5,Rainy,No
6,Rainy,No
7,Sunny,Yes
8,Overcast,Yes
9,Overcast,Yes


### Weather probabilities

In [7]:
# Number of days observed for each Weather category (most frequent first).
weather_column = tennis_df["Weather"]
weather_counts = weather_column.value_counts()
weather_counts

Rainy       5
Overcast    4
Sunny       3
Name: Weather, dtype: int64

In [8]:
# Total number of observations, obtained by summing the per-category counts.
# len(tennis_df["Weather"]) gives the same number here because no Weather
# value is missing (value_counts() leaves missing values out by default).
weather_total = weather_counts.sum()
weather_total


12

In [9]:
# Marginal probability of each Weather category: count / total observations.
p_weather = weather_counts.div(weather_total)
p_weather

Rainy       0.416667
Overcast    0.333333
Sunny       0.250000
Name: Weather, dtype: float64

### Play probabilities

In [10]:
# Number of days for each Play outcome, ordered by label (No, Yes)
# rather than by frequency.
play_counts = (
    tennis_df["Play"]
    .value_counts()
    .sort_index()
)
play_counts

No     5
Yes    7
Name: Play, dtype: int64

In [11]:
# Total number of observations, obtained by summing the per-outcome counts.
play_total = play_counts.sum()
play_total

12

In [12]:
# Marginal (prior) probability of each Play outcome: count / total observations.
p_play = play_counts.div(play_total)
p_play

No     0.416667
Yes    0.583333
Name: Play, dtype: float64

### Conditional probabilities

In [13]:
# Joint counts of every (Weather, Play) combination that occurs in the data.
# These are joint counts, not conditional ones: they are divided by a marginal
# count later to obtain a conditional probability.
# The grouping columns are named explicitly so the (Weather, Play) key order
# used in the lookups below does not depend on which columns tennis_df
# happens to contain or on the order in which they were added.
cond_counts = tennis_df.value_counts(subset=["Weather", "Play"])
cond_counts

Weather   Play
Rainy     No      4
Overcast  Yes     3
Sunny     Yes     3
Overcast  No      1
Rainy     Yes     1
dtype: int64

In [14]:
# Number of days that were both Overcast and played (key order: Weather, Play).
cond_counts.loc[("Overcast", "Yes")]

3

### Bayes theorem

P(Yes|Overcast) = P(Overcast|Yes) * P(Yes) / P(Overcast)

In [15]:
# Likelihood P(Overcast | Yes): among the days with Play == "Yes",
# the fraction that were Overcast.
overcast_and_yes = cond_counts["Overcast", "Yes"]
yes_days = play_counts["Yes"]
p_overcast_yes = overcast_and_yes / yes_days
p_overcast_yes

0.42857142857142855

In [16]:
# Prior P(Yes): overall fraction of days on which play happened.
p_yes = p_play.loc["Yes"]
p_yes

0.5833333333333334

In [17]:
# Evidence P(Overcast): overall fraction of days that were Overcast.
p_overcast = p_weather.loc["Overcast"]
p_overcast

0.3333333333333333

In [18]:
# Bayes' theorem: posterior = likelihood * prior / evidence.
# The multiplication is done first, then the division, to keep the same
# floating-point evaluation order as a single left-to-right expression.
likelihood_times_prior = p_overcast_yes * p_yes
p_yes_overcast = likelihood_times_prior / p_overcast
p_yes_overcast

0.75

In [19]:
# Aside on numeric precision: Python integers are arbitrary-precision, so
# adding 1 to this very large integer is exact (only the last digit changes).
109823740981723407891230948701239874012398734098127304987213098470912837409872134897012987340897123400000+1

109823740981723407891230948701239874012398734098127304987213098470912837409872134897012987340897123400001

In [20]:
# Converting the same integer to a float keeps only the leading significant
# digits; everything beyond them is rounded away.
float(109823740981723407891230948701239874012398734098127304987213098470912837409872134897012987340897123400000)

1.0982374098172341e+104

In [21]:
# At this magnitude adding 1 to the float is lost to rounding: the result
# prints exactly the same as the float without the +1.
float(109823740981723407891230948701239874012398734098127304987213098470912837409872134897012987340897123400000)+1


1.0982374098172341e+104

In [22]:
# Round-trip the integer through float and subtract the original: the
# non-zero difference is the rounding error introduced by the conversion.
int(float(109823740981723407891230948701239874012398734098127304987213098470912837409872134897012987340897123400000))-109823740981723407891230948701239874012398734098127304987213098470912837409872134897012987340897123400000

1152566635431384724391760522603672167805625340388673786876480625124286845754148777789120

## Gaussian Naive Bayes

In [23]:
# Separate the target column from the feature columns.
target = tennis_df["Play"]
features = tennis_df.drop(columns=["Play"])

In [24]:
# Label-encode the Weather strings as integer codes.
# In the printed result Overcast -> 0, Rainy -> 1, Sunny -> 2.
# Note: integer codes are not the best way to encode weather; other
# encoding methods are covered later on.
le = preprocessing.LabelEncoder()
weather_column = features["Weather"]
weather_encoded = le.fit_transform(weather_column)
print('Weather:', weather_encoded)


Weather: [1 2 0 0 2 1 1 2 0 0 1 1]


In [25]:

# Encode the Play target (No -> 0, Yes -> 1) with its own LabelEncoder.
# Re-fitting the shared `le` here would overwrite the Weather classes it
# learned, so the Weather codes could no longer be decoded (or new Weather
# values encoded) with it afterwards.
target_le = preprocessing.LabelEncoder()
label_encoded = target_le.fit_transform(target)
print('Label:', label_encoded)

Label: [0 1 1 0 1 0 0 1 1 1 0 1]


In [26]:
# Create a Gaussian Naive Bayes classifier
model = GaussianNB()

# Train the model: the 1-D array of Weather codes is reshaped into a
# single-column 2-D feature matrix (one row per day).
X_train = weather_encoded.reshape(-1, 1)
model.fit(X_train, label_encoded)

# Predict the Play outcome for one Overcast day
overcast_sample = [[0]]                # 0: Overcast
predicted = model.predict(overcast_sample)
print( "Predicted Value:", predicted ) # 0: No    1: Yes


Predicted Value: [1]


In [None]:
# Class probabilities for one Overcast day (Weather code 0);
# columns follow the encoded labels: [P(No), P(Yes)].
overcast_proba = model.predict_proba([[0]])
overcast_proba

array([[0.28619764, 0.71380236]])

In [27]:
# The feature matrix passed to fit(): the Weather codes as a single column,
# one row per observation.
weather_encoded.reshape((-1, 1))

array([[1],
       [2],
       [0],
       [0],
       [2],
       [1],
       [1],
       [2],
       [0],
       [0],
       [1],
       [1]])

In [28]:
# Shape of that feature matrix: (number of observations, number of features).
weather_encoded.reshape((-1, 1)).shape

(12, 1)