In [1]:
import numpy as np
import pandas as pd

In [2]:
# Assigning features and label variables
weather=['Sunny','Sunny','Overcast','Rainy','Rainy','Rainy','Overcast','Sunny','Sunny','Rainy','Sunny','Overcast','Overcast','Rainy']
temp=['Hot','Hot','Hot','Mild','Cool','Cool','Cool','Mild','Cool','Mild','Mild','Mild','Hot','Mild']
play=['No','No','Yes','Yes','Yes','No','Yes','No','Yes','Yes','Yes','Yes','Yes','No']

print ('Weather: ', weather)
print ('Temperature: ', temp)
print ('Play?:', play)

Weather:  ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast', 'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy']
Temperature:  ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild']
Play?: ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']


In [3]:
features = pd.DataFrame({'weather': weather, 'temp': temp}, columns=['weather', 'temp'])
features.head()

Unnamed: 0,weather,temp
0,Sunny,Hot
1,Sunny,Hot
2,Overcast,Hot
3,Rainy,Mild
4,Rainy,Cool


In [4]:
# Import LabelEncoder
from sklearn import preprocessing
#creating labelEncoder
le = preprocessing.LabelEncoder()
# Converting string labels into numbers.
features.weather = le.fit_transform(features.weather)
features.temp = le.fit_transform(features.temp)
label=le.fit_transform(play)

print ('Features: \n', features.head())
print ('Temp: ', label)

Features: 
    weather  temp
0        2     1
1        2     1
2        0     1
3        1     2
4        1     0
Temp:  [0 0 1 1 1 0 1 0 1 1 1 1 1 0]


In [15]:
# Explanation of priors code
"""n_samples, n_features = features.shape
classes = np.unique(label)
n_classes = len(classes)
prior = np.zeros(n_classes, dtype=np.float64)
idx,c = classes
X_c = features[label==c]
prior[idx] = X_c.shape[0]/float(n_samples)
print(prior[idx])"""

14 0.6428571428571429


In [5]:
class NaiveBayes:
    """
    X = features
    y = label
    idx = index
    c = class
    """
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64) # to obtain mean value
        self._var = np.zeros((n_classes, n_features), dtype=np.float64) # to obtain varians
        self._priors = np.zeros(n_classes, dtype=np.float64) # to obtain the priors value of the label (y)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)
            
    def _pdf(self, class_idx, x): # pdf = probability density function = Gaussian Naive Bayes is used in this program
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator
    
    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = posterior + prior
            posteriors.append(posterior)

        # return class with the highest posterior
        return self._classes[np.argmax(posteriors)]

# Source: https://www.youtube.com/watch?v=TLInuAorxqE

In [6]:
x = NaiveBayes()
x.fit(features, label)

In [7]:
x._classes

array([0, 1], dtype=int64)

In [7]:
print("Mean: \n", x._mean)
print("\nVariance: \n", x._var)
print("\nPriors: \n", x._priors)

Mean: 
 [[1.6        1.2       ]
 [0.77777778 1.11111111]]

Variance: 
 [[0.3        0.7       ]
 [0.69444444 0.86111111]]

Priors: 
 [0.35714286 0.64285714]


In [8]:
prediction = x._predict([[0,2]]) # 0:Overcast, 2:Mild
print ("Predicted Value:", prediction) # 0:No play, 1:Play

Predicted Value: 1


#### Calculate the accuracy of the model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score

X, y = datasets.make_classification(n_samples=1000,
                                   n_features=10, n_classes=2,
                                   random_state=123)

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.2, random_state=123)

nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print("Naive Bayes classification accuracy", accuracy_score(predictions, y_test)*100, '%')

Naive Bayes classification accuracy 96.5 %
