<a href="https://colab.research.google.com/github/azkalltlhn/NaiveBayes_Algorithm/blob/main/Task/NaiveBayesAlgorithm_Azka_Lailatul_Hana.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

**Load the dataset**

In [4]:
# Read the mushroom dataset from 'mushrooms.csv' (the path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv('mushrooms.csv')
# Keep head() as the last expression so the notebook renders a preview of the first rows.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


**Encode the features into Numerical data**

In [5]:
# A single LabelEncoder instance is reused for every column of the frame.
encoder = LabelEncoder()

# Apply the encoder to each of the columns (the target column 'class' included).
# NOTE(review): fit_transform refits the encoder on every column, so after this cell
# `encoder` presumably only retains the mapping of the last column processed -- keep
# one encoder per column if the integer codes ever need to be mapped back to the
# original letters. `df` is rebound here, so the raw letter-coded frame is no longer
# reachable under this name.
df = df.apply(encoder.fit_transform)

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


**Split the dataset into train and test parts**

In [6]:
# Separate the target column from the feature columns

X = df.drop(columns=['class'])
y = df['class']

# Fix the seed so that the split -- and therefore every prediction and metric
# computed from it -- is identical on each Restart & Run All. Without it,
# train_test_split shuffles differently on every execution.
RANDOM_STATE = 42

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

print("X_train = ", X_train.shape)
print("y_train = ", y_train.shape)
print("X_test = ", X_test.shape)
print("y_test = ", y_test.shape)

X_train =  (5686, 22)
y_train =  (5686,)
X_test =  (2438, 22)
y_test =  (2438,)


**Building Our Classifier**

In [7]:
# First, we calculate the prior probability, which is simply the fraction of training points that belong to the given class.
# For example, if 60% of the mushrooms in the training set are edible, then the prior probability of the edible class is 0.6.

def prior(y_train, label):
    """Return the prior probability of ``label``.

    The prior is the share of entries in ``y_train`` that are equal to
    ``label``.

    Parameters
    ----------
    y_train : array-like with a ``shape`` attribute
        Training labels (e.g. a NumPy array or a pandas Series).
    label : scalar
        The class whose prior is requested.

    Returns
    -------
    float
        Number of matching entries divided by the total number of entries.
    """
    n_samples = float(y_train.shape[0])
    n_matching = np.sum(y_train == label)

    return n_matching / n_samples

**Next, we define a function that calculates the conditional probability of a feature value given a class. These conditional probabilities are then multiplied together to obtain the likelihood.**

In [8]:
def cond_prob(X_train, y_train, feat_col, feat_val, label, alpha=0.0):
    """Estimate P(feature == feat_val | class == label) from the training data.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; ``feat_col`` must be one of its columns.
    y_train : pandas.Series or array-like
        Training labels aligned row-by-row with ``X_train``.
    feat_col : hashable
        Name of the feature column to inspect.
    feat_val : scalar
        The feature value whose conditional probability is requested.
    label : scalar
        The class to condition on.
    alpha : float, optional
        Additive (Laplace/Lidstone) smoothing strength. The default ``0.0``
        gives the plain relative frequency, exactly as before. A positive
        value keeps a feature value that never co-occurs with ``label`` in the
        training data from getting probability 0, which would otherwise zero
        out the whole product of conditionals.

    Returns
    -------
    float
        ``(count + alpha) / (class_count + alpha * n_values)`` where ``count``
        is the number of rows of class ``label`` whose ``feat_col`` equals
        ``feat_val``, ``class_count`` the number of rows of class ``label`` and
        ``n_values`` the number of distinct values of ``feat_col`` in
        ``X_train``.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Getting all the training rows that belong to the requested class.
    # The mask is built once and reused for both the filter and the count.
    class_mask = y_train == label
    X_filtered = X_train[class_mask]

    numerator = np.sum(X_filtered[feat_col] == feat_val)
    denominator = np.sum(class_mask)

    if alpha == 0:
        # Unsmoothed estimate: identical arithmetic to the original implementation.
        return numerator / float(denominator)

    # Smoothed estimate: pretend every distinct feature value was seen
    # `alpha` extra times within this class.
    n_values = len(np.unique(X_train[feat_col]))
    return (numerator + alpha) / (float(denominator) + alpha * n_values)

In [9]:
## Now time to calculate the posterior probability and make predictions

def predict(X_train, y_train, xtest):
    """Predict the class label of one sample with a categorical Naive Bayes rule.

    For every class present in ``y_train`` the unnormalised posterior

        posterior = prior(class) * product over features of P(feature value | class)

    is computed, and the class with the largest posterior is returned.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features, one column per feature.
    y_train : pandas.Series or array-like
        Training labels aligned row-by-row with ``X_train``.
    xtest : pandas.Series or mapping
        A single sample, indexable by the column names of ``X_train``.

    Returns
    -------
    scalar
        The predicted class label, i.e. an element of ``np.unique(y_train)``.
    """
    # The distinct target classes, in sorted order
    classes = np.unique(y_train)

    # Unnormalised posterior for each class, in the same order as `classes`
    post_probs = []

    for label in classes:

        # Naive independence assumption: the likelihood is the product of the
        # per-feature conditional probabilities.
        likelihood = 1.0

        for f in X_train.columns:
            likelihood *= cond_prob(X_train, y_train, f, xtest[f], label)

        post_probs.append(prior(y_train, label) * likelihood)

    # np.argmax yields a *position* in `classes`, not a label. Map it back so
    # the function also returns the right answer when the labels are not
    # exactly 0..k-1 (for labels 0..k-1 the result is unchanged).
    return classes[np.argmax(post_probs)]

**Time to test our classifier**

In [10]:
# First, sanity-check the classifier on a single test example.
# Note: the index below is a fixed position in X_test, not a randomly drawn one.

rand_example = 6

# Classify the chosen test row, passing the training split as the "model".
output = predict(X_train, y_train, X_test.iloc[rand_example])

# Show the prediction next to the label stored in y_test at the same position.
print("Naive Bayes Classifier predicts ", output)
print("Current Answer ", y_test.iloc[rand_example])

Naive Bayes Classifier predicts  1
Current Answer  1


**Check the Results on Each Test Data Point and Calculate the Accuracy**

In [11]:
def accuracy_score(X_train, y_train, X_test, y_test):
    """Return the fraction of test rows that ``predict`` classifies correctly.

    Note: this definition reuses the name of ``sklearn.metrics.accuracy_score``
    imported at the top of the notebook and therefore shadows it; the
    signature here is (training data, test data), not (y_true, y_pred).

    Parameters
    ----------
    X_train, y_train
        Training features and labels, forwarded unchanged to ``predict``.
    X_test : pandas.DataFrame
        Test features; rows are classified one at a time by position.
    y_test : pandas.Series or array-like
        True labels, compared element-wise with the predictions.

    Returns
    -------
    float
        Number of matching predictions divided by ``y_test.shape[0]``.
    """
    n_rows = X_test.shape[0]

    # One predict() call per test row, in positional order
    preds = np.array(
        [predict(X_train, y_train, X_test.iloc[row]) for row in range(n_rows)]
    )

    n_correct = np.sum(preds == y_test)

    return n_correct / y_test.shape[0]

**Print Accuracy Score**

In [12]:
# Evaluate on the whole test split with the accuracy_score function defined in this
# notebook. This can take a while: every test row triggers a fresh predict() call
# against the training data.
print("Accuracy Score for our classifier == ", accuracy_score(X_train, y_train, X_test, y_test))

Accuracy Score for our classifier ==  0.9958982772764561
