# $$\color{red}{\text{Machine Learning}}$$

$$\color{orange}{\text{Naive Bayes Classifier}}$$

$$\color{lime}{\text{Alireza Javid - 810198375}}$$

## $\color{deepskyblue}{\text{Import Libraries}}$

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm

## $\color{deepskyblue}{\text{Preprocessing}}$
In this part we take a close look at our dataset. As we can see some of our data is missing and we have 'x' for value of them. We can drop them or decide to fill them with mean or mode of the data. We decide to drop this rows and avoid adding unnecessary bias to our model. 

In [2]:
df = pd.read_csv("Data\penguins.csv")
df.head()

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,39.1,18.7,181,3750
1,Adelie,39.5,17.4,186,3800
2,Adelie,40.3,18,195,3250
3,Adelie,x,x,x,x
4,Adelie,36.7,19.3,193,3450


In [3]:
df.drop(df[df["body_mass_g"] == 'x'].index,inplace=True)

## $\color{deepskyblue}{\text{Naive Bayes from Scratch}}$
Gaussian Naive Bayes is a powerful and efficient classification algorithm applicable to diverse tasks, that involves calculating prior probabilities for each class, fitting Gaussian distributions for each feature and class, calculating the likelihood of new data points belonging to each class, and selecting the class with the highest posterior probability as the final prediction, while adjusting algorithmic parameters to optimize performance.

In [4]:
prior = {}
for col in df.species.unique():
    prior[col] = df.groupby("species").count()["body_mass_g"][col] / df.shape[0]

To derive suitable normal distribution we need to calculate the mean and variance of each feature based on their class label.

In [5]:
def make_norm_map(df, colname, by='species'):
    norm_map = {}
    grouped = df.groupby(by)[colname]
    for species, group in grouped:
        mean = group.astype(float).mean()
        std = group.astype(float).std()
        norm_map[species] = norm(mean, std)
    return norm_map

In [6]:
maps = {}
maps['body_mass_g'] = make_norm_map(df, 'body_mass_g')
maps['flipper_length_mm'] = make_norm_map(df, 'flipper_length_mm')
maps['culmen_depth_mm'] = make_norm_map(df, 'culmen_depth_mm')
maps['culmen_length_mm'] = make_norm_map(df, 'culmen_length_mm')

Now it's time to compare posterior probabilities and make the decision.

In [7]:
def naive_decision(prior, data_seq, norm_maps, labels):
    posterior = {}
    for lable in labels:
        posterior[lable] = prior[lable]
        for c in maps.keys():
            posterior[lable] = posterior[lable] * norm_maps[c][lable].pdf(float(data_seq[c]))
    return max(posterior, key=lambda k: posterior.get(k))
    

In [8]:
df['Classification'] = np.nan
for i, row in df.iterrows():
    data_seq = row[maps.keys()]
    df.loc[i, 'Classification'] = naive_decision(prior, data_seq, maps, df.species.unique())

In [9]:
accuracy = df[df['Classification'] == df['species']].shape[0] / df.shape[0]
print("Accuracy: ", accuracy)

Accuracy:  0.9703264094955489


In [10]:
true_positives = {}
true_negatives = {}
false_positives = {}
false_negatives = {}

for lable in df.species.unique():
    true_positives[lable] = df[(df['Classification'] == lable) & (df['species'] == lable)].shape[0]
    true_negatives[lable] = df[(df['Classification'] != lable) & (df['species'] != lable)].shape[0]
    false_positives[lable] = df[(df['Classification'] == lable) & (df['species'] != lable)].shape[0]
    false_negatives[lable] = df[(df['Classification'] != lable) & (df['species'] == lable)].shape[0]

In [11]:
confusion_matrix = pd.DataFrame(np.array([[true_negatives['Adelie'], false_positives['Adelie']]
                                , [false_negatives['Adelie'], true_positives['Adelie']]]),
                                columns=['Predicted Negative', 'Predicted Positive'], index=['Actual Negative', 'Actual Positive'])

print(" Confusion Matrix for Adelie")
print(confusion_matrix)

precision = true_positives['Adelie'] / (true_positives['Adelie'] + false_positives['Adelie'])
recall = true_positives['Adelie'] / (true_positives['Adelie'] + false_negatives['Adelie'])

print("Precision: ", precision)
print("Recall: ", recall)

 Confusion Matrix for Adelie
                 Predicted Negative  Predicted Positive
Actual Negative                 182                   5
Actual Positive                   5                 145
Precision:  0.9666666666666667
Recall:  0.9666666666666667


In [12]:
confusion_matrix = pd.DataFrame(np.array([[true_negatives['Chinstrap'], false_positives['Chinstrap']]
                                , [false_negatives['Chinstrap'], true_positives['Chinstrap']]]),
                                columns=['Predicted Negative', 'Predicted Positive'], index=['Actual Negative', 'Actual Positive'])

print(" Confusion Matrix for Chinstrap")
print(confusion_matrix)

precision = true_positives['Chinstrap'] / (true_positives['Chinstrap'] + false_positives['Chinstrap'])
recall = true_positives['Chinstrap'] / (true_positives['Chinstrap'] + false_negatives['Chinstrap'])

print("Precision: ", precision)
print("Recall: ", recall)

 Confusion Matrix for Chinstrap
                 Predicted Negative  Predicted Positive
Actual Negative                 265                   5
Actual Positive                   5                  62
Precision:  0.9253731343283582
Recall:  0.9253731343283582


In [13]:
confusion_matrix = pd.DataFrame(np.array([[true_negatives['Gentoo'], false_positives['Gentoo']]
                                , [false_negatives['Gentoo'], true_positives['Gentoo']]]),
                                columns=['Predicted Negative', 'Predicted Positive'], index=['Actual Negative', 'Actual Positive'])

print(" Confusion Matrix for Gentoo")
print(confusion_matrix)

precision = true_positives['Gentoo'] / (true_positives['Gentoo'] + false_positives['Gentoo'])
recall = true_positives['Gentoo'] / (true_positives['Gentoo'] + false_negatives['Gentoo'])

print("Precision: ", precision)
print("Recall: ", recall)

 Confusion Matrix for Gentoo
                 Predicted Negative  Predicted Positive
Actual Negative                 217                   0
Actual Positive                   0                 120
Precision:  1.0
Recall:  1.0


## $\color{deepskyblue}{\text{Naive Bayes with Sklearn}}$

In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
df = df.drop(["Classification"],axis=1)
X = df.drop(["species"],axis=1)
y = df["species"]
naive_bayes_model = GaussianNB()
y_pred = naive_bayes_model.fit(X, y).predict(X)
cm = confusion_matrix(y, y_pred)
print(cm)

report = classification_report(y, y_pred, target_names=df.species.unique())
print("Classification Report:")
print(report)

[[145   5   0]
 [  5  62   0]
 [  0   0 120]]
Classification Report:
              precision    recall  f1-score   support

      Adelie       0.97      0.97      0.97       150
   Chinstrap       0.93      0.93      0.93        67
      Gentoo       1.00      1.00      1.00       120

    accuracy                           0.97       337
   macro avg       0.96      0.96      0.96       337
weighted avg       0.97      0.97      0.97       337



In [15]:
for i, species in enumerate(df.species.unique()):
    tp = cm[i][i]
    tn = sum([sum(row) - row[i] for j, row in enumerate(cm) if j != i])
    fp = sum(cm[:, i]) - tp
    fn = sum(cm[i]) - tp
    confusion_matrix = pd.DataFrame(np.array([[tp, fp]
                                , [fn, tp]]),
                                columns=['Predicted Negative', 'Predicted Positive'], index=['Actual Negative', 'Actual Positive'])

    print(" Confusion Matrix for " + species)
    print(confusion_matrix)

 Confusion Matrix for Adelie
                 Predicted Negative  Predicted Positive
Actual Negative                 145                   5
Actual Positive                   5                 145
 Confusion Matrix for Chinstrap
                 Predicted Negative  Predicted Positive
Actual Negative                  62                   5
Actual Positive                   5                  62
 Confusion Matrix for Gentoo
                 Predicted Negative  Predicted Positive
Actual Negative                 120                   0
Actual Positive                   0                 120
