# Gaussian Naive Bayes Classification


## Iris Dataset Exploration


### Import Required Libraries


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


### Loading and Understanding Dataset

In [3]:
from sklearn.datasets import load_iris


### Feature Matrix (X) and Target Vector (y)


In [4]:
X,y=load_iris(return_X_y=True)

In [5]:
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

### Split Data into Training and Testing Sets


In [7]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

### Import Gaussian Naive Bayes Classifier


In [10]:
from sklearn.naive_bayes import GaussianNB

### Model Training

In [11]:
gnb=GaussianNB()

In [12]:
gnb.fit(X_train,y_train)

### Model Prediction

In [13]:
y_pred=gnb.predict(X_test)
y_pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 2, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0])

### Model Evaluation Metrics

In [20]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

print("Accuracy score:",accuracy_score(y_pred,y_test))
print("Confusion matrix:",confusion_matrix(y_pred,y_test))
print("Classification report:",classification_report(y_pred,y_test))


Accuracy score: 0.9777777777777777
Confusion matrix: [[19  0  0]
 [ 0 12  0]
 [ 0  1 13]]
Classification report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.92      1.00      0.96        12
           2       1.00      0.93      0.96        14

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.97        45
weighted avg       0.98      0.98      0.98        45



## Tips Dataset Exploration


In [23]:
import seaborn as sns

df=sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [24]:
df['day'].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

### Encode Categorical Features


In [26]:
from sklearn.preprocessing import LabelEncoder

le1=LabelEncoder()
le2=LabelEncoder()
le3=LabelEncoder()


In [27]:
df['sex']=le1.fit_transform(df['sex'])
df['smoker']=le2.fit_transform(df['smoker'])


In [35]:
le1.classes_

array(['Female', 'Male'], dtype=object)

In [33]:
df = pd.get_dummies(df, columns=['day'], drop_first=True)
df

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Fri,day_Sat,day_Sun
0,16.99,1.01,0,0,Dinner,2,False,False,True
1,10.34,1.66,1,0,Dinner,3,False,False,True
2,21.01,3.50,1,0,Dinner,3,False,False,True
3,23.68,3.31,1,0,Dinner,2,False,False,True
4,24.59,3.61,0,0,Dinner,4,False,False,True
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,Dinner,3,False,True,False
240,27.18,2.00,0,1,Dinner,2,False,True,False
241,22.67,2.00,1,1,Dinner,2,False,True,False
242,17.82,1.75,1,0,Dinner,2,False,True,False


In [34]:
day_cols = ['day_Fri', 'day_Sat', 'day_Sun']
df[day_cols] = df[day_cols].astype(int)

df

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Fri,day_Sat,day_Sun
0,16.99,1.01,0,0,Dinner,2,0,0,1
1,10.34,1.66,1,0,Dinner,3,0,0,1
2,21.01,3.50,1,0,Dinner,3,0,0,1
3,23.68,3.31,1,0,Dinner,2,0,0,1
4,24.59,3.61,0,0,Dinner,4,0,0,1
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,Dinner,3,0,1,0
240,27.18,2.00,0,1,Dinner,2,0,1,0
241,22.67,2.00,1,1,Dinner,2,0,1,0
242,17.82,1.75,1,0,Dinner,2,0,1,0


In [36]:
df['time'].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [37]:
df['time']=df['time'].map({'Dinner':0,'Lunch':1})
df

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Fri,day_Sat,day_Sun
0,16.99,1.01,0,0,0,2,0,0,1
1,10.34,1.66,1,0,0,3,0,0,1
2,21.01,3.50,1,0,0,3,0,0,1
3,23.68,3.31,1,0,0,2,0,0,1
4,24.59,3.61,0,0,0,4,0,0,1
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,0,3,0,1,0
240,27.18,2.00,0,1,0,2,0,1,0
241,22.67,2.00,1,1,0,2,0,1,0
242,17.82,1.75,1,0,0,2,0,1,0


In [38]:
X=df.drop('time',axis=1)
y=df['time']

In [39]:
X

Unnamed: 0,total_bill,tip,sex,smoker,size,day_Fri,day_Sat,day_Sun
0,16.99,1.01,0,0,2,0,0,1
1,10.34,1.66,1,0,3,0,0,1
2,21.01,3.50,1,0,3,0,0,1
3,23.68,3.31,1,0,2,0,0,1
4,24.59,3.61,0,0,4,0,0,1
...,...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,3,0,1,0
240,27.18,2.00,0,1,2,0,1,0
241,22.67,2.00,1,1,2,0,1,0
242,17.82,1.75,1,0,2,0,1,0


### Train Gaussian Naive Bayes on Tips Dataset


In [40]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [41]:
from sklearn.naive_bayes import GaussianNB

gnb=GaussianNB()

In [42]:
gnb.fit(X_train,y_train)

In [43]:
y_pred=gnb.predict(X_test)
y_pred

array([0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0])

In [44]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

print("Accuracy score:",accuracy_score(y_pred,y_test))
print("Confusion matrix:",confusion_matrix(y_pred,y_test))
print("Classification report:",classification_report(y_pred,y_test))

Accuracy score: 0.9594594594594594
Confusion matrix: [[52  0]
 [ 3 19]]
Classification report:               precision    recall  f1-score   support

           0       0.95      1.00      0.97        52
           1       1.00      0.86      0.93        22

    accuracy                           0.96        74
   macro avg       0.97      0.93      0.95        74
weighted avg       0.96      0.96      0.96        74



### Prediction on New Input Data


In [62]:
new_total_bill=17.73
tip=3.00
sex='Female'
smoker='Yes'
size=2
day='Sat'

sex_enc = le1.transform([sex])[0]       # Female → 0
smoker_enc = le2.transform([smoker])[0] # Yes → 1

In [63]:
le1.classes_


array(['Female', 'Male'], dtype=object)

In [64]:
le2.classes_

array(['No', 'Yes'], dtype=object)

In [65]:
day_Fri = 0
day_Sat = 1
day_Sun = 0

In [66]:
X_new = [[
    17.73,   # total_bill
    3.00,    # tip
    0,       # sex (Female)
    1,       # smoker (Yes)
    2,       # size
    0,       # day_Fri
    1,       # day_Sat
    0        # day_Sun
]]


### Final Prediction Result


dinner->0
lunch->1

In [67]:
pred =gnb.predict(X_new)

print("predicted values:",pred[0])

predicted values: 0


