In [6]:
import seaborn as sns
import pandas as pd

In [None]:
# Load seaborn's bundled "tips" example dataset into the working DataFrame
df = sns.load_dataset(name="tips")

In [38]:
# Cut-off values used to turn each numeric column into a 0/1 indicator.
# total_bill and tip use fixed cut-offs; size uses the column mean.
threshold_total_bill = 20.0
threshold_tip = 3.0
threshold_size = df['size'].mean()

# A row gets 1 when its value is strictly greater than the cut-off, otherwise 0
df['total_bill_enc'] = df['total_bill'].gt(threshold_total_bill).astype(int)
df['tip_enc'] = df['tip'].gt(threshold_tip).astype(int)
df['size_enc'] = df['size'].gt(threshold_size).astype(int)

# Preview the first rows with the new indicator columns
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_encoded,time_encoded,smoker_encoded,total_bill_enc,tip_enc,size_enc
0,16.99,1.01,Female,No,Sun,Dinner,2,0,0,0,0,0,0
1,10.34,1.66,Male,No,Sun,Dinner,3,1,0,0,0,0,1
2,21.01,3.5,Male,No,Sun,Dinner,3,1,0,0,1,1,1
3,23.68,3.31,Male,No,Sun,Dinner,2,1,0,0,1,1,0
4,24.59,3.61,Female,No,Sun,Dinner,4,0,0,0,1,1,1


In [None]:
# Goal: predict smoker status from the other features (the target `y` is taken
# from `smoker_encoded` further down; `sex_encoded` is used as an input feature).
# Feature encoding
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column. Refitting a single LabelEncoder on each column in
# turn overwrites its `classes_`, so only the last column's label mapping would
# remain available for lookups or `inverse_transform`.
label_encoders = {column: LabelEncoder() for column in ('sex', 'time', 'smoker')}

df['sex_encoded'] = label_encoders['sex'].fit_transform(df['sex'])
df['time_encoded'] = label_encoders['time'].fit_transform(df['time'])
df['smoker_encoded'] = label_encoders['smoker'].fit_transform(df['smoker'])

# Backward-compatible name: as before, `le` ends up fitted on the `smoker` column
le = label_encoders['smoker']

# One-hot encode `day` as integer columns; drop_first=True omits the first
# category so it acts as the implicit baseline.
day_dummies = pd.get_dummies(df["day"], prefix="day", drop_first=True, dtype=int)

In [40]:
# Attach the one-hot `day` columns to the working frame.
# Any dummy columns left over from a previous run of this cell are removed first,
# so re-running the cell does not produce duplicate `day_*` columns.
df = pd.concat(
    [df.drop(columns=day_dummies.columns, errors='ignore'), day_dummies],
    axis=1,
)

In [44]:
# Bernoulli naive Bayes works on binary features, so the continuous and raw
# categorical columns are not needed as model inputs.
# NOTE: this is only a preview of the remaining columns. The result is not
# assigned, so `df` itself keeps every column; the model features are selected
# explicitly by name in a later cell.
df.drop(columns=['total_bill', 'tip', 'size', 'smoker', 'day', 'sex', 'time'])

Unnamed: 0,sex_encoded,time_encoded,smoker_encoded,total_bill_enc,tip_enc,size_enc,day_Fri,day_Sat,day_Sun
0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,1,0,0,1
2,1,0,0,1,1,1,0,0,1
3,1,0,0,1,1,0,0,0,1
4,0,0,0,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...
239,1,0,0,1,1,1,0,1,0
240,0,0,1,1,0,0,0,1,0
241,1,0,1,1,0,0,0,1,0
242,1,0,0,0,0,0,0,1,0


In [54]:
# Separate the independent features (X) from the dependent target (y).
# The target is the encoded smoker flag; every other binary column is an input.
X = df.loc[:, [
    'sex_encoded',
    'time_encoded',
    'total_bill_enc',
    'tip_enc',
    'size_enc',
    'day_Fri',
    'day_Sat',
    'day_Sun',
]]
y = df.loc[:, 'smoker_encoded']

In [57]:
# Display the feature matrix to confirm that only the binary columns were selected
X

Unnamed: 0,sex_encoded,time_encoded,total_bill_enc,tip_enc,size_enc,day_Fri,day_Sat,day_Sun
0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,1
2,1,0,1,1,1,0,0,1
3,1,0,1,1,0,0,0,1
4,0,0,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...
239,1,0,1,1,1,0,1,0
240,0,0,1,0,0,0,1,0
241,1,0,1,0,0,0,1,0
242,1,0,0,0,0,0,1,0


In [58]:
# Inspect the dtype of every column in the working frame (raw columns included,
# since the earlier drop was a preview and was never assigned back to `df`)
print(df.dtypes)

total_bill         float64
tip                float64
sex               category
smoker            category
day               category
time              category
size                 int64
sex_encoded          int64
time_encoded         int64
smoker_encoded       int64
total_bill_enc       int64
tip_enc              int64
size_enc             int64
day_Fri              int64
day_Sat              int64
day_Sun              int64
dtype: object


In [59]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [60]:
from sklearn.naive_bayes import BernoulliNB

# binarize=None: no thresholding is applied by the model, because the features
# were already converted to 0/1 columns in the earlier cells.
bnb = BernoulliNB(
    alpha=1.0,
    fit_prior=True,
    binarize=None,
)

In [61]:
# Train the model on the training split, then predict labels for the held-out rows
# (fit returns the fitted estimator, so the two steps can be chained)
y_pred = bnb.fit(X_train, y_train).predict(X_test)

print(f'The predicted values are as follows : {y_pred}')

The predicted values are as follows : [0 0 0 1 1 1 1 0 0 0 1 1 0 0 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0
 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1]


In [62]:
# Performance metrics on the held-out test split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Compute each metric once, then report it
bnb_accuracy = accuracy_score(y_test, y_pred)
bnb_report = classification_report(y_test, y_pred)
bnb_confusion = confusion_matrix(y_test, y_pred)

print(f"Accuracy score for the model : {bnb_accuracy}")
print(f"Classification report : \n{bnb_report}")
print(f"Confusion Matrix : \n{bnb_confusion}")

Accuracy score for the model : 0.6557377049180327
Classification report : 
              precision    recall  f1-score   support

           0       0.67      0.76      0.71        34
           1       0.64      0.52      0.57        27

    accuracy                           0.66        61
   macro avg       0.65      0.64      0.64        61
weighted avg       0.65      0.66      0.65        61

Confusion Matrix : 
[[26  8]
 [13 14]]
