### Implementing the Naive Bayes Algorithm on the Tips Dataset

In [1]:
# importing basic libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# importing our dataset: the 'tips' sample dataset loaded through seaborn

df=sns.load_dataset('tips')

In [3]:
# exploring our dataset: preview the first rows to see the columns and their values

df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# dimensions of the raw dataset as (rows, columns)
df.shape

(244, 7)

We will be predicting the amount of tip from the given data.

In [5]:
# dtypes and non-null counts per column, to see which columns are categorical and need encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [6]:
# feature engineering ---> sex
# Replace the categorical 'sex' column with integer codes from a LabelEncoder.
# The column is popped out of the frame and written back, so it ends up as the last column.

from sklearn.preprocessing import LabelEncoder

le1 = LabelEncoder()
sex = le1.fit_transform(df.pop('sex'))
df['sex'] = sex
df.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,sex
0,16.99,1.01,No,Sun,Dinner,2,0
1,10.34,1.66,No,Sun,Dinner,3,1
2,21.01,3.5,No,Sun,Dinner,3,1
3,23.68,3.31,No,Sun,Dinner,2,1
4,24.59,3.61,No,Sun,Dinner,4,0


In [7]:
# inspect the categories of 'smoker' (and how often each occurs) before encoding the column
df['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [8]:
# feature engineering ---> smoker
# Replace the categorical 'smoker' column with integer codes from its own LabelEncoder.
# The column is popped out of the frame and written back, so it ends up as the last column.

le2 = LabelEncoder()
smoker = le2.fit_transform(df.pop('smoker'))
df['smoker'] = smoker
df.head()

Unnamed: 0,total_bill,tip,day,time,size,sex,smoker
0,16.99,1.01,Sun,Dinner,2,0,0
1,10.34,1.66,Sun,Dinner,3,1,0
2,21.01,3.5,Sun,Dinner,3,1,0
3,23.68,3.31,Sun,Dinner,2,1,0
4,24.59,3.61,Sun,Dinner,4,0,0


In [9]:
# inspect the categories of 'time' (and how often each occurs) before encoding the column
df['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [10]:
# feature engineering ---> time
# Replace the categorical 'time' column with integer codes.
# The column is fitted with its own encoder (le3): reusing le2 here would refit the
# smoker encoder on 'time', so le2.classes_ / le2.inverse_transform would no longer
# describe the smoker column.

le3=LabelEncoder()
time=le3.fit_transform(df['time'])
df.drop('time', axis=1, inplace=True)
df['time']=time
df.head()

Unnamed: 0,total_bill,tip,day,size,sex,smoker,time
0,16.99,1.01,Sun,2,0,0,0
1,10.34,1.66,Sun,3,1,0,0
2,21.01,3.5,Sun,3,1,0,0
3,23.68,3.31,Sun,2,1,0,0
4,24.59,3.61,Sun,4,0,0,0


In [11]:
# inspect the categories of 'day' (and how often each occurs) before encoding the column
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [12]:
# feature engineering ---> day
# one-hot encode 'day': get_dummies builds one indicator column per day category
# NOTE(review): every category keeps its own column; consider drop_first=True if the
# redundant indicator matters for the model being fitted
day=pd.get_dummies(df['day'])
type(day)

pandas.core.frame.DataFrame

In [13]:
# append the day indicator columns to the right of the existing columns
df = pd.concat([df, day], axis='columns')
df.head()

Unnamed: 0,total_bill,tip,day,size,sex,smoker,time,Thur,Fri,Sat,Sun
0,16.99,1.01,Sun,2,0,0,0,False,False,False,True
1,10.34,1.66,Sun,3,1,0,0,False,False,False,True
2,21.01,3.5,Sun,3,1,0,0,False,False,False,True
3,23.68,3.31,Sun,2,1,0,0,False,False,False,True
4,24.59,3.61,Sun,4,0,0,0,False,False,False,True


In [14]:
# feature engineering ---> day (continued)
# The indicator columns produced by get_dummies already hold 0/1 (False/True) values,
# so a plain integer cast is enough; a separate LabelEncoder per column is not needed.
# The original 'day' column is dropped because the indicators now carry that information.
# Column order is unchanged: the four indicators are already the last columns.
day_columns = ['Thur', 'Fri', 'Sat', 'Sun']
df = (
    df
    .drop(columns='day')
    .astype({column: 'int64' for column in day_columns})
)
df.head()

Unnamed: 0,total_bill,tip,size,sex,smoker,time,Thur,Fri,Sat,Sun
0,16.99,1.01,2,0,0,0,0,0,0,1
1,10.34,1.66,3,1,0,0,0,0,0,1
2,21.01,3.5,3,1,0,0,0,0,0,1
3,23.68,3.31,2,1,0,0,0,0,0,1
4,24.59,3.61,4,0,0,0,0,0,0,1


In [15]:
# confirm that every column is numeric after the encoding steps
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   size        244 non-null    int64  
 3   sex         244 non-null    int64  
 4   smoker      244 non-null    int64  
 5   time        244 non-null    int64  
 6   Thur        244 non-null    int64  
 7   Fri         244 non-null    int64  
 8   Sat         244 non-null    int64  
 9   Sun         244 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 19.2 KB


In [16]:
# NOTE(review): this repeats the df.info() call of the preceding cell; the cell can likely be removed
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   size        244 non-null    int64  
 3   sex         244 non-null    int64  
 4   smoker      244 non-null    int64  
 5   time        244 non-null    int64  
 6   Thur        244 non-null    int64  
 7   Fri         244 non-null    int64  
 8   Sat         244 non-null    int64  
 9   Sun         244 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 19.2 KB


In [25]:
# creating our ML model: a scikit-learn LinearRegression estimator (unfitted at this point)

from sklearn.linear_model import LinearRegression
linreg=LinearRegression()

In [21]:
# splitting our data into training and testing sets
# 'tip' is the target; every other column is a feature.
# 20% of the rows are held out for testing, with a fixed random_state for a repeatable split.

from sklearn.model_selection import train_test_split

y = df['tip']
X = df.drop(columns='tip')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# size of the training feature matrix as (rows, feature columns)
X_train.shape

(195, 9)

In [23]:
# size of the held-out test feature matrix as (rows, feature columns)
X_test.shape

(49, 9)

In [26]:
# fitting our model to the training data (only the training split is used here)

linreg.fit(X_train, y_train)

In [27]:
# making predictions for the held-out test features

y_preds=linreg.predict(X_test)

In [28]:
# scoring our model

# Raw view first: the true test tips followed by the model's predictions.
print(y_test)
print(y_preds)

# Printing the two vectors does not score anything by itself, so also report
# summary metrics on the test split.
residuals = y_test.to_numpy() - y_preds
print('R^2 :', linreg.score(X_test, y_test))        # coefficient of determination
print('MAE :', np.mean(np.abs(residuals)))          # mean absolute error, in tip units
print('RMSE:', np.sqrt(np.mean(residuals ** 2)))    # root mean squared error, in tip units

24     3.18
6      2.00
153    2.00
211    5.16
198    2.00
176    2.00
192    2.56
124    2.52
9      3.23
101    3.00
45     3.00
233    1.47
117    1.50
177    2.00
82     1.83
146    1.36
200    4.00
15     3.92
66     2.47
142    5.00
33     2.45
19     3.35
109    4.00
30     1.45
186    3.50
120    2.31
10     1.71
73     5.00
159    2.00
156    5.00
112    4.00
218    1.44
25     2.34
60     3.21
18     3.50
119    2.92
97     1.50
197    5.00
139    2.75
241    2.00
75     1.25
127    2.00
113    2.55
16     1.67
196    2.00
67     1.00
168    1.61
38     2.31
195    1.44
Name: tip, dtype: float64
[2.91436632 2.00292613 3.96425583 3.76380832 2.14836306 2.67423448
 3.63934628 2.29147245 2.57207155 2.45851225 2.90446763 2.0573337
 2.11817193 2.35130838 1.82976215 3.10830675 2.95140176 3.21602976
 2.56640892 5.73957295 3.43490366 3.22645102 2.17139823 1.94180002
 3.16394533 2.24547894 2.14497574 3.21025435 3.20097595 6.66803147
 5.01111235 1.57804024 3.1909877  2.76652194 2.98412