# Import necessary libraries¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the data

In [2]:
data = pd.read_csv(r"C:\Users\athir\data_sets\ML\Assg_community\tips.csv")
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
data.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [4]:
data = data[['total_bill', 'sex', 'smoker', 'day', 'time', 'size','tip']]
data.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,tip
0,16.99,Female,No,Sun,Dinner,2,1.01
1,10.34,Male,No,Sun,Dinner,3,1.66
2,21.01,Male,No,Sun,Dinner,3,3.5
3,23.68,Male,No,Sun,Dinner,2,3.31
4,24.59,Female,No,Sun,Dinner,4,3.61


### data cleaning, data wrangling, data preprocessing

In [5]:
data.isnull().sum()

total_bill    0
sex           0
smoker        0
day           0
time          0
size          0
tip           0
dtype: int64

# Create a Numerical pipeline for preprocessing all the numerical variables present in the data.

## step 1: Import all the libraries and functins needed for data preprocessing.

In [6]:
# missing value treatment

from sklearn.impute import SimpleImputer
sim_imp = SimpleImputer(strategy = 'median')
sim_imp

In [7]:
# feature scaling

from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms

## step 2: Mention the steps needed to perform automation and store it in the 'steps' variable.

In [8]:
num_steps = [('Missing value treatment',sim_imp),('Feature scaling',mms)]
num_steps

[('Missing value treatment', SimpleImputer(strategy='median')),
 ('Feature scaling', MinMaxScaler())]

## step 3: Import pipeline function from sklearn and create pipeline with the "steps" variable.

In [9]:
from sklearn.pipeline import Pipeline
num_pipe = Pipeline(num_steps)
num_pipe

In [10]:
data[['total_bill','size']] = num_pipe.fit_transform(data[['total_bill','size']])
data.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,tip
0,0.291579,Female,No,Sun,Dinner,0.2,1.01
1,0.152283,Male,No,Sun,Dinner,0.4,1.66
2,0.375786,Male,No,Sun,Dinner,0.4,3.5
3,0.431713,Male,No,Sun,Dinner,0.2,3.31
4,0.450775,Female,No,Sun,Dinner,0.6,3.61


In [11]:
dic = {'No':0,'Yes':1}
data['smoker'] = data['smoker'].replace(dic)
data.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,tip
0,0.291579,Female,0,Sun,Dinner,0.2,1.01
1,0.152283,Male,0,Sun,Dinner,0.4,1.66
2,0.375786,Male,0,Sun,Dinner,0.4,3.5
3,0.431713,Male,0,Sun,Dinner,0.2,3.31
4,0.450775,Female,0,Sun,Dinner,0.6,3.61


# Create a Categorical pipeline for preprocessing all the categorical variables in the data.

## step 1: Import all the libraries and functins needed for data preprocessing.

In [12]:
from sklearn.impute import SimpleImputer
si = SimpleImputer(strategy = 'most_frequent')
si

In [13]:
# Feature encoding

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe

## step 2: Mention the steps needed to perform automation and store it in the 'steps' variable.

In [14]:
cat_steps = [('Missing value treatment',si),('Feature encoding',ohe)]
cat_steps

[('Missing value treatment', SimpleImputer(strategy='most_frequent')),
 ('Feature encoding', OneHotEncoder())]

## step 3: Import pipeline function from sklearn and create pipeline with the "steps" variable.

In [15]:
from sklearn.pipeline import Pipeline
cat_pipe = Pipeline(cat_steps)
cat_pipe

In [16]:
data_ohe = cat_pipe.fit_transform(data[['sex']])
data_ohe

<244x2 sparse matrix of type '<class 'numpy.float64'>'
	with 244 stored elements in Compressed Sparse Row format>

In [17]:
data_ohe.toarray()

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.

In [18]:
data_ohe = pd.DataFrame(data_ohe.toarray(),columns = ['Female','Male'])
data_ohe

Unnamed: 0,Female,Male
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0
...,...,...
239,0.0,1.0
240,1.0,0.0
241,0.0,1.0
242,0.0,1.0


In [19]:
data1_ohe = cat_pipe.fit_transform(data[['day']])
data1_ohe


<244x4 sparse matrix of type '<class 'numpy.float64'>'
	with 244 stored elements in Compressed Sparse Row format>

In [20]:
data1_ohe.toarray()

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],


In [21]:
data

Unnamed: 0,total_bill,sex,smoker,day,time,size,tip
0,0.291579,Female,0,Sun,Dinner,0.2,1.01
1,0.152283,Male,0,Sun,Dinner,0.4,1.66
2,0.375786,Male,0,Sun,Dinner,0.4,3.50
3,0.431713,Male,0,Sun,Dinner,0.2,3.31
4,0.450775,Female,0,Sun,Dinner,0.6,3.61
...,...,...,...,...,...,...,...
239,0.543779,Male,0,Sat,Dinner,0.4,5.92
240,0.505027,Female,1,Sat,Dinner,0.2,2.00
241,0.410557,Male,1,Sat,Dinner,0.2,2.00
242,0.308965,Male,0,Sat,Dinner,0.2,1.75


In [22]:
data['day'].unique()

array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

In [23]:
data1_ohe = pd.DataFrame(data1_ohe.toarray(), columns = ['Fri','Sat','Sun','Thurs'])
data1_ohe

Unnamed: 0,Fri,Sat,Sun,Thurs
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
239,0.0,1.0,0.0,0.0
240,0.0,1.0,0.0,0.0
241,0.0,1.0,0.0,0.0
242,0.0,1.0,0.0,0.0


In [24]:
data2_ohe = cat_pipe.fit_transform(data[['time']])
data2_ohe

<244x2 sparse matrix of type '<class 'numpy.float64'>'
	with 244 stored elements in Compressed Sparse Row format>

In [25]:
data2_ohe.toarray()

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.

In [26]:
data2_ohe = pd.DataFrame(data2_ohe.toarray(), columns = ['Dinner','Lunch'])
data2_ohe


Unnamed: 0,Dinner,Lunch
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
239,1.0,0.0
240,1.0,0.0
241,1.0,0.0
242,1.0,0.0


In [27]:
data = pd.concat([data,data_ohe,data1_ohe,data2_ohe], axis =1)
data.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,tip,Female,Male,Fri,Sat,Sun,Thurs,Dinner,Lunch
0,0.291579,Female,0,Sun,Dinner,0.2,1.01,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.152283,Male,0,Sun,Dinner,0.4,1.66,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.375786,Male,0,Sun,Dinner,0.4,3.5,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.431713,Male,0,Sun,Dinner,0.2,3.31,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.450775,Female,0,Sun,Dinner,0.6,3.61,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [28]:
data = data.drop(['sex','day','time'],axis=1)
data.head()

Unnamed: 0,total_bill,smoker,size,tip,Female,Male,Fri,Sat,Sun,Thurs,Dinner,Lunch
0,0.291579,0,0.2,1.01,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.152283,0,0.4,1.66,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.375786,0,0.4,3.5,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.431713,0,0.2,3.31,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.450775,0,0.6,3.61,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


# separate the data into x and y

In [29]:
x = data.drop('tip',axis=1)
y = data['tip']

# split the data

In [30]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 0)

# Perform 10 Fold Cross Validation on the data

In [31]:
from sklearn.model_selection import KFold
kf = KFold(n_splits = 10)
kf

KFold(n_splits=10, random_state=None, shuffle=False)

In [32]:
kf.split(x,y)

<generator object _BaseKFold.split at 0x00000215522700B0>

In [33]:
list(kf.split(x,y))

[(array([ 25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
          38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
          51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
          64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
          77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
          90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
         103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
         116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
         129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
         142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
         155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
         168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,
         181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193,
         194, 195, 196, 197, 198, 199,

# Determine whether Linear Regression or KNeighborsRegressor suits better on the data.

## Apply linear regression on the data

In [34]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr

In [35]:
lr.fit(x_train,y_train)

In [36]:
y_pred_lr = lr.predict(x_test)
y_pred_lr

array([2.75976768, 2.99712626, 2.81673204, 1.4632311 , 3.12654885,
       1.77482578, 2.54679105, 3.11359077, 2.88681115, 4.50140326,
       3.18428659, 3.17091975, 2.3527022 , 2.29224593, 2.87517409,
       4.16944264, 1.92603957, 2.24308317, 2.3679374 , 3.30322842,
       3.79547836, 2.84290839, 2.5417152 , 2.43574813, 2.32853543,
       2.58333477, 2.76729999, 4.59978345, 3.73195325, 2.40199442,
       2.31295548, 2.27962689, 2.46828159, 1.92661159, 2.71399481,
       2.30185826, 2.71063603, 2.09555601, 5.60088215, 3.34295006,
       2.25147162, 2.27534617, 2.56519997, 4.33905752, 2.11239476,
       2.81768662, 2.60608807, 2.99566606, 2.72262204])

In [37]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred_lr)

0.5545031052534936

## Apply KNeighborsRegressor on the data

In [38]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()
knr

In [39]:
knr.fit(x_train,y_train)

In [40]:
y_pred_knr = knr.predict(x_test)
y_pred_knr

array([2.61 , 3.356, 3.218, 2.544, 2.956, 3.41 , 3.014, 2.536, 3.3  ,
       3.732, 3.6  , 2.494, 2.558, 3.286, 2.6  , 4.606, 1.576, 2.416,
       3.216, 3.2  , 2.754, 2.544, 3.004, 2.558, 3.004, 3.286, 2.2  ,
       3.456, 3.606, 2.544, 2.544, 2.012, 2.566, 2.496, 3.202, 2.426,
       2.734, 1.576, 5.766, 2.228, 2.2  , 2.012, 2.378, 4.312, 2.496,
       2.436, 2.858, 3.22 , 2.734])

In [41]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred_knr)

0.4565110890972297

- Linear regression is performing better on this data with 55.45% of accuracy.

# Apply 10 Fold Cross Validation and find out the best accuracy using linear Regression Algorithm

In [42]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr

In [43]:
from sklearn.model_selection import cross_val_score
cross_val_score(lr,x,y,cv=10)

array([ 0.57370317,  0.48785305,  0.64562012,  0.47289651,  0.09956561,
        0.64694796,  0.52541767, -0.80599458,  0.53212589,  0.05037821])

- 6th train-test set is giving the best accuracy of 64.69%.

# Apply 10 Fold Cross Validation and find out the best accuracy using K Neighbors Regression Algorithm.

In [44]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()
knr

In [45]:
from sklearn.model_selection import cross_val_score
cross_val_score(knr,x,y,cv=10)

array([ 0.6283269 ,  0.40282969,  0.74023892, -0.00532146, -0.00730875,
        0.28413942, -0.03697643, -0.49128019,  0.5030174 , -0.15520576])

- 3rd train-test set is giving the best accuracy of 74.02%.