In [40]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split # data splitting
from sklearn.linear_model import Ridge, LinearRegression # ML model
from sklearn.metrics import mean_squared_error # evaluation
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error, r2_score

In [41]:
tips = sns.load_dataset('tips')

In [42]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [43]:
tips_encode = tips[['sex', 'smoker', 'time']]
tips_encode

Unnamed: 0,sex,smoker,time
0,Female,No,Dinner
1,Male,No,Dinner
2,Male,No,Dinner
3,Male,No,Dinner
4,Female,No,Dinner
...,...,...,...
239,Male,No,Dinner
240,Female,Yes,Dinner
241,Male,Yes,Dinner
242,Male,No,Dinner


In [44]:
tips_encode = pd.get_dummies(tips_encode).astype('int')
tips_encode

Unnamed: 0,sex_Male,sex_Female,smoker_Yes,smoker_No,time_Lunch,time_Dinner
0,0,1,0,1,0,1
1,1,0,0,1,0,1
2,1,0,0,1,0,1
3,1,0,0,1,0,1
4,0,1,0,1,0,1
...,...,...,...,...,...,...
239,1,0,0,1,0,1
240,0,1,1,0,0,1
241,1,0,1,0,0,1
242,1,0,0,1,0,1


In [45]:
tips_decode = tips[['total_bill', 'tip', 'day', 'size']]
tips_decode

Unnamed: 0,total_bill,tip,day,size
0,16.99,1.01,Sun,2
1,10.34,1.66,Sun,3
2,21.01,3.50,Sun,3
3,23.68,3.31,Sun,2
4,24.59,3.61,Sun,4
...,...,...,...,...
239,29.03,5.92,Sat,3
240,27.18,2.00,Sat,2
241,22.67,2.00,Sat,2
242,17.82,1.75,Sat,2


In [47]:
tips = pd.concat([tips_decode, tips_encode], axis = 1)
tips

Unnamed: 0,total_bill,tip,day,size,sex_Male,sex_Female,smoker_Yes,smoker_No,time_Lunch,time_Dinner
0,16.99,1.01,Sun,2,0,1,0,1,0,1
1,10.34,1.66,Sun,3,1,0,0,1,0,1
2,21.01,3.50,Sun,3,1,0,0,1,0,1
3,23.68,3.31,Sun,2,1,0,0,1,0,1
4,24.59,3.61,Sun,4,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Sat,3,1,0,0,1,0,1
240,27.18,2.00,Sat,2,0,1,1,0,0,1
241,22.67,2.00,Sat,2,1,0,1,0,0,1
242,17.82,1.75,Sat,2,1,0,0,1,0,1


In [48]:
tips['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [11]:
import category_encoders as ce

In [12]:
binary_encoder = ce.BinaryEncoder(cols = ['day'])
df_day = binary_encoder.fit_transform(tips['day'])
df_day

Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
239,0,1,0
240,0,1,0
241,0,1,0
242,0,1,0


In [13]:
tips.drop(columns = 'day', inplace = True)
tips

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,time_Lunch,time_Dinner
0,16.99,1.01,2,False,True,False,True,False,True
1,10.34,1.66,3,True,False,False,True,False,True
2,21.01,3.50,3,True,False,False,True,False,True
3,23.68,3.31,2,True,False,False,True,False,True
4,24.59,3.61,4,False,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,True,False,False,True,False,True
240,27.18,2.00,2,False,True,True,False,False,True
241,22.67,2.00,2,True,False,True,False,False,True
242,17.82,1.75,2,True,False,False,True,False,True


In [14]:
tips = pd.concat([tips, df_day], axis = 1)

In [15]:
tips = pd.concat([tips, df_day], axis = 1)
tips

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,time_Lunch,time_Dinner,day_0,day_1,day_2,day_0.1,day_1.1,day_2.1
0,16.99,1.01,2,False,True,False,True,False,True,0,0,1,0,0,1
1,10.34,1.66,3,True,False,False,True,False,True,0,0,1,0,0,1
2,21.01,3.50,3,True,False,False,True,False,True,0,0,1,0,0,1
3,23.68,3.31,2,True,False,False,True,False,True,0,0,1,0,0,1
4,24.59,3.61,4,False,True,False,True,False,True,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,True,False,False,True,False,True,0,1,0,0,1,0
240,27.18,2.00,2,False,True,True,False,False,True,0,1,0,0,1,0
241,22.67,2.00,2,True,False,True,False,False,True,0,1,0,0,1,0
242,17.82,1.75,2,True,False,False,True,False,True,0,1,0,0,1,0


In [16]:
robust = RobustScaler()

In [17]:
tips['total_bill'] = robust.fit_transform(tips[['total_bill']])

In [18]:
tips

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,time_Lunch,time_Dinner,day_0,day_1,day_2,day_0.1,day_1.1,day_2.1
0,-0.074675,1.01,2,False,True,False,True,False,True,0,0,1,0,0,1
1,-0.691558,1.66,3,True,False,False,True,False,True,0,0,1,0,0,1
2,0.298237,3.50,3,True,False,False,True,False,True,0,0,1,0,0,1
3,0.545918,3.31,2,True,False,False,True,False,True,0,0,1,0,0,1
4,0.630334,3.61,4,False,True,False,True,False,True,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,1.042208,5.92,3,True,False,False,True,False,True,0,1,0,0,1,0
240,0.870594,2.00,2,False,True,True,False,False,True,0,1,0,0,1,0
241,0.452226,2.00,2,True,False,True,False,False,True,0,1,0,0,1,0
242,0.002319,1.75,2,True,False,False,True,False,True,0,1,0,0,1,0


In [19]:
tips['total_bill'].describe()

count    2.440000e+02
mean     1.846886e-01
std      8.258267e-01
min     -1.365955e+00
25%     -4.125696e-01
50%     -1.647987e-16
75%      5.874304e-01
max      3.062616e+00
Name: total_bill, dtype: float64

In [20]:
X = tips.drop(columns = 'tip')
y = tips['tip']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size = 0.3)

In [22]:
ridge = Ridge()
ridge.fit(X_train, y_train)

y_pred = ridge.predict(X_test)
mean_squared_error(y_test, y_pred)

1.0296326585503868

In [23]:
tips_nonproc = sns.load_dataset('tips')
tips_nonproc

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [24]:
X = tips_nonproc[['size']]
y = tips_nonproc['total_bill']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size = 0.3)

In [28]:
# Create and fit the decision tree model
tree_model = DecisionTreeRegressor(max_depth=3)
tree_model.fit(X_train, y_train)

In [29]:
y_pred = tree_model.predict(X_test)

In [30]:
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 59.76020560034255


In [33]:
model = DecisionTreeRegressor(max_depth=3, random_state=10)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared (R2) Score: {r2}")

Mean Squared Error: 59.76020560034255
R-squared (R2) Score: 0.33200263208067504
