In [80]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split 
from sklearn.linear_model import Ridge, LinearRegression 
from sklearn.metrics import mean_squared_error 
from sklearn.tree import DecisionTreeRegressor

import category_encoders as ce

In [81]:
# Load seaborn's "tips" example dataset and preview the first five rows.
tips = sns.load_dataset(name='tips')
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [82]:
# Isolate the two categorical columns that are one-hot encoded in the next cell.
tips_encode = tips.loc[:, ['sex', 'smoker']]
tips_encode

Unnamed: 0,sex,smoker
0,Female,No
1,Male,No
2,Male,No
3,Male,No
4,Female,No
...,...,...
239,Male,No
240,Female,Yes
241,Male,Yes
242,Male,No


In [83]:
# One-hot encode the selected categorical columns, then cast the indicator
# columns to integers so they show up as 0/1.
tips_encode = tips_encode.pipe(pd.get_dummies).astype('int')
tips_encode

Unnamed: 0,sex_Male,sex_Female,smoker_Yes,smoker_No
0,0,1,0,1
1,1,0,0,1
2,1,0,0,1
3,1,0,0,1
4,0,1,0,1
...,...,...,...,...
239,1,0,0,1
240,0,1,1,0
241,1,0,1,0
242,1,0,0,1


In [84]:
# Columns that are not one-hot encoded: the numeric ones plus 'day' and
# 'time', which still hold their raw labels at this point.
tips_not_encode = tips.loc[
    :, ['total_bill', 'tip', 'day', 'size', 'time']
]
tips_not_encode

Unnamed: 0,total_bill,tip,day,size,time
0,16.99,1.01,Sun,2,Dinner
1,10.34,1.66,Sun,3,Dinner
2,21.01,3.50,Sun,3,Dinner
3,23.68,3.31,Sun,2,Dinner
4,24.59,3.61,Sun,4,Dinner
...,...,...,...,...,...
239,29.03,5.92,Sat,3,Dinner
240,27.18,2.00,Sat,2,Dinner
241,22.67,2.00,Sat,2,Dinner
242,17.82,1.75,Sat,2,Dinner


In [85]:
# Put the untouched columns and the 0/1 indicator columns back together,
# side by side (rows are aligned on the index).
tips = pd.concat(objs=[tips_not_encode, tips_encode], axis='columns')
tips

Unnamed: 0,total_bill,tip,day,size,time,sex_Male,sex_Female,smoker_Yes,smoker_No
0,16.99,1.01,Sun,2,Dinner,0,1,0,1
1,10.34,1.66,Sun,3,Dinner,1,0,0,1
2,21.01,3.50,Sun,3,Dinner,1,0,0,1
3,23.68,3.31,Sun,2,Dinner,1,0,0,1
4,24.59,3.61,Sun,4,Dinner,0,1,0,1
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Sat,3,Dinner,1,0,0,1
240,27.18,2.00,Sat,2,Dinner,0,1,1,0
241,22.67,2.00,Sat,2,Dinner,1,0,1,0
242,17.82,1.75,Sat,2,Dinner,1,0,0,1


In [86]:
# Binary-encode the two remaining label columns, 'time' and 'day'. Each one is
# expanded into <name>_0, <name>_1, ... bit columns (the time_* / day_*
# columns shown in this cell's output).
# The encoder has to be created here under the name that is actually used:
# relying on a `binary_encoder` left over from an earlier kernel session
# raises NameError after Restart & Run All.
# Note: despite its name, df_day holds the encoded columns for both
# 'time' and 'day'.
binary_encoder = ce.BinaryEncoder(cols=['time', 'day'])
df_day = binary_encoder.fit_transform(tips[['time', 'day']])
df_day

Unnamed: 0,time_0,time_1,day_0,day_1,day_2
0,0,1,0,0,1
1,0,1,0,0,1
2,0,1,0,0,1
3,0,1,0,0,1
4,0,1,0,0,1
...,...,...,...,...,...
239,0,1,0,1,0
240,0,1,0,1,0
241,0,1,0,1,0
242,0,1,0,1,0


In [87]:
# Remove the raw 'day' labels; their encoded form is kept in df_day.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
tips = tips.drop(columns='day', errors='ignore')
tips

Unnamed: 0,total_bill,tip,size,time,sex_Male,sex_Female,smoker_Yes,smoker_No
0,16.99,1.01,2,Dinner,0,1,0,1
1,10.34,1.66,3,Dinner,1,0,0,1
2,21.01,3.50,3,Dinner,1,0,0,1
3,23.68,3.31,2,Dinner,1,0,0,1
4,24.59,3.61,4,Dinner,0,1,0,1
...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,Dinner,1,0,0,1
240,27.18,2.00,2,Dinner,0,1,1,0
241,22.67,2.00,2,Dinner,1,0,1,0
242,17.82,1.75,2,Dinner,1,0,0,1


In [88]:
# Remove the raw 'time' labels; their encoded form is kept in df_day.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
tips = tips.drop(columns='time', errors='ignore')
tips

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No
0,16.99,1.01,2,0,1,0,1
1,10.34,1.66,3,1,0,0,1
2,21.01,3.50,3,1,0,0,1
3,23.68,3.31,2,1,0,0,1
4,24.59,3.61,4,0,1,0,1
...,...,...,...,...,...,...,...
239,29.03,5.92,3,1,0,0,1
240,27.18,2.00,2,0,1,1,0
241,22.67,2.00,2,1,0,1,0
242,17.82,1.75,2,1,0,0,1


In [89]:
# Append the encoded time_* / day_* columns from df_day.
# Any copies left behind by a previous run of this cell are dropped first, so
# re-running it does not duplicate the encoded columns. On a first run the
# drop is a no-op and the result is the plain side-by-side concatenation.
tips = pd.concat(
    [tips.drop(columns=df_day.columns, errors='ignore'), df_day],
    axis=1,
)
tips

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,time_0,time_1,day_0,day_1,day_2
0,16.99,1.01,2,0,1,0,1,0,1,0,0,1
1,10.34,1.66,3,1,0,0,1,0,1,0,0,1
2,21.01,3.50,3,1,0,0,1,0,1,0,0,1
3,23.68,3.31,2,1,0,0,1,0,1,0,0,1
4,24.59,3.61,4,0,1,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,1,0,0,1,0,1,0,1,0
240,27.18,2.00,2,0,1,1,0,0,1,0,1,0
241,22.67,2.00,2,1,0,1,0,0,1,0,1,0
242,17.82,1.75,2,1,0,0,1,0,1,0,1,0


In [90]:
# The target is the 'tip' column; every other column is used as a feature.
y = tips.loc[:, 'tip']
X = tips.drop('tip', axis='columns')

In [91]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [92]:
# Fit a depth-3 regression tree on the training split and report the mean
# squared error on the held-out split.
# random_state is pinned because scikit-learn randomly permutes the features
# at every split; without a seed, ties between equally good splits can be
# resolved differently from run to run, so the score would not be guaranteed
# to reproduce.
Tree = DecisionTreeRegressor(max_depth=3, random_state=10)
Tree.fit(X_train, y_train)

y_pred = Tree.predict(X_test)
mean_squared_error(y_test, y_pred)

0.8292584375206998