In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the "tips" example dataset through seaborn's dataset loader.
df = sns.load_dataset(name="tips")

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Frequency of each category in the "sex" column.
df.loc[:, "sex"].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [5]:
# Frequency of each category in the "smoker" column.
df.loc[:, "smoker"].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [6]:
# Frequency of each category in the "day" column.
df.loc[:, "day"].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [7]:
# Frequency of each category in the "time" column.
df.loc[:, "time"].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [9]:
# Separate target and features by position: the first column is the
# regression target, every remaining column is a feature.
y = df[df.columns[0]]
X = df[df.columns[1:]]

In [10]:
# Preview the feature frame.
X.head(n=5)

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,Female,No,Sun,Dinner,2
1,1.66,Male,No,Sun,Dinner,3
2,3.5,Male,No,Sun,Dinner,3
3,3.31,Male,No,Sun,Dinner,2
4,3.61,Female,No,Sun,Dinner,4


In [11]:
# Preview the target series.
y.head(n=5)

0    16.99
1    10.34
2    21.01
3    23.68
4    24.59
Name: total_bill, dtype: float64

In [12]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [13]:
# Preview the training features before any encoding.
X_train.head(n=5)

Unnamed: 0,tip,sex,smoker,day,time,size
115,3.5,Female,No,Sun,Dinner,2
181,5.65,Male,Yes,Sun,Dinner,2
225,2.5,Female,Yes,Fri,Lunch,2
68,2.01,Male,No,Sat,Dinner,2
104,4.08,Female,No,Sat,Dinner,2


In [14]:
# Integer-encode the two-valued columns. One encoder is kept per column so the
# mapping learned on the training split can be reused on the test split.
from sklearn.preprocessing import LabelEncoder

le1, le2, le3 = LabelEncoder(), LabelEncoder(), LabelEncoder()

for column, encoder in (("sex", le1), ("smoker", le2), ("time", le3)):
    X_train[column] = encoder.fit_transform(X_train[column])

In [15]:
# Check the label-encoded training columns.
X_train.head(n=5)

Unnamed: 0,tip,sex,smoker,day,time,size
115,3.5,0,0,Sun,0,2
181,5.65,1,1,Sun,0,2
225,2.5,0,1,Fri,1,2
68,2.01,1,0,Sat,0,2
104,4.08,0,0,Sat,0,2


In [16]:
# Encode the test split with the encoders fitted on the training split
# (transform only, so the category-to-integer mapping is not re-learned).
for column, encoder in (("sex", le1), ("smoker", le2), ("time", le3)):
    X_test[column] = encoder.transform(X_test[column])

In [17]:
# Check the label-encoded test columns.
X_test.head(n=5)

Unnamed: 0,tip,sex,smoker,day,time,size
24,3.18,1,0,Sat,0,2
6,2.0,1,0,Sun,0,2
153,2.0,1,0,Sun,0,4
211,5.16,1,1,Sat,0,4
198,2.0,0,1,Thur,1,2


In [18]:
# One-hot encode the "day" feature and standardise the numeric features.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Columns are selected by name rather than by position, so the transformer
# keeps targeting the intended features even if the column order changes.
# The columns not listed ("sex", "smoker", "time") are passed through as-is.
# sparse_threshold=0 forces a dense array: the result is wrapped in a
# pandas DataFrame afterwards, which needs a dense 2-D array.
ct = ColumnTransformer(
    [
        ("onehot", OneHotEncoder(drop="first"), ["day"]),
        ("scale", StandardScaler(), ["tip", "size"]),
    ],
    remainder="passthrough",
    sparse_threshold=0,
)

# Fit on the training split only; the fitted transformer is reused for the test split.
encoded_df = ct.fit_transform(X_train)

In [19]:
# Wrap the encoded array in a DataFrame labelled with the transformer's output names.
encoded_columns = ct.get_feature_names_out()
X_train = pd.DataFrame(data=encoded_df, columns=encoded_columns)

In [20]:
# Check the fully encoded and scaled training features.
X_train.head(n=5)

Unnamed: 0,onehot__day_Sat,onehot__day_Sun,onehot__day_Thur,scale__tip,scale__size,remainder__sex,remainder__smoker,remainder__time
0,0.0,1.0,0.0,0.292101,-0.612185,0.0,0.0,0.0
1,0.0,1.0,0.0,1.785634,-0.612185,1.0,1.0,0.0
2,0.0,0.0,0.0,-0.402565,-0.612185,0.0,1.0,1.0
3,1.0,0.0,0.0,-0.742951,-0.612185,1.0,0.0,0.0
4,1.0,0.0,0.0,0.695008,-0.612185,0.0,0.0,0.0


In [21]:
# Apply the already-fitted transformer to the test split (transform only, no refit),
# then rebuild a labelled DataFrame and preview it.
encoded_test = ct.transform(X_test)
X_test = pd.DataFrame(data=encoded_test, columns=ct.get_feature_names_out())
X_test.head(n=5)

Unnamed: 0,onehot__day_Sat,onehot__day_Sun,onehot__day_Thur,scale__tip,scale__size,remainder__sex,remainder__smoker,remainder__time
0,1.0,0.0,0.0,0.069808,-0.612185,1.0,0.0,0.0
1,0.0,1.0,0.0,-0.749898,-0.612185,1.0,0.0,0.0
2,0.0,1.0,0.0,-0.749898,1.521717,1.0,0.0,0.0
3,1.0,0.0,0.0,1.445247,1.521717,1.0,1.0,0.0
4,0.0,0.0,1.0,-0.749898,-0.612185,0.0,1.0,1.0


In [22]:
# Baseline model: support vector regression with a linear kernel.
from sklearn.svm import SVR

# fit() returns the estimator itself, so construction and training are chained.
svr = SVR(kernel="linear").fit(X_train, y_train)
y_pred = svr.predict(X_test)
print(y_pred)

[17.40006905 13.15788635 19.66967858 31.51988409 14.14176821 14.60027069
 15.98959515 14.31864246 16.9880557  18.36899211 16.27184517 12.07519946
 11.14240446 14.60027069  8.91411476 13.96234634 23.72959197 19.13668729
 15.08514828 31.91295869 21.53466134 21.18533817 21.29188962 12.01292029
 22.42309503 13.76872111 12.25483829 24.40584844 19.66967858 35.52334728
 22.64170011 13.42416504 21.29613587 18.93587215 20.9807107  22.07601822
 13.80206388 29.99543691 15.03485299 15.16798198 11.39012852 12.69938387
 14.8705637  15.28216605 14.24577821  8.69411704 13.84952803 17.94682099
 11.05957694 15.96764799 15.26140074 21.58696094 27.45457564 12.89681264
 17.85996417 12.80339388 24.40924098 12.94710793 18.71811199 21.53783692
 32.59939539]


In [25]:
# Score the baseline predictions against the held-out targets.
from sklearn.metrics import mean_absolute_error,r2_score

mae = mean_absolute_error(y_test, y_pred)
r2_sc = r2_score(y_test, y_pred)
print(mae, r2_sc, sep="\n")

4.112940834228527
0.6055051645448506


In [26]:
# Hyperparameter tuning: define the estimator and the search space.
from sklearn.model_selection import GridSearchCV

model = SVR()

# 'precomputed' is deliberately left out: that kernel expects X to be a square
# (n_samples, n_samples) kernel matrix, whereas X_train is an ordinary feature
# table, so every fit using it would fail.
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
# gamma is the kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
gamma = [10, 5, 1, 0.1, 0.01, 0.001]
# C is the regularisation strength (larger C = weaker regularisation).
c_vals = [100, 50, 10, 5, 1, 0.1]

# Full Cartesian grid over kernel x gamma x C.
params = dict(kernel=kernel, gamma=gamma, C=c_vals)

In [27]:
# 5-fold cross-validated grid search over `params`, ranking candidates by R^2.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring="r2",
    cv=5,
)

In [None]:
# Run the grid search on the training split, then predict on the test split.
# fit() returns the search object itself, so the two calls are chained.
y_pred2 = grid.fit(X_train, y_train).predict(X_test)