In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [22]:
# Seperating dependent and independent features
X = df[['tip', 'sex', 'smoker', 'day', 'time', 'size']]
y= df['total_bill']

In [23]:
# Train tet split the data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.25, random_state=42)

In [24]:
# Feature encoding
# Feature encoding should be done after the train-test split, to avoid data leakage.
#If you encode before splitting, the encoder sees all data, including the test set. 
#This allows information from the test set to leak into the model training process — especially dangerous with encoders like:
#--LabelEncoder (if reused incorrectly)
#--OneHotEncoder (fit on all categories, including unseen ones)
#--TargetEncoder (uses target values — very leakage-prone)

#Label encoding for binary category
#Onehot encoding for multiple categories

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

le1 = LabelEncoder()
le2 = LabelEncoder()
le3 = LabelEncoder()
oe = OneHotEncoder()

In [25]:
X_train['sex'] = le1.fit_transform(X_train['sex'])
X_test['sex'] = le1.transform(X_test['sex'])
X_train['smoker'] = le2.fit_transform(X_train['smoker'])
X_test['smoker'] = le2.transform(X_test['smoker'])
X_train['time'] = le3.fit_transform(X_train['time'])
X_test['time'] = le3.transform(X_test['time'])

In [26]:
X_train.head()

Unnamed: 0,tip,sex,smoker,day,time,size
115,3.5,0,0,Sun,0,2
181,5.65,1,1,Sun,0,2
225,2.5,0,1,Fri,1,2
68,2.01,1,0,Sat,0,2
104,4.08,0,0,Sat,0,2


In [27]:
X_test.head()

Unnamed: 0,tip,sex,smoker,day,time,size
24,3.18,1,0,Sat,0,2
6,2.0,1,0,Sun,0,2
153,2.0,1,0,Sun,0,4
211,5.16,1,1,Sat,0,4
198,2.0,0,1,Thur,1,2


In [31]:
# Onehot encoding using --> column transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('onehot', OneHotEncoder(drop='first'),[3])], remainder='passthrough')

In [32]:
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

In [37]:
feature_names = ct.get_feature_names_out()
X_train = pd.DataFrame(X_train, columns=feature_names)
X_test = pd.DataFrame(X_test, columns=feature_names)

In [None]:
X_train

Unnamed: 0,onehot__day_Sat,onehot__day_Sun,onehot__day_Thur,remainder__tip,remainder__sex,remainder__smoker,remainder__time,remainder__size
0,0.0,1.0,0.0,3.50,0.0,0.0,0.0,2.0
1,0.0,1.0,0.0,5.65,1.0,1.0,0.0,2.0
2,0.0,0.0,0.0,2.50,0.0,1.0,1.0,2.0
3,1.0,0.0,0.0,2.01,1.0,0.0,0.0,2.0
4,1.0,0.0,0.0,4.08,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...
178,1.0,0.0,0.0,4.06,1.0,1.0,0.0,2.0
179,0.0,1.0,0.0,3.02,0.0,0.0,0.0,2.0
180,0.0,0.0,0.0,1.00,0.0,1.0,0.0,2.0
181,0.0,1.0,0.0,3.55,1.0,1.0,0.0,2.0


In [39]:
# Svr model
from sklearn.svm import SVR
svr = SVR()
# Train the model
svr.fit(X_train, y_train)
# Predict values
y_pred = svr.predict(X_test)

In [40]:
# Performance matrix
from sklearn.metrics import r2_score, mean_absolute_error
print(f'R2 score : {r2_score(y_test,y_pred)}')
print(f'Mean Absolute Error : {mean_absolute_error(y_test,y_pred)}')

R2 score : 0.49798620106004743
Mean Absolute Error : 4.463296539661224


In [77]:
# Hyperparameter tuning the model
from sklearn.model_selection import GridSearchCV

# Defining parameter range
param_grid = {
    'kernel': ['rbf'],
    'C': [0.1, 1, 10, 100, 1000],                # Regularization
    'gamma': [ 0.01, 0.1, 1, 0.001, 0.0001],  # Kernel coefficient (for rbf, poly, sigmoid)
}

In [86]:
grid = GridSearchCV(estimator=svr, param_grid=param_grid, n_jobs=-1, refit=True, verbose=3)
#fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [87]:
grid.best_params_

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [88]:
y_pred_grid = grid.predict(X_test)
print(f"Prediction values : {y_pred_grid}")

Prediction values : [16.71716271 13.20115569 19.94667001 32.16975629 14.13859308 15.17551686
 15.82613821 14.14901871 16.51539426 18.11781849 15.83953891 11.50346124
 11.45508531 15.17551686  9.06542812 14.4585073  23.23218662 18.69348775
 14.72749546 29.76010508 21.70017981 20.77293902 21.49712391 11.4520684
 22.61211314 13.65159866 12.52844121 25.42526341 19.94667001 32.20532061
 22.25394675 13.48466167 20.83481222 18.43664616 20.94950698 21.72561392
 14.3524912  29.45209862 14.82530036 14.88104682 10.95068634 12.71251844
 14.59145342 15.83186201 14.50200428  8.736755   13.91451889 17.3754107
 11.54542284 15.15915748 14.96035849 21.29384674 26.80823409 12.93409478
 17.37232098 12.85943974 24.15756384 12.24677687 18.51314531 21.57544073
 31.76098665]


In [89]:
# Performance matrix
print(f'R2 score : {r2_score(y_test,y_pred_grid)}')
print(f'Mean Absolute Error : {mean_absolute_error(y_test,y_pred_grid)}')

R2 score : 0.5630727637616448
Mean Absolute Error : 4.26048397387227


In [90]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

scaler = StandardScaler()
pipe = Pipeline([
    ('Scaler',scaler),
    ('svr',svr) 
])

In [91]:
pipe.fit(X_train, y_train)

In [92]:
y_pred_pipe = pipe.predict(X_test)

In [94]:
# Performance matrix
print(f'R2 score : {r2_score(y_test,y_pred_pipe)}')
print(f'Mean Absolute Error : {mean_absolute_error(y_test,y_pred_pipe)}')

R2 score : 0.36015700427347297
Mean Absolute Error : 4.9746606744212


In [95]:
grid_pipe = GridSearchCV(estimator=svr, param_grid=param_grid, n_jobs=-1, refit=True, verbose=3)
#fitting the model for grid search
grid_pipe.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [96]:
grid_pipe.best_params_

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [97]:
grid_pipe_pred = grid_pipe.predict(X_test)

In [None]:
print(f'R2 score : {r2_score(y_test,grid_pipe_pred)}')
print(f'Mean Absolute Error : {mean_absolute_error(y_test,grid_pipe_pred)}')

#Now, the model explains 56.3% of the variance in the target variable.
#It indicates that the model is doing a moderate job at explaining the data, but there’s still room for improvement.
#The average prediction error has dropped from 6.27 to 4.97 to 4.26.

R2 score : 0.5630727637616448
Mean Absolute Error : 4.26048397387227
