# Implementation Support Vector Machine For Regression Problem

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Import Dataset

In [2]:
# Dataset (tips dataset)
df = sns.load_dataset('tips')

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


* Our Goal it's about build a model that predict `total_bill` column

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [5]:
df['size'].value_counts()

Unnamed: 0_level_0,count
size,Unnamed: 1_level_1
2,156
3,38
4,37
5,5
1,4
6,4


In [6]:
df['sex'].value_counts()

Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
Male,157
Female,87


In [7]:
df['time'].value_counts()

Unnamed: 0_level_0,count
time,Unnamed: 1_level_1
Dinner,176
Lunch,68


In [8]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

## Split Data into training and test

In [8]:
X=df.drop('total_bill', axis=1)
y=df['total_bill']

In [9]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y, test_size=0.3, random_state=101)

## Feature Encoding

* convert all category into a numerical value use One Hot Encoding Methode.

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


### Convert Training into binary Value by Using `LabelEncoder()`

In [12]:
# we use LableEncoder to convert our category into binary values (0 and 1)
le1=LabelEncoder()
le2=LabelEncoder()
le3=LabelEncoder()

In [13]:
#  X_train
# we don't convert day because he doesn't have a binary classification
X_train['sex']=le1.fit_transform(X_train['sex'])
X_train['smoker']=le2.fit_transform(X_train['smoker'])
X_train['time']=le3.fit_transform(X_train['time'])

In [14]:
X_train['time'].value_counts()

Unnamed: 0_level_0,count
time,Unnamed: 1_level_1
0,120
1,50


In [15]:
X_train['sex'].value_counts()

Unnamed: 0_level_0,count
sex,Unnamed: 1_level_1
1,107
0,63


In [16]:
X_train['smoker'].value_counts()

Unnamed: 0_level_0,count
smoker,Unnamed: 1_level_1
0,102
1,68


### Convert Test Data into binary Values

In [17]:
le1=LabelEncoder()
le2=LabelEncoder()
le3=LabelEncoder()
X_test['sex']=le1.fit_transform(X_test['sex'])
X_test['smoker']=le2.fit_transform(X_test['smoker'])
X_test['time']=le3.fit_transform(X_test['time'])

In [18]:
X_test.head()

Unnamed: 0,tip,sex,smoker,day,time,size
151,2.0,1,0,Sun,0,2
34,3.27,1,0,Sat,0,2
109,4.0,0,1,Sat,0,2
4,3.61,0,0,Sun,0,4
114,4.0,0,0,Sun,0,3


### Usin OneHotEncoding For Day Feature

In [19]:
ct=ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'), [3])], remainder='passthrough')

In [20]:
X_train =ct.fit_transform(X_train)

In [21]:
X_test = ct.transform(X_test)

In [22]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,0.0,1.0,2.50,0.0,1.0,1.0,2.0
1,1.0,0.0,0.0,3.16,1.0,1.0,0.0,2.0
2,0.0,1.0,0.0,5.00,1.0,0.0,0.0,5.0
3,1.0,0.0,0.0,2.00,0.0,1.0,0.0,2.0
4,0.0,1.0,0.0,1.67,0.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...
165,1.0,0.0,0.0,3.76,1.0,1.0,0.0,4.0
166,1.0,0.0,0.0,1.97,1.0,0.0,0.0,2.0
167,0.0,0.0,1.0,3.40,1.0,0.0,1.0,2.0
168,0.0,1.0,0.0,5.00,0.0,0.0,0.0,4.0


## Modeling

In [23]:
# Import SVR algorithms
from sklearn.svm import SVR

In [24]:
svr = SVR(kernel='linear')
svr.fit(X_train, y_train)

In [25]:
y_pred = svr.predict(X_test)

In [26]:
# import metrics for evaluate
from sklearn.metrics import r2_score , mean_absolute_error , mean_squared_error
print(f'R2 Score : {r2_score(y_test, y_pred)}')
print(f'MAE : {mean_absolute_error(y_test, y_pred)}')
print(f'MSE : {mean_squared_error(y_test, y_pred)}')


R2 Score : 0.3857879661116195
MAE : 4.810027126200832
MSE : 49.39104232031621


## HyperParameters Tuning

In [27]:
# hyperParameter Using GridSearchCV
from sklearn.model_selection import GridSearchCV
# definig  parameter range
param_grid = {'C': [0.1,1,10,100,1000],
              'gamma': [1,0.1,0.01,0.001,0.0001],
              'kernel': ['rbf']
             }


In [28]:
grid=GridSearchCV(SVR(), param_grid, refit=True, verbose=3,cv=5)

In [29]:
# Fitting model into training
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.027 total time=   0.0s
[CV 2/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.176 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.038 total time=   0.0s
[CV 4/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.028 total time=   0.0s
[CV 5/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.020 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.092 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.1, kernel=rbf;, score=-0.049 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.151 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.082 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.121 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.031 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=0.01, kernel=rbf;

In [30]:
grid.best_params_

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [31]:
grid_predictions = grid.predict(X_test)


In [32]:
# evaluation Metrics
print(f'R2 Score : {r2_score(y_test, grid_predictions)}')
print(f'MAE : {mean_absolute_error(y_test, grid_predictions)}')
print(f'MSE : {mean_squared_error(y_test, grid_predictions)}')

R2 Score : 0.33610936726863105
MAE : 5.023306775179867
MSE : 53.38588065380604


In [33]:
# fit model with best params
svr = SVR(kernel='rbf', C=1000, gamma=0.01)
svr.fit(X_train, y_train)

In [34]:
y_preds_best = svr.predict(X_test)

In [35]:
# evaluation metrics
print(f'R2 Score : {r2_score(y_test, y_preds_best)}')
print(f'MAE : {mean_absolute_error(y_test, y_preds_best)}')
print(f'MSE : {mean_squared_error(y_test, y_preds_best)}')

R2 Score : 0.3322032717721556
MAE : 4.991869181470223
MSE : 53.69998411861812
