# Support Vector Regression (SVR)

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Colab-only helper module used to upload local files into the notebook session
from google.colab import files

# Prompt for a file upload; the result is indexed by file name in the next cell
# (uploaded['Position_Salaries.csv'])
uploaded = files.upload()

Saving Position_Salaries.csv to Position_Salaries.csv


In [32]:
import io

# Take the uploaded CSV content out of the upload result and wrap it in an
# in-memory binary buffer so that pandas can parse it like a file.
csv_bytes = uploaded['Position_Salaries.csv']
csv_buffer = io.BytesIO(csv_bytes)
dataset = pd.read_csv(csv_buffer, delimiter=",")

# Preview the first ten rows
dataset.head(10)

Unnamed: 0,Position,Level,Salary
0,Business Analyst,1,43000
1,Business Analyst,1,45000
2,Business Analyst,1,46000
3,Junior Consultant,2,50000
4,Junior Consultant,2,53000
5,Junior Consultant,2,55000
6,Senior Consultant,3,60000
7,Senior Consultant,3,63000
8,Senior Consultant,3,65000
9,Manager,4,80000


In [33]:
# Alternative when the CSV is available on disk instead of being uploaded:
# dataset = pd.read_csv('Position_Salaries.csv')

# Extracting features (independent variables) from the dataset
# iloc[:, 1:-1] keeps all rows but drops BOTH the first column (index 0) and
# the last column, so only the columns in between are used as features.
# The printed X_train further down shows a single numeric column.
# Slicing (rather than picking one column) keeps X as a 2D array.
X = dataset.iloc[:, 1:-1].values

# Extracting the target variable (dependent variable) from the dataset
# iloc[:, -1] selects all rows of the last column only, giving a 1D array
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state makes the split repeatable from run to run.
# The call returns four pieces, in this order:
#   training features, test features, training targets, test targets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

## Feature Scaling

### X_train and X_test Scaling Process


In [35]:
# Training features BEFORE scaling (raw, unstandardized values)
print(X_train)

[[10]
 [ 4]
 [ 6]
 [ 8]
 [ 2]
 [ 6]
 [ 3]
 [ 5]
 [ 8]
 [ 7]
 [ 1]
 [10]
 [ 3]
 [ 2]
 [ 7]
 [ 7]
 [ 4]
 [ 3]
 [ 9]
 [ 2]
 [ 1]
 [ 8]
 [ 6]
 [ 5]]


In [36]:
# Test features BEFORE scaling (raw, unstandardized values)
print(X_test)

[[ 1]
 [10]
 [ 5]
 [ 4]
 [ 9]
 [ 9]]


In [37]:
from sklearn.preprocessing import StandardScaler

# Scaler dedicated to the feature matrix
ss_scaler_X = StandardScaler()

# Learn the mean and standard deviation from the training features only,
# so the test set has no influence on the scaling parameters
ss_scaler_X.fit(X_train)

# Standardize both sets with those training-set statistics
X_train = ss_scaler_X.transform(X_train)
X_test = ss_scaler_X.transform(X_test)

In [38]:
# Training features AFTER scaling (standardized with ss_scaler_X)
print(X_train)

[[ 1.7242366 ]
 [-0.47302066]
 [ 0.25939843]
 [ 0.99181751]
 [-1.20543975]
 [ 0.25939843]
 [-0.8392302 ]
 [-0.10681112]
 [ 0.99181751]
 [ 0.62560797]
 [-1.57164929]
 [ 1.7242366 ]
 [-0.8392302 ]
 [-1.20543975]
 [ 0.62560797]
 [ 0.62560797]
 [-0.47302066]
 [-0.8392302 ]
 [ 1.35802706]
 [-1.20543975]
 [-1.57164929]
 [ 0.99181751]
 [ 0.25939843]
 [-0.10681112]]


In [39]:
# Test features AFTER scaling (standardized with the statistics learned from X_train)
print(X_test)

[[-1.57164929]
 [ 1.7242366 ]
 [-0.10681112]
 [-0.47302066]
 [ 1.35802706]
 [ 1.35802706]]


### Scaling **y_train**



In [40]:
# (Before Scaling) y_train -- the raw target values, still a 1D array
print(y_train)

[1000000   85000  155000  330000   55000  152000   65000  115000  350000
  250000   45000 1050000   60000   53000  200000  230000   80000   63000
  520000   50000   43000  300000  150000  110000]


The code blocks below perform **data preprocessing** by **standardizing** the target variable (**y_train**) of the training set.

To scale **y_train**, we use a separate StandardScaler() object that we define ourselves, named ss_scaler_y. However, StandardScaler expects its input as a **2D array**, whereas y_train is a 1D array. *Therefore*,

y_train = y_train.reshape(len(y_train), 1)

the **reshape** command is used to convert y_train from a 1D array into a 2D array with a single column.

This operation changes only the dimensions (shape) of the target variable; its values stay the same.

In [41]:
# StandardScaler works on 2D input, while y_train is currently a flat 1D array.
# Turn it into a single-column 2D array: -1 lets NumPy infer the number of rows
# (the number of elements in y_train) and 1 is the number of columns.
y_train = y_train.reshape(-1, 1)

# Show the column-shaped y_train to inspect the change
print(y_train)

[[1000000]
 [  85000]
 [ 155000]
 [ 330000]
 [  55000]
 [ 152000]
 [  65000]
 [ 115000]
 [ 350000]
 [ 250000]
 [  45000]
 [1050000]
 [  60000]
 [  53000]
 [ 200000]
 [ 230000]
 [  80000]
 [  63000]
 [ 520000]
 [  50000]
 [  43000]
 [ 300000]
 [ 150000]
 [ 110000]]


In [42]:
from sklearn.preprocessing import StandardScaler

# Separate scaler for the target variable; it is kept in its own object so the
# predictions can later be mapped back with ss_scaler_y.inverse_transform
ss_scaler_y = StandardScaler()

# Learn the mean and standard deviation of the training targets ...
ss_scaler_y.fit(y_train)

# ... and standardize the training targets with them
y_train = ss_scaler_y.transform(y_train)

In [43]:
# (After Scaling) y_train -- standardized values, still a single-column 2D array
print(y_train)

[[ 2.88035303e+00]
 [-5.40738026e-01]
 [-2.79015213e-01]
 [ 3.75291819e-01]
 [-6.52904946e-01]
 [-2.90231905e-01]
 [-6.15515972e-01]
 [-4.28571106e-01]
 [ 4.50069766e-01]
 [ 7.61800330e-02]
 [-6.90293919e-01]
 [ 3.06729789e+00]
 [-6.34210459e-01]
 [-6.60382740e-01]
 [-1.10764833e-01]
 [ 1.40208650e-03]
 [-5.59432513e-01]
 [-6.22993767e-01]
 [ 1.08568231e+00]
 [-6.71599432e-01]
 [-6.97771714e-01]
 [ 2.63124899e-01]
 [-2.97709700e-01]
 [-4.47265593e-01]]


In [44]:
# Flatten y_train from a single-column 2D array back to its original 1D SHAPE
# before applying the algorithm; only the shape changes, not the scaled values
y_train = y_train.ravel()
print(y_train)

[ 2.88035303e+00 -5.40738026e-01 -2.79015213e-01  3.75291819e-01
 -6.52904946e-01 -2.90231905e-01 -6.15515972e-01 -4.28571106e-01
  4.50069766e-01  7.61800330e-02 -6.90293919e-01  3.06729789e+00
 -6.34210459e-01 -6.60382740e-01 -1.10764833e-01  1.40208650e-03
 -5.59432513e-01 -6.22993767e-01  1.08568231e+00 -6.71599432e-01
 -6.97771714e-01  2.63124899e-01 -2.97709700e-01 -4.47265593e-01]


## Training the SVR model on the Training set

In [45]:
# Importing the Support Vector Regressor (SVR) from scikit-learn
from sklearn.svm import SVR

# Creating an instance of SVR with the radial basis function (RBF) kernel;
# no other hyperparameters are passed, so they keep their library defaults
regressor = SVR(kernel='rbf') # 'rbf' is one of several kernel choices (e.g. 'linear', 'poly', 'sigmoid')

# Fitting the SVR model to the training data
# (both X_train and y_train were standardized in the cells above)
regressor.fit(X_train, y_train)

## Predicting the Test set results

In [46]:
# Predict the target for the (scaled) test features
y_pred = regressor.predict(X_test)


# The model was trained on a standardized target, so the predicted values are
# on that standardized scale as well, not in the original units.

print(y_pred)

[-0.59736988  1.67900483 -0.35808406 -0.45979305  1.16729438  1.16729438]


### Perform the inverse scaling operation on the values predicted by the algorithm (y_pred).
This way, they are converted back to their original scale.

Above, the predicted values generated by the algorithm, **y_pred**, are in a standardized format. That is, its output is in the following form.

In [47]:
# Predictions as returned by the model: 1D and still on the standardized scale
print(y_pred)

[-0.59736988  1.67900483 -0.35808406 -0.45979305  1.16729438  1.16729438]


Before calculating the performance of our model,

y_pred = ss_scaler_y.inverse_transform(y_pred)

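**y_pred** must be transformed back to its **original scale** using the command above. Because the scaler works on 2D arrays, y_pred is first reshaped into a column vector, then inverse-transformed, and finally flattened back to 1D.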

In [48]:
# Turn the 1D predictions into a single-column 2D array (a column vector);
# -1 lets NumPy infer the number of rows from the number of predictions
y_pred = y_pred.reshape(-1, 1)

# Show the column-shaped predictions
print(y_pred)

[[-0.59736988]
 [ 1.67900483]
 [-0.35808406]
 [-0.45979305]
 [ 1.16729438]
 [ 1.16729438]]


In [49]:
# Return to the original SCALE using the scaler that was fitted on y_train.
# NOTE: this overwrites y_pred, so running this cell a second time would apply
# the inverse transform twice.
y_pred = ss_scaler_y.inverse_transform(y_pred)
print(y_pred)

[[ 69853.32632009]
 [678689.1725956 ]
 [133852.36287054]
 [106649.42572704]
 [541827.8438246 ]
 [541827.8438246 ]]


In [50]:
# Finally flatten the predictions back to their original 1D SHAPE
y_pred = y_pred.ravel()
print(y_pred)

[ 69853.32632009 678689.1725956  133852.36287054 106649.42572704
 541827.8438246  541827.8438246 ]


## Evaluating the Model Performance

In [51]:
# Actual test-set targets (y_test was never scaled), for comparison with y_pred
y_test

array([  46000, 1030000,  113000,   83000,  550000,  500000])

In [52]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions against the actual
# test targets; both are in the original, unscaled units at this point
r2_score(y_true=y_test, y_pred=y_pred)

0.8278500764932027