In [1]:
#Q1. In order to predict house price based on several characteristics, such as location, square footage,
#number of bedrooms, etc., you are developing an SVM regression model. Which regression metric in this
#situation would be the best to employ?

In order to predict house price based on several characteristics, such as location, square footage, size, etc., we are developing an SVM regression model. The best regression metric in this situation would be the mean squared error (MSE)

The MSE measures the average squared difference between the predicted and actual values of the target variable, It is a popular metric for regression problems because it penalizes large errors more heavily than small errors

In [1]:
# Import the necessary libraries and load the dataset
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from warnings import filterwarnings
from sklearn.metrics import RocCurveDisplay
from datetime import datetime
from sklearn.metrics import mean_squared_error
%matplotlib inline
filterwarnings('ignore')


df=pd.read_csv('https://raw.githubusercontent.com/abhimukh19/datasets/main/Bengaluru_House_Data.csv')
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [2]:
#Stripping spaces across all values in dataframe
df = df.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
df.columns = df.columns.str.strip()

In [3]:
#Check for the missing/null values
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [4]:
#check if there are duplicate values
df.duplicated().sum()

530

In [5]:
#Replacing missing values with next value
df.fillna(method='ffill', inplace=True)


In [6]:
df[['HomeSize','Bedrooms']] = df['size'].str.split(' ',1,expand=True)


In [7]:
df=df.drop('Bedrooms',axis=1)
df=df.drop('size',axis=1)


In [8]:
df['availability'] = df['availability'].replace('Ready To Move', datetime.today().strftime('%d-%b'))
df

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,HomeSize
0,Super built-up Area,19-Dec,Electronic City Phase II,Coomee,1056,2.0,1.0,39.07,2
1,Plot Area,09-Apr,Chikka Tirupathi,Theanmp,2600,5.0,3.0,120.00,4
2,Built-up Area,09-Apr,Uttarahalli,Theanmp,1440,2.0,3.0,62.00,3
3,Super built-up Area,09-Apr,Lingadheeranahalli,Soiewre,1521,3.0,1.0,95.00,3
4,Super built-up Area,09-Apr,Kothanur,Soiewre,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,09-Apr,Whitefield,ArsiaEx,3453,4.0,0.0,231.00,5
13316,Super built-up Area,09-Apr,Richards Town,ArsiaEx,3600,5.0,0.0,400.00,4
13317,Built-up Area,09-Apr,Raja Rajeshwari Nagar,Mahla T,1141,2.0,1.0,60.00,2
13318,Super built-up Area,18-Jun,Padmanabhanagar,SollyCl,4689,4.0,1.0,488.00,4


In [9]:
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder
onh=preprocessing.OrdinalEncoder()
df['area_type']=onh.fit_transform(df[['area_type']])
df

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,HomeSize
0,3.0,19-Dec,Electronic City Phase II,Coomee,1056,2.0,1.0,39.07,2
1,2.0,09-Apr,Chikka Tirupathi,Theanmp,2600,5.0,3.0,120.00,4
2,0.0,09-Apr,Uttarahalli,Theanmp,1440,2.0,3.0,62.00,3
3,3.0,09-Apr,Lingadheeranahalli,Soiewre,1521,3.0,1.0,95.00,3
4,3.0,09-Apr,Kothanur,Soiewre,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...,...,...
13315,0.0,09-Apr,Whitefield,ArsiaEx,3453,4.0,0.0,231.00,5
13316,3.0,09-Apr,Richards Town,ArsiaEx,3600,5.0,0.0,400.00,4
13317,0.0,09-Apr,Raja Rajeshwari Nagar,Mahla T,1141,2.0,1.0,60.00,2
13318,3.0,18-Jun,Padmanabhanagar,SollyCl,4689,4.0,1.0,488.00,4


In [10]:
onh=preprocessing.OrdinalEncoder()
df['location']=onh.fit_transform(df[['location']])
df

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,HomeSize
0,3.0,19-Dec,405.0,Coomee,1056,2.0,1.0,39.07,2
1,2.0,09-Apr,303.0,Theanmp,2600,5.0,3.0,120.00,4
2,0.0,09-Apr,1166.0,Theanmp,1440,2.0,3.0,62.00,3
3,3.0,09-Apr,743.0,Soiewre,1521,3.0,1.0,95.00,3
4,3.0,09-Apr,702.0,Soiewre,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...,...,...
13315,0.0,09-Apr,1239.0,ArsiaEx,3453,4.0,0.0,231.00,5
13316,3.0,09-Apr,991.0,ArsiaEx,3600,5.0,0.0,400.00,4
13317,0.0,09-Apr,959.0,Mahla T,1141,2.0,1.0,60.00,2
13318,3.0,18-Jun,894.0,SollyCl,4689,4.0,1.0,488.00,4


In [11]:
onh=preprocessing.OrdinalEncoder()
df['society']=onh.fit_transform(df[['society']])
df

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,HomeSize
0,3.0,19-Dec,405.0,464.0,1056,2.0,1.0,39.07,2
1,2.0,09-Apr,303.0,2439.0,2600,5.0,3.0,120.00,4
2,0.0,09-Apr,1166.0,2439.0,1440,2.0,3.0,62.00,3
3,3.0,09-Apr,743.0,2186.0,1521,3.0,1.0,95.00,3
4,3.0,09-Apr,702.0,2186.0,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...,...,...
13315,0.0,09-Apr,1239.0,209.0,3453,4.0,0.0,231.00,5
13316,3.0,09-Apr,991.0,209.0,3600,5.0,0.0,400.00,4
13317,0.0,09-Apr,959.0,1216.0,1141,2.0,1.0,60.00,2
13318,3.0,18-Jun,894.0,2205.0,4689,4.0,1.0,488.00,4


In [12]:
onh=preprocessing.OrdinalEncoder()
df['availability']=onh.fit_transform(df[['availability']])
df

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,HomeSize
0,3.0,41.0,405.0,464.0,1056,2.0,1.0,39.07,2
1,2.0,0.0,303.0,2439.0,2600,5.0,3.0,120.00,4
2,0.0,0.0,1166.0,2439.0,1440,2.0,3.0,62.00,3
3,3.0,0.0,743.0,2186.0,1521,3.0,1.0,95.00,3
4,3.0,0.0,702.0,2186.0,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...,...,...
13315,0.0,0.0,1239.0,209.0,3453,4.0,0.0,231.00,5
13316,3.0,0.0,991.0,209.0,3600,5.0,0.0,400.00,4
13317,0.0,0.0,959.0,1216.0,1141,2.0,1.0,60.00,2
13318,3.0,33.0,894.0,2205.0,4689,4.0,1.0,488.00,4


In [13]:
df = df[~df['total_sqft'].str.contains('[^a-zA-Z0-9]+')]
df = df[~df['total_sqft'].str.contains('Perch')]
df = df[~df['total_sqft'].str.contains('Acres')]
df = df[~df['total_sqft'].str.contains('Cents')]
df = df[~df['total_sqft'].str.contains('Guntha')]
df = df[~df['total_sqft'].str.contains('1Grounds')]

df['total_sqft'] = df['total_sqft'].str.replace('Sq. Meter', '')
df['total_sqft'] = pd.to_numeric(df['total_sqft'].str.replace('Sq. Yards', ''))* 0.83612736

df


Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,HomeSize
0,3.0,41.0,405.0,464.0,882.950492,2.0,1.0,39.07,2
1,2.0,0.0,303.0,2439.0,2173.931136,5.0,3.0,120.00,4
2,0.0,0.0,1166.0,2439.0,1204.023398,2.0,3.0,62.00,3
3,3.0,0.0,743.0,2186.0,1271.749715,3.0,1.0,95.00,3
4,3.0,0.0,702.0,2186.0,1003.352832,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...,...,...
13315,0.0,0.0,1239.0,209.0,2887.147774,4.0,0.0,231.00,5
13316,3.0,0.0,991.0,209.0,3010.058496,5.0,0.0,400.00,4
13317,0.0,0.0,959.0,1216.0,954.021318,2.0,1.0,60.00,2
13318,3.0,33.0,894.0,2205.0,3920.601191,4.0,1.0,488.00,4


In [15]:
#Split the da#taset into training and testing sets

X=df.drop('price', axis=1)
y=df['price']

X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.3, random_state=40)
X_train, X_test, y_train, y_test

(       area_type  availability  location  society   total_sqft  bath  balcony  \
 3298         0.0           0.0    1239.0   1851.0   961.546464   2.0      0.0   
 2939         3.0          28.0     492.0   1862.0  1024.256016   2.0      1.0   
 832          3.0           0.0      58.0    353.0  1212.384672   2.0      1.0   
 12301        0.0           0.0     949.0   1477.0  1193.153743   2.0      2.0   
 4506         3.0           0.0     189.0   1568.0  1091.146205   2.0      1.0   
 ...          ...           ...       ...      ...          ...   ...      ...   
 11834        3.0          35.0     656.0   2558.0  1701.519178   3.0      2.0   
 5564         0.0          56.0    1026.0   1357.0  1621.250951   3.0      0.0   
 7820         0.0           0.0      42.0   1475.0  2006.705664   5.0      3.0   
 11553        0.0           0.0     464.0    388.0  3344.509440   5.0      1.0   
 11892        3.0           0.0     508.0   1154.0  1398.004946   3.0      2.0   
 
       HomeSiz

In [None]:
# Create SVM regression model
model = SVR(kernel='linear')
model.fit(X_train,y_train)
# Predict target variable
y_pred = model.predict(X_test)

# Calculate RMSE
mse = mean_squared_error(y_true, y_pred)
print('mse:', mse)

In [None]:
#Q2. You have built an SVM regression model and are trying to decide between using MSE or R-squared as
#your evaluation metric. Which metric would be more appropriate if your goal is to predict the actual price
#of a house as accurately as possible?

If our goal is to predict the actual price of a house as accurately as possible, then we should use MSE (Mean Squared Error) as the evaluation metric

R-squared does not tell us how close the predicted values are to the actual values, which is important in this case.

In [22]:
#Q3. You have a dataset with a significant number of outliers and are trying to select an appropriate
#regression metric to use with your SVM model. Which metric would be the most appropriate in this
#scenario?

If we have a dataset with a significant number of outliers and are trying to select an appropriate regression metric to use with the SVM model, then we should use Mean Absolute Error (MAE) as our evaluation metric. MAE is less sensitive to outliers than MSE because it does not square the errors.

SVM is also robust to outliers, so it can handle datasets with a significant number of outliers

In [None]:
#Q4. You have built an SVM regression model using a polynomial kernel and are trying to select the best
#metric to evaluate its performance. You have calculated both MSE and RMSE and found that both values
#are very close. Which metric should you choose to use in this case?

Both MSE and RMSE are used to evaluate regression models, but they have different interpretations. 

MSE is the average of the squared differences between predicted and actual values, while RMSE is the square root of MSE. 

RMSE is preferred over MSE when we want to penalize large errors more than smaller ones because it squares the errors before averaging them.

Since both values are very close, we can choose either metric to evaluate the model’s performance.

In [None]:
#Q5. You are comparing the performance of different SVM regression models using different kernels (linear, polynomial, and RBF) and are trying to select the best evaluation metric. Which metric would be most appropriate if your goal is to measure how well the model explains the variance in the target variable?

The most appropriate metric for measuring how well the model explains the variance in the target variable is Mean Squared Error (MSE), which measures the average squared difference between the predicted and actual values