In [1]:
# Connecting Google colab with Google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Importing required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

In [3]:
# Reading the 'Saratoga_House_Prices.csv' file
# Store it in a variable called data
data = pd.read_csv('/content/drive/MyDrive/Datasets/Saratoga_House_Prices.csv')
data.head()

Unnamed: 0,price,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,heating,fuel,sewer,waterfront,newConstruction,centralAir
0,132500,0.09,42,50000,906,35,2,1,1.0,5,electric,electric,septic,No,No,No
1,181115,0.92,0,22300,1953,51,3,0,2.5,6,hot water/steam,gas,septic,No,No,No
2,109000,0.19,133,7300,1944,51,4,1,1.0,8,hot water/steam,gas,public/commercial,No,No,No
3,155000,0.41,13,18700,1944,51,3,1,1.5,5,hot air,gas,septic,No,No,No
4,86060,0.11,0,15000,840,51,2,0,1.0,3,hot air,gas,public/commercial,No,Yes,Yes


In [4]:
# Get the dimension of the data; number of rows and columns
print('Number of rows is:', data.shape[0])
print('Number of columns is:', data.shape[1])

Number of rows is: 1728
Number of columns is: 16


In [5]:
# Get the column names of the data
data.columns

Index(['price', 'lotSize', 'age', 'landValue', 'livingArea', 'pctCollege',
       'bedrooms', 'fireplaces', 'bathrooms', 'rooms', 'heating', 'fuel',
       'sewer', 'waterfront', 'newConstruction', 'centralAir'],
      dtype='object')

In [6]:
# Get the column types of the data
data.dtypes

Unnamed: 0,0
price,int64
lotSize,float64
age,int64
landValue,int64
livingArea,int64
pctCollege,int64
bedrooms,int64
fireplaces,int64
bathrooms,float64
rooms,int64


In [7]:
# Check for any missing values in the data
data.isnull().sum() # The result says no missing value

Unnamed: 0,0
price,0
lotSize,0
age,0
landValue,0
livingArea,0
pctCollege,0
bedrooms,0
fireplaces,0
bathrooms,0
rooms,0


In [8]:
# As I can see some age column value is 0, which is not convinient
# So, I will replace them with the median value of the 'age' column

# Calculate the median of the 'age' column, excluding zeros
median_age = data.loc[data['age'] != 0, 'age'].median()

# Replace 0 values in the 'age' column with the calculated median
data['age'] = data['age'].replace(0, median_age)

# Check the result
data.head()

Unnamed: 0,price,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,heating,fuel,sewer,waterfront,newConstruction,centralAir
0,132500,0.09,42,50000,906,35,2,1,1.0,5,electric,electric,septic,No,No,No
1,181115,0.92,19,22300,1953,51,3,0,2.5,6,hot water/steam,gas,septic,No,No,No
2,109000,0.19,133,7300,1944,51,4,1,1.0,8,hot water/steam,gas,public/commercial,No,No,No
3,155000,0.41,13,18700,1944,51,3,1,1.5,5,hot air,gas,septic,No,No,No
4,86060,0.11,19,15000,840,51,2,0,1.0,3,hot air,gas,public/commercial,No,Yes,Yes


In [9]:
data.head()

Unnamed: 0,price,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,heating,fuel,sewer,waterfront,newConstruction,centralAir
0,132500,0.09,42,50000,906,35,2,1,1.0,5,electric,electric,septic,No,No,No
1,181115,0.92,19,22300,1953,51,3,0,2.5,6,hot water/steam,gas,septic,No,No,No
2,109000,0.19,133,7300,1944,51,4,1,1.0,8,hot water/steam,gas,public/commercial,No,No,No
3,155000,0.41,13,18700,1944,51,3,1,1.5,5,hot air,gas,septic,No,No,No
4,86060,0.11,19,15000,840,51,2,0,1.0,3,hot air,gas,public/commercial,No,Yes,Yes


In [10]:
# I dont need so much columns
# So, I am going to drop some columns that I am not considering fot building model

data_v1 = data.drop(['sewer', 'newConstruction'], axis = 1)
data_v1.head()

Unnamed: 0,price,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,heating,fuel,waterfront,centralAir
0,132500,0.09,42,50000,906,35,2,1,1.0,5,electric,electric,No,No
1,181115,0.92,19,22300,1953,51,3,0,2.5,6,hot water/steam,gas,No,No
2,109000,0.19,133,7300,1944,51,4,1,1.0,8,hot water/steam,gas,No,No
3,155000,0.41,13,18700,1944,51,3,1,1.5,5,hot air,gas,No,No
4,86060,0.11,19,15000,840,51,2,0,1.0,3,hot air,gas,No,Yes


In [11]:
# Exploring Categorical columns before model building
# The last four columns of data_v1 are categorical and ofcourse nominal (as there is no order)

# Print unique value of the 'heating' column
print('Distinct values of heating column are:',data_v1['heating'].unique())

# Print unique value of the 'fuel' column
print('Distinct values of fuel column are:',data_v1['fuel'].unique())

# Print unique value of the 'waterfront' column
print('Distinct values of waterfront column are:',data_v1['waterfront'].unique())

# Print unique value of the 'centralAir' column
print('Distinct values of centralAir column are:',data_v1['centralAir'].unique())

Distinct values of heating column are: ['electric' 'hot water/steam' 'hot air']
Distinct values of fuel column are: ['electric' 'gas' 'oil']
Distinct values of waterfront column are: ['No' 'Yes']
Distinct values of centralAir column are: ['No' 'Yes']


In [12]:
# Define the OneHotEncoder object as ohe
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False).set_output(transform = 'pandas')

In [13]:
# Applying the ohe object on the four categorical columns and storing the result in ohe_data dataframe
ohe_data = ohe.fit_transform(data_v1[['heating', 'fuel', 'waterfront', 'centralAir']])
ohe_data

Unnamed: 0,heating_electric,heating_hot air,heating_hot water/steam,fuel_electric,fuel_gas,fuel_oil,waterfront_No,waterfront_Yes,centralAir_No,centralAir_Yes
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
1723,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1724,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1725,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1726,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [14]:
# Merging the data_v1 data frame with ohe_data data frame
# Delete the main columns named 'heating', 'fuel', 'waterfront' and 'centralAir'
data_v2 = pd.concat([data_v1, ohe_data], axis = 1).drop(['heating','fuel', 'waterfront', 'centralAir'], axis = 1)
data_v2

Unnamed: 0,price,lotSize,age,landValue,livingArea,pctCollege,bedrooms,fireplaces,bathrooms,rooms,heating_electric,heating_hot air,heating_hot water/steam,fuel_electric,fuel_gas,fuel_oil,waterfront_No,waterfront_Yes,centralAir_No,centralAir_Yes
0,132500,0.09,42,50000,906,35,2,1,1.0,5,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,181115,0.92,19,22300,1953,51,3,0,2.5,6,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,109000,0.19,133,7300,1944,51,4,1,1.0,8,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,155000,0.41,13,18700,1944,51,3,1,1.5,5,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,86060,0.11,19,15000,840,51,2,0,1.0,3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,289000,0.38,32,24200,2310,61,5,1,2.5,11,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1724,285000,0.94,37,36200,2564,61,4,1,2.5,11,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1725,194900,0.39,9,20400,1099,51,2,0,1.0,3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1726,125000,0.24,48,16800,1225,51,3,1,1.0,7,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [15]:
# Setting the output variable (y) and input variables (X) for the model
X = data_v2.drop(['price'], axis = 1)
y = data_v2['price']

In [16]:
# Train test split the data set for training and testing the model
# In general 80% data is used for training purpose
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size = 0.2, random_state = 10)
print('Shape of X_train is:', X_train.shape)
print('Shape of X_test is:', X_test.shape)
print('Shape of y_train is:', y_train.shape)
print('Shape of y_test is:', y_test.shape)

Shape of X_train is: (1382, 19)
Shape of X_test is: (346, 19)
Shape of y_train is: (1382,)
Shape of y_test is: (346,)


In [17]:
# Defining the LinearRegression object and storing it in a variable called model
model = LinearRegression()

In [18]:
# Fit the training data (X_train and y_train) to the model named 'model'
model.fit(X_train, y_train)

In [19]:
# Predicting the model outcome using X_test and storing it into y_pred variable
y_pred = model.predict(X_test)

In [20]:
# Checking accuracy of the model called 'model'
# The evaluation metrices that are used are: r2_score, root_mean_squared_error and accuracy_score
print('The R2 value for the model is:', r2_score(y_pred, y_test))
print('The RMSE value for the model is:', root_mean_squared_error(y_pred, y_test))
print('The MAE value for the model is:', mean_absolute_error(y_pred, y_test))
print('The MSE value for the model is:', mean_squared_error(y_pred, y_test))

The R2 value for the model is: 0.5291253392355719
The RMSE value for the model is: 53530.061234990026
The MAE value for the model is: 39671.15633042291
The MSE value for the model is: 2865467455.821782


#### So, my linear regression model called 'model' has a R2 value of approximately 0.53. It means that approximately 53% variability in the data is explained by the model. In later, some techniques will be discussed to make the R2 value more close to 1