In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score


In [3]:
df = pd.read_csv(r"D:\University\IMT\Python Codes\House_Price\Housing.csv")
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [9]:
#Data Preprocessing
df.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [10]:
df.shape

(545, 13)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [12]:
# description for numerical features
df.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [13]:
df.duplicated().sum()

np.int64(0)

In [14]:
# Select binary columns
binary_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
df[binary_columns] = df[binary_columns].replace({'yes': 1, 'no': 0})

  df[binary_columns] = df[binary_columns].replace({'yes': 1, 'no': 0})


In [15]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


In [16]:
# there is order one hot encoding for categorical features
df_one_hot = pd.get_dummies(df , columns=['furnishingstatus'])

In [17]:
df_one_hot.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,True,False,False
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,True,False,False
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,False,True,False
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,True,False,False
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,True,False,False


In [18]:
# Label Encoding for categorical features
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [19]:
df['furnishingstatus_encoded'] = le.fit_transform(df['furnishingstatus'])
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,furnishingstatus_encoded
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished,0


In [20]:
# Split data to train and test
y = df['price']
X = df.drop(['price', 'furnishingstatus'], axis=1)

In [21]:
X

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_encoded
0,7420,4,2,3,1,0,0,0,1,2,1,0
1,8960,4,4,4,1,0,0,0,1,3,0,0
2,9960,3,2,2,1,0,1,0,0,2,1,1
3,7500,4,2,2,1,0,1,0,1,3,1,0
4,7420,4,1,2,1,1,1,0,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
540,3000,2,1,1,1,0,1,0,0,2,0,2
541,2400,3,1,1,0,0,0,0,0,0,0,1
542,3620,2,1,1,1,0,0,0,0,0,0,2
543,2910,3,1,1,0,0,0,0,0,0,0,0


In [22]:
y

0      13300000
1      12250000
2      12250000
3      12215000
4      11410000
         ...   
540     1820000
541     1767150
542     1750000
543     1750000
544     1750000
Name: price, Length: 545, dtype: int64

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
scaler = StandardScaler()

In [25]:
# scale the features are numerical
numerical_columns = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])

In [26]:
X_train

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_encoded
46,0.384168,0.055271,1.539173,2.587644,1,0,0,0,1,0.367957,0,0
93,0.929181,0.055271,1.539173,-0.912499,1,0,1,0,1,2.709987,0,1
335,-0.607755,-1.283514,-0.557950,-0.912499,1,0,1,0,1,1.538972,0,0
412,-1.155492,0.055271,-0.557950,0.254215,1,0,1,0,0,-0.803059,1,2
471,-0.637730,0.055271,-0.557950,0.254215,1,0,0,0,0,-0.803059,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
71,0.384168,1.394055,1.539173,2.587644,1,0,0,0,1,-0.803059,0,2
106,0.134371,1.394055,1.539173,-0.912499,1,0,1,0,1,-0.803059,1,1
270,-0.297097,0.055271,1.539173,1.420929,1,0,0,1,0,0.367957,0,0
435,-0.506019,-1.283514,-0.557950,-0.912499,1,0,0,0,0,-0.803059,0,2


In [27]:
model = LinearRegression()
model.fit(X_train, y_train)

In [28]:
y_pred = model.predict(X_test)

In [29]:
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
R2 = r2_score(y_test, y_pred)

In [30]:
print(f"Mean Squared Error: {MSE}")
print(f"Root Mean Squared Error: {RMSE}")
print(f"R^2 Score: {R2}")

Mean Squared Error: 8.228277360674691e+18
Root Mean Squared Error: 2868497404.683276
R^2 Score: -1627887.6155753082
