In [102]:
# import required libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error

In [20]:
# load and read dataset
dataset = pd.read_csv('Housing.csv')
dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


## Data Preprocessing

In [21]:
# describe data
dataset.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [22]:
# find further info about dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [23]:
# find all columns
dataset.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [27]:
# find unique values in mainroad column
dataset.mainroad.value_counts()

mainroad
1    468
0     77
Name: count, dtype: int64

In [25]:
# perform label encoding as there are only two unique values in mainroad column
label_encoder = LabelEncoder()
dataset['mainroad'] = label_encoder.fit_transform(dataset['mainroad'])

In [26]:
dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,1,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,1,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,1,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,1,yes,yes,no,yes,2,no,furnished


In [29]:
# find unique values for parking
dataset.guestroom.value_counts()

guestroom
no     448
yes     97
Name: count, dtype: int64

In [30]:
# now apply label encoding
dataset['guestroom'] = label_encoder.fit_transform(dataset.guestroom)
dataset.guestroom.head()

0    0
1    0
2    0
3    0
4    1
Name: guestroom, dtype: int32

In [31]:
# now find unique values in guestroom
dataset.guestroom.value_counts()

guestroom
0    448
1     97
Name: count, dtype: int64

In [9]:
# find unique values in basement
dataset.basement.value_counts()

basement
no     354
yes    191
Name: count, dtype: int64

In [10]:
# find unique values in hotwaterheating
dataset.hotwaterheating.value_counts()

hotwaterheating
no     520
yes     25
Name: count, dtype: int64

In [32]:
# apply label encoding on hotwaterheating
dataset['hotwaterheating'] = label_encoder.fit_transform(dataset.hotwaterheating)
dataset.hotwaterheating

0      0
1      0
2      0
3      0
4      0
      ..
540    0
541    0
542    0
543    0
544    0
Name: hotwaterheating, Length: 545, dtype: int32

In [33]:
# now find unique values
dataset.hotwaterheating.value_counts()

hotwaterheating
0    520
1     25
Name: count, dtype: int64

In [11]:
# find unique values in airconditioning
dataset.airconditioning.value_counts()

airconditioning
no     373
yes    172
Name: count, dtype: int64

In [34]:
# apply label encoding
dataset['airconditioning'] = label_encoder.fit_transform(dataset.airconditioning)
dataset.airconditioning

0      1
1      1
2      0
3      1
4      1
      ..
540    0
541    0
542    0
543    0
544    0
Name: airconditioning, Length: 545, dtype: int32

In [35]:
# unique values in airconditioning
dataset.airconditioning.value_counts()

airconditioning
0    373
1    172
Name: count, dtype: int64

In [36]:
# now display dataset
dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,no,0,1,2,yes,furnished
1,12250000,8960,4,4,4,1,0,no,0,1,3,no,furnished
2,12250000,9960,3,2,2,1,0,yes,0,0,2,yes,semi-furnished
3,12215000,7500,4,2,2,1,0,yes,0,1,3,yes,furnished
4,11410000,7420,4,1,2,1,1,yes,0,1,2,no,furnished


In [37]:
# find unique values in basement
dataset['basement'].value_counts()

basement
no     354
yes    191
Name: count, dtype: int64

In [38]:
# now apply label encoding
dataset.basement = label_encoder.fit_transform(dataset.basement)
dataset.basement

0      0
1      0
2      1
3      1
4      1
      ..
540    1
541    0
542    0
543    0
544    0
Name: basement, Length: 545, dtype: int32

In [39]:
# now find unique values
dataset.basement.value_counts()

basement
0    354
1    191
Name: count, dtype: int64

In [41]:
# unique values in prefarea
dataset.prefarea.value_counts()

prefarea
no     417
yes    128
Name: count, dtype: int64

In [42]:
# apply label encoding
dataset['prefarea'] = label_encoder.fit_transform(dataset.prefarea)
dataset.prefarea

0      1
1      0
2      1
3      1
4      0
      ..
540    0
541    0
542    0
543    0
544    0
Name: prefarea, Length: 545, dtype: int32

In [43]:
# now again find unique values
dataset.prefarea.value_counts()

prefarea
0    417
1    128
Name: count, dtype: int64

In [44]:
'''
no: 0
yes: 1
'''

'\nno: 0\nyes: 1\n'

In [45]:
# furnishing status
dataset.furnishingstatus.value_counts()

furnishingstatus
semi-furnished    227
unfurnished       178
furnished         140
Name: count, dtype: int64

In [46]:
# apply label encoding
dataset.furnishingstatus = label_encoder.fit_transform(dataset['furnishingstatus'])
dataset['furnishingstatus']

0      0
1      0
2      1
3      0
4      0
      ..
540    2
541    1
542    2
543    0
544    2
Name: furnishingstatus, Length: 545, dtype: int32

In [47]:
# now again find unique values
dataset.furnishingstatus.value_counts()

furnishingstatus
1    227
2    178
0    140
Name: count, dtype: int64

In [48]:
'''
semi-furnished: 1
unfurnished: 2
furnished: 0
'''

'\nsemi-furnished: 1\nunfurnished: 2\nfurnished: 0\n'

In [49]:
# now show final version of dataset
dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0


In [50]:
# now again find info
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   price             545 non-null    int64
 1   area              545 non-null    int64
 2   bedrooms          545 non-null    int64
 3   bathrooms         545 non-null    int64
 4   stories           545 non-null    int64
 5   mainroad          545 non-null    int32
 6   guestroom         545 non-null    int32
 7   basement          545 non-null    int32
 8   hotwaterheating   545 non-null    int32
 9   airconditioning   545 non-null    int32
 10  parking           545 non-null    int64
 11  prefarea          545 non-null    int32
 12  furnishingstatus  545 non-null    int32
dtypes: int32(7), int64(6)
memory usage: 40.6 KB


In [53]:
# find correlation with respect to price
cor = dataset.corr()['price']
cor

price               1.000000
area                0.535997
bedrooms            0.366494
bathrooms           0.517545
stories             0.420712
mainroad            0.296898
guestroom           0.255517
basement            0.187057
hotwaterheating     0.093073
airconditioning     0.452954
parking             0.384394
prefarea            0.329777
furnishingstatus   -0.304721
Name: price, dtype: float64

In [55]:
cor.index

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [56]:
cor.values

array([ 1.        ,  0.53599735,  0.36649403,  0.51754534,  0.42071237,
        0.29689849,  0.25551729,  0.1870566 ,  0.09307284,  0.45295408,
        0.38439365,  0.32977705, -0.30472146])

In [64]:
cor[((cor.values > 0.2) | (cor.values < -0.2))]

price               1.000000
area                0.535997
bedrooms            0.366494
bathrooms           0.517545
stories             0.420712
mainroad            0.296898
guestroom           0.255517
airconditioning     0.452954
parking             0.384394
prefarea            0.329777
furnishingstatus   -0.304721
Name: price, dtype: float64

In [67]:
cor.index

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [75]:
# divide data into dependent and independent variables
x = dataset[['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']]
y = dataset['price']
x.head(2)

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7420,4,2,3,1,0,0,0,1,2,1,0
1,8960,4,4,4,1,0,0,0,1,3,0,0


In [104]:
# apply standard scaling
scaler = StandardScaler()

x = scaler.fit_transform(x)

In [105]:
# length of x and y
print(len(x))
print(len(y))

545
545


In [106]:
0.8*len(x)

436.0

In [108]:
# split x dataset into training and testing
x_train = x[ : 436, : ]
x_test = x[436: ,  : ]

x_test

array([[-1.38622441,  0.04727831, -0.57018671, ..., -0.80574124,
         1.80494113, -1.40628573],
       [ 0.33644305,  0.04727831, -0.57018671, ...,  0.35597563,
        -0.55403469,  1.22296203],
       [-0.30004453, -1.30886273, -0.57018671, ..., -0.80574124,
        -0.55403469, -0.09166185],
       ...,
       [-0.70592066, -1.30886273, -0.57018671, ..., -0.80574124,
        -0.55403469,  1.22296203],
       [-1.03338891,  0.04727831, -0.57018671, ..., -0.80574124,
        -0.55403469, -1.40628573],
       [-0.5998394 ,  0.04727831, -0.57018671, ..., -0.80574124,
        -0.55403469,  1.22296203]])

In [87]:
# split y dataset into training and testing
y_train = y.iloc[ : 436]
y_test = y.iloc[436: ]
y_test

436    3290000
437    3290000
438    3255000
439    3255000
440    3234000
        ...   
540    1820000
541    1767150
542    1750000
543    1750000
544    1750000
Name: price, Length: 109, dtype: int64

## Apply Linear Regression

In [96]:
# prepare and train model
model = LinearRegression()

# fit the data
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
len(y_pred)

109

In [97]:
len(y_test)

109

In [99]:
# find r2 score
r2_score(y_test, y_pred)

-5.271733174190881

In [101]:
mean_absolute_error(y_test, y_pred)

829066.0883222377