In [1]:

# importing requirements

import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 12

In [2]:
# Load the real-estate dataset from the CSV that sits next to the notebook.
house_df = pd.read_csv(filepath_or_buffer='Real estate.csv')

In [3]:
# Display the frame as loaded from the CSV (the original column headers are still in place).
house_df

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.59470,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.98450,5,24.98746,121.54391,47.3
3,4,2013.500,13.3,561.98450,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...,...
409,410,2013.000,13.7,4082.01500,0,24.94155,121.50381,15.4
410,411,2012.667,5.6,90.45606,9,24.97433,121.54310,50.0
411,412,2013.250,18.8,390.96960,7,24.97923,121.53986,40.6
412,413,2013.000,8.1,104.81010,5,24.96674,121.54067,52.5


## Data Cleaning

In [4]:
# The transaction date is stored as a fractional year (e.g. 2012.917).
# Casting the column to int drops the fractional part so only the year remains.
house_df = house_df.astype({'X1 transaction date': int})

In [5]:
# Display the frame again to check the transaction-date column after the integer cast.
house_df

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012,19.5,306.59470,9,24.98034,121.53951,42.2
2,3,2013,13.3,561.98450,5,24.98746,121.54391,47.3
3,4,2013,13.3,561.98450,5,24.98746,121.54391,54.8
4,5,2012,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...,...
409,410,2013,13.7,4082.01500,0,24.94155,121.50381,15.4
410,411,2012,5.6,90.45606,9,24.97433,121.54310,50.0
411,412,2013,18.8,390.96960,7,24.97923,121.53986,40.6
412,413,2013,8.1,104.81010,5,24.96674,121.54067,52.5


In [6]:
# Changing the column names: replace the verbose 'X<n> ...' / 'Y ...' headers
# with short snake_case names. A single mapping and one rename call replace
# seven separate in-place passes over the frame; columns that are not in the
# mapping (e.g. 'No') are left untouched.
column_renames = {
    'X1 transaction date': 'transaction_year',
    'X2 house age': 'house_age',
    'X3 distance to the nearest MRT station': 'nearest_mrt_station',
    'X4 number of convenience stores': 'no_of_stores_nearby',
    'X5 latitude': 'latitude',
    'X6 longitude': 'longitude',
    'Y house price of unit area': 'house_price_of_unit_area',
}
house_df = house_df.rename(columns=column_renames)

In [7]:
# Store house age as whole years; the integer cast drops the fractional part (19.5 -> 19).
house_df = house_df.astype({'house_age': int})

In [8]:
# Display the frame with the renamed columns and integer house_age.
house_df

Unnamed: 0,No,transaction_year,house_age,nearest_mrt_station,no_of_stores_nearby,latitude,longitude,house_price_of_unit_area
0,1,2012,32,84.87882,10,24.98298,121.54024,37.9
1,2,2012,19,306.59470,9,24.98034,121.53951,42.2
2,3,2013,13,561.98450,5,24.98746,121.54391,47.3
3,4,2013,13,561.98450,5,24.98746,121.54391,54.8
4,5,2012,5,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...,...
409,410,2013,13,4082.01500,0,24.94155,121.50381,15.4
410,411,2012,5,90.45606,9,24.97433,121.54310,50.0
411,412,2013,18,390.96960,7,24.97923,121.53986,40.6
412,413,2013,8,104.81010,5,24.96674,121.54067,52.5


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column,
# used as a sanity check: all other data seems proper.
house_df.describe()

Unnamed: 0,No,transaction_year,house_age,nearest_mrt_station,no_of_stores_nearby,latitude,longitude,house_price_of_unit_area
count,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0
mean,207.5,2012.695652,17.292271,1083.885689,4.094203,24.96903,121.533361,37.980193
std,119.655756,0.460687,11.333769,1262.109595,2.945562,0.01241,0.015347,13.606488
min,1.0,2012.0,0.0,23.38284,0.0,24.93207,121.47353,7.6
25%,104.25,2012.0,9.0,289.3248,1.0,24.963,121.528085,27.7
50%,207.5,2013.0,16.0,492.2313,4.0,24.9711,121.53863,38.45
75%,310.75,2013.0,28.0,1454.279,6.0,24.977455,121.543305,46.6
max,414.0,2013.0,43.0,6488.021,10.0,25.01459,121.56627,117.5


## EDA Data Visualization
### 1. Effect of house_age on price

In [10]:
# Scatter of unit-area price against house age.
px.scatter(
    house_df,
    x='house_age',
    y='house_price_of_unit_area',
    title='Effect of house_age on price',
)

#### Conclusion : house_age clearly doesn't have any major effect on price

### 2. Effect of nearest_mrt_station on price

In [11]:
# Scatter of unit-area price against distance to the nearest MRT station.
px.scatter(
    house_df,
    x='nearest_mrt_station',
    y='house_price_of_unit_area',
    title='Effect of nearest_mrt_station on price',
)

#### Conclusion : Prices are higher for houses that have an MRT station nearby

### 3. Effect of no_of_stores_nearby on price

In [13]:
# Scatter of unit-area price against the number of nearby convenience stores.
px.scatter(
    house_df,
    x='no_of_stores_nearby',
    y='house_price_of_unit_area',
    title='Effect of no_of_stores_nearby on price',
)

#### Conclusion : The bulk of the prices shifts upward as the no. of shops near the house increases

### 4. Effect of latitude on price

In [14]:
# Scatter of unit-area price against latitude.
px.scatter(
    house_df,
    x='latitude',
    y='house_price_of_unit_area',
    title='Effect of latitude on price',
)

#### Conclusion : As latitude increases, price increases slightly

### 5. Effect of longitude on price

In [15]:
# Scatter of unit-area price against longitude.
px.scatter(
    house_df,
    x='longitude',
    y='house_price_of_unit_area',
    title='Effect of longitude on price',
)

#### Conclusion : As longitude increases, price increases slightly

### 6. Effect of year on price

In [None]:
# Bar chart of unit-area price grouped by transaction year; the title is set on
# the current pyplot figure before seaborn draws onto it.
plt.title(label='Effect of years on price')
sns.barplot(
    data=house_df,
    x='transaction_year',
    y='house_price_of_unit_area',
    color='slateblue',
)

#### Conclusion : Year doesn't have a noticeable effect on price

In [26]:
# Predictor columns fed to the model (transaction_year and No are left out)
# and the target column to be predicted.
feature_columns = [
    'house_age',
    'nearest_mrt_station',
    'no_of_stores_nearby',
    'latitude',
    'longitude',
]
target_column = 'house_price_of_unit_area'

X = house_df[feature_columns]
y = house_df[target_column]

In [27]:
# Hold out 10% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [28]:
# (rows, columns) of the held-out feature matrix.
X_test.shape

(42, 5)

### model training 

In [29]:
# Linear regression estimator, constructed with scikit-learn's default settings.
model = LinearRegression()

In [30]:
# Fit the regression on the training split only.
model.fit(X=X_train, y=y_train)

In [31]:
# Predicted unit-area prices for the held-out rows.
model.predict(X_test)

array([47.87043173, 41.85559913, 44.71261269, 41.46212663, 30.02969635,
       42.2110399 , 45.82434602, 45.53154117, 24.72866294, 52.0065668 ,
       31.9105569 , 34.82561007, 39.35008971, 24.78918294, 35.03545351,
       33.04943707, 41.37752125, 47.26545783, 31.32519673, 44.22527536,
        1.87256955, 33.63082397, 47.71291933, 43.54446892, 14.21001309,
       41.42002695, 14.69731377, 44.71261269, 35.64006088, 36.84492772,
       11.53445739, 39.22795105, 37.42861944, 28.41802485, 45.6622215 ,
       31.07329565, 52.29937164, 15.35522096, 46.18472163, 39.65003576,
       35.36664574, 40.23514214])

#### Conclusion: if a house is 1 year old, has an MRT station 193 m away, has 6 nearby stores, is at latitude 24 and longitude 121, and is situated in the southeast region, then:
#### Actual Price(per unit area) = $45.1
#### Predicted Price(per unit area) = $48.7

## Weights and variance

In [32]:
# Fitted weights, one per column of X.
model.coef_

array([-2.92804847e-01, -4.67399755e-03,  1.08330387e+00,  2.24277186e+02,
       -2.94315553e+01])

In [33]:
# Check the container type of the coefficients before wrapping them in a DataFrame.
type(model.coef_)

numpy.ndarray

In [35]:
# Tabulate the fitted weights, labelled b1..b5 in the order model.coef_ returns them.
coef_labels = [f'b{i}' for i in range(1, 6)]
df = pd.DataFrame({'Coefficient': model.coef_}, index=coef_labels)

In [36]:
# Display the coefficient table.
df

Unnamed: 0,Coefficient
b1,-0.292805
b2,-0.004674
b3,1.083304
b4,224.277186
b5,-29.431555


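Conclusion : The weights b3 and b4 are the only positive coefficients. b3 corresponds to the no. of stores nearby & b4 corresponds to latitude.
This suggests that the house price (per unit area) increases with the no. of stores nearby and with latitude. Note that the features were not standardized, so the raw coefficient magnitudes are not directly comparable: latitude varies by only about 0.01 across the data while the distance to the MRT station varies by thousands, so a large weight does not by itself mean a large influence on price.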

### Regression Metrics

In [38]:
# Pair the true held-out prices with the model's predictions for the metrics below.
y_actual, y_predicted = y_test, model.predict(X_test)

In [39]:
# Mean absolute error between actual and predicted prices.
mae = mean_absolute_error(y_actual, y_predicted)
print(f'MAE : ${mae}')

MAE : $6.104464553977398


In [40]:
# Root mean squared error: square root of the mean squared error.
mse = mean_squared_error(y_actual, y_predicted)
print(f'RMSE : ${np.sqrt(mse)}')

RMSE : $8.422293171673784


In [41]:
# Coefficient of determination on the held-out split.
r2 = r2_score(y_actual, y_predicted)
print(f'R2 Score : {r2}')

R2 Score : 0.6380644701298452
