In [16]:
# Import the relevant libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression

In [17]:
# Load the data
# Read the real-estate sample into a DataFrame. The path is relative, so the
# CSV must be in the current working directory when this cell runs.

data = pd.read_csv('real_estate_price_size_year.csv')

In [18]:
# Preview the first rows to confirm the price, size and year columns loaded as expected
data.head()

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009


In [19]:
# Summary statistics per numeric column: count, mean, std, min, quartiles and max
data.describe()

Unnamed: 0,price,size,year
count,100.0,100.0,100.0
mean,292289.47016,853.0242,2012.6
std,77051.727525,297.941951,4.729021
min,154282.128,479.75,2006.0
25%,234280.148,643.33,2009.0
50%,280590.716,696.405,2015.0
75%,335723.696,1029.3225,2018.0
max,500681.128,1842.51,2018.0


In [20]:
# Create the regression

# Declare the dependent and independent variables:
#   x - the two predictors (size, year), kept as a two-column DataFrame
#   y - the target (price), selected as a single column

x = data[['size', 'year']]
y = data['price']

In [21]:
# Regression
# Fit a linear model of price on size and year.

reg = LinearRegression() # reg is an instance of the LinearRegression class
reg.fit(x,y) # estimate the intercept and coefficients from x and y

In [22]:
# Find the intercept
# (the fitted constant term, i.e. the model's value when size and year are both 0)

reg.intercept_

-5772267.017463276

In [23]:
# Find the coefficients
# One coefficient per column of x, in the same order as the columns: [size, year]

reg.coef_

array([ 227.70085401, 2916.78532684])

In [24]:
# Calculate the R-squared (a measure of the goodness of fit of your model)
# R-squared is the variability explained by the regression divided by the total variability of the dataset.
# In fields such as chemistry and physics, R2 is typically ~0.7 - 0.99. In the social sciences (e.g. economics),
# a much lower R2 can still be acceptable, depending on the number of variables.
# Total variability (SST) = Variability explained by the regression (SSR) + Unexplained variability (SSE)
# Note: the score is computed on the same x and y the model was fitted on, so it is an in-sample R-squared.

reg.score(x,y)

0.7764803683276791

In [25]:
# Size and year together explain 77.6% of the variability in apartment prices in our sample

In [30]:
# Calculate the Adjusted R-squared (penalizes the excessive use of variables)
# Adjusted R-squared:
# R2adj = 1 - (1 - R2) * (n - 1) / (n - p - 1), where n = number of observations and p = number of predictors
# Change this cell to Markdown and write the formula in LaTeX for a nice display in Jupyter

$R^2_{adj.} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

In [29]:
# (rows, columns) of x: rows are the observations, columns are the predictors
x.shape

(100, 2)

In [35]:
# Adjusted R-squared: scale the unexplained share (1 - R^2) by (n - 1) / (n - p - 1),
# where n is the number of observations (rows of x) and p the number of predictors (columns of x).
n, p = x.shape
r2 = reg.score(x, y)

adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
adjusted_r2

0.7718717161282498

In [None]:
# Compare the R-squared and the Adjusted R-squared


In [32]:
# Making predictions
# Question: what price does the model give a 750 sq.ft. apartment from 2009?
# Build a one-row frame whose columns match the features the model was fitted on (size, year).

new_data = pd.DataFrame({'size': [750], 'year': [2009]})
new_data

Unnamed: 0,size,year
0,750,2009


In [33]:
# Predicted price for each row of new_data (one value per row)
reg.predict(new_data)

array([258330.34465995])

In [None]:
# The predicted price of a 750 sq.ft. apartment from 2009 is $258,330