In [4]:
#Importing relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_regression

In [5]:
#loading data
raw_data = pd.read_csv('Data/real_estate_price_size_year_view.csv')

In [6]:
data = raw_data.copy()
data.head()

Unnamed: 0,price,size,year,view
0,234314.144,643.09,2015,No sea view
1,228581.528,656.22,2009,No sea view
2,281626.336,487.29,2018,Sea view
3,401255.608,1504.75,2015,No sea view
4,458674.256,1275.46,2009,Sea view


In [7]:
#creating dummy variables
data['view']=data['view'].map({'No sea view':0,'Sea view':1})
data.head()

Unnamed: 0,price,size,year,view
0,234314.144,643.09,2015,0
1,228581.528,656.22,2009,0
2,281626.336,487.29,2018,1
3,401255.608,1504.75,2015,0
4,458674.256,1275.46,2009,1


In [8]:
#selecting independent variables
x = data[['size','year','view']]
x.head()

Unnamed: 0,size,year,view
0,643.09,2015,0
1,656.22,2009,0
2,487.29,2018,1
3,1504.75,2015,0
4,1275.46,2009,1


In [9]:
#target variable
y = data['price']
y.head()

0    234314.144
1    228581.528
2    281626.336
3    401255.608
4    458674.256
Name: price, dtype: float64

In [10]:
#regression
reg =LinearRegression()

In [11]:
reg.fit(x,y)

LinearRegression()

In [12]:
#Coefficients and intercept
reg.coef_, reg.intercept_

(array([  223.03161944,  2718.94888864, 56726.01979839]), -5397914.181560063)

In [13]:
#R-squared
reg.score(x,y)

0.9128639058979645

In [14]:
#F-statistics
f_regression(x,y)

(array([285.92105192,   0.85525799,  20.25908753]),
 array([8.12763222e-31, 3.57340758e-01, 1.86445030e-05]))

In [15]:
#f stats and pvales extractions
f_statistics = f_regression(x,y)[0]
p_value = f_regression(x,y)[1]
f_statistics,p_value

(array([285.92105192,   0.85525799,  20.25908753]),
 array([8.12763222e-31, 3.57340758e-01, 1.86445030e-05]))

In [16]:
#regression summary dataframe
reg_summary = pd.DataFrame([['Size'],['Year'],['View']], columns = ['Features'])

In [17]:
reg_summary

Unnamed: 0,Features
0,Size
1,Year
2,View


In [18]:
#variable coef.
reg_summary['Coefficients'] = reg.coef_
reg_summary

Unnamed: 0,Features,Coefficients
0,Size,223.031619
1,Year,2718.948889
2,View,56726.019798


In [19]:
#pvalues
reg_summary['P-Value'] = p_value
reg_summary

Unnamed: 0,Features,Coefficients,P-Value
0,Size,223.031619,8.127632000000001e-31
1,Year,2718.948889,0.3573408
2,View,56726.019798,1.86445e-05


In [20]:
reg_summary['F-statistic'] = f_statistics
reg_summary

Unnamed: 0,Features,Coefficients,P-Value,F-statistic
0,Size,223.031619,8.127632000000001e-31,285.921052
1,Year,2718.948889,0.3573408,0.855258
2,View,56726.019798,1.86445e-05,20.259088


In [21]:
#qdding r squared
reg_summary['R-squared'] = [reg.score(x,y),0,0]
reg_summary

Unnamed: 0,Features,Coefficients,P-Value,F-statistic,R-squared
0,Size,223.031619,8.127632000000001e-31,285.921052,0.912864
1,Year,2718.948889,0.3573408,0.855258,0.0
2,View,56726.019798,1.86445e-05,20.259088,0.0


In [22]:
reg_summary = reg_summary.drop(['P-Value'], axis=1)

In [23]:
reg_summary

Unnamed: 0,Features,Coefficients,F-statistic,R-squared
0,Size,223.031619,285.921052,0.912864
1,Year,2718.948889,0.855258,0.0
2,View,56726.019798,20.259088,0.0


In [24]:
reg_summary['P-Value'] = p_value.round(3)
reg_summary

Unnamed: 0,Features,Coefficients,F-statistic,R-squared,P-Value
0,Size,223.031619,285.921052,0.912864,0.0
1,Year,2718.948889,0.855258,0.0,0.357
2,View,56726.019798,20.259088,0.0,0.0


In [25]:
reg_summary['Intercept'] = [reg.intercept_,0,0]

In [26]:
reg_summary

Unnamed: 0,Features,Coefficients,F-statistic,R-squared,P-Value,Intercept
0,Size,223.031619,285.921052,0.912864,0.0,-5397914.0
1,Year,2718.948889,0.855258,0.0,0.357,0.0
2,View,56726.019798,20.259088,0.0,0.0,0.0
