In [1]:
# Clear all user-defined names from the kernel namespace; -f skips the confirmation prompt.
# Everything used below is therefore (re)defined by the cells that follow.
%reset -f

# load the Python packages needed for the regressions

In [2]:
import numpy                 as np
import matplotlib.pyplot     as plt
import pandas                as pd
import statsmodels.api       as sm
import statsmodels.stats.api as sms

from statsmodels.iolib.summary2 import summary_col
from statsmodels.compat         import lzip

# now we load the data into Python

In [3]:
# Read the dataset from a path relative to the current working directory,
# so hprice1.csv is expected to sit next to this notebook.
df = pd.read_csv("hprice1.csv")

# add a constant (intercept) column to the dataset

In [4]:
# Adds a column of ones named 'const' (it shows up as the first column below);
# the regressions select it explicitly so that each model includes an intercept.
# NOTE: this rebinds df, so from here on df is the frame *with* the constant column.
df = sm.add_constant(df)

# to make sure the data is loaded correctly, check all the variable names in your file

In [5]:
# Expect 'const' first, then the level variables and their log counterparts (l-prefixed).
df.columns

Index(['const', 'price', 'assess', 'bdrms', 'lotsize', 'sqrft', 'colonial',
       'lprice', 'lassess', 'llotsize', 'lsqrft'],
      dtype='object')

# to make sure the data is loaded correctly, check the summary statistics of each variable

In [6]:
# Summary statistics per column; a count equal to the number of rows in every column
# means there are no missing values, and 'const' should have mean 1 and std 0.
df.describe()

Unnamed: 0,const,price,assess,bdrms,lotsize,sqrft,colonial,lprice,lassess,llotsize,lsqrft
count,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0
mean,1.0,293.546034,315.736362,3.568182,9019.863636,2013.693182,0.693182,5.63318,5.717994,8.905104,7.57261
std,0.0,102.713445,95.314435,0.841393,10174.150414,577.191583,0.463816,0.303573,0.262113,0.54406,0.258688
min,1.0,111.0,198.7,2.0,1000.0,1171.0,0.0,4.70953,5.291796,6.907755,7.065613
25%,1.0,230.0,253.900002,3.0,5732.75,1660.5,0.0,5.438079,5.53694,8.653908,7.414873
50%,1.0,265.5,290.199995,3.0,6430.0,1845.0,1.0,5.581613,5.670567,8.768719,7.520231
75%,1.0,326.25,352.125,4.0,8583.25,2227.0,1.0,5.787642,5.863982,9.057567,7.708266
max,1.0,725.0,708.59998,7.0,92681.0,3880.0,1.0,6.586172,6.563291,11.436919,8.263591


# all good, now we can compute regression models

---

# Q1.a

# $$ \mathrm{price} = \beta_0 + \beta_1 \cdot \mathrm{bdrms} + \beta_2 \cdot \mathrm{lotsize} + \beta_3 \cdot \mathrm{sqrft} + u $$

In [8]:
# Regressors for Q1.a: the intercept column plus bdrms, lotsize and sqrft (all in levels)
x_a = df.loc[:, ['const', 'bdrms', 'lotsize', 'sqrft']]
# Response for Q1.a: price in levels, kept as a one-column DataFrame
y_a = df.loc[:, ['price']]

# Set up the OLS problem: endog is the response, exog is the regressor matrix
M1a = sm.OLS(endog=y_a, exog=x_a)

# Estimate the coefficients by least squares
M1a_result = M1a.fit()

# Show the full regression table
print(M1a_result.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.672
Model:                            OLS   Adj. R-squared:                  0.661
Method:                 Least Squares   F-statistic:                     57.46
Date:                Tue, 05 Oct 2021   Prob (F-statistic):           2.70e-20
Time:                        15:43:38   Log-Likelihood:                -482.88
No. Observations:                  88   AIC:                             973.8
Df Residuals:                      84   BIC:                             983.7
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -21.7703     29.475     -0.739      0.4

---

# Q1.b

# $$ \ln \left( \mathrm{price} \right) = \beta_0 + \beta_1 \cdot \mathrm{bdrms} + \beta_2 \cdot \ln \left( \mathrm{lotsize} \right) + \beta_3 \cdot \ln \left( \mathrm{sqrft} \right) + u $$

In [9]:
# Regressors for Q1.b: the intercept column, bdrms, and the logged lotsize / sqrft columns
x_b = df.loc[:, ['const', 'bdrms', 'llotsize', 'lsqrft']]
# Response for Q1.b: the logged price column, kept as a one-column DataFrame
y_b = df.loc[:, ['lprice']]

# Set up the OLS problem: endog is the response, exog is the regressor matrix
M1b = sm.OLS(endog=y_b, exog=x_b)

# Estimate the coefficients by least squares
M1b_result = M1b.fit()

# Show the full regression table
print(M1b_result.summary())

                            OLS Regression Results                            
Dep. Variable:                 lprice   R-squared:                       0.643
Model:                            OLS   Adj. R-squared:                  0.630
Method:                 Least Squares   F-statistic:                     50.42
Date:                Tue, 05 Oct 2021   Prob (F-statistic):           9.74e-19
Time:                        15:44:03   Log-Likelihood:                 25.861
No. Observations:                  88   AIC:                            -43.72
Df Residuals:                      84   BIC:                            -33.81
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2970      0.651     -1.992      0.0

---

# Q1.c

# $$ \ln \left( \mathrm{price} \right) = \beta_0 + \beta_1 \cdot \mathrm{bdrms} + \beta_2 \cdot \ln \left( \mathrm{lotsize} \right) + \beta_3 \cdot \ln \left( \mathrm{sqrft} \right) + \beta_4 \cdot \ln \left( \mathrm{assess} \right) + u $$

In [11]:
# Regressors for Q1.c: the Q1.b regressors plus the logged assessed value (lassess)
x_c = df.loc[:, ['const', 'bdrms', 'llotsize', 'lsqrft', 'lassess']]
# Response for Q1.c: the logged price column, kept as a one-column DataFrame
y_c = df.loc[:, ['lprice']]

# Set up the OLS problem: endog is the response, exog is the regressor matrix
M1c = sm.OLS(endog=y_c, exog=x_c)

# Estimate the coefficients by least squares
M1c_result = M1c.fit()

# Show the full regression table
print(M1c_result.summary())

                            OLS Regression Results                            
Dep. Variable:                 lprice   R-squared:                       0.773
Model:                            OLS   Adj. R-squared:                  0.762
Method:                 Least Squares   F-statistic:                     70.58
Date:                Tue, 05 Oct 2021   Prob (F-statistic):           6.45e-26
Time:                        15:44:33   Log-Likelihood:                 45.750
No. Observations:                  88   AIC:                            -81.50
Df Residuals:                      83   BIC:                            -69.11
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2637      0.570      0.463      0.6

---

# compare two models in the same regression table

In [13]:
# Side-by-side table of the two log-price models: coefficients with significance stars,
# standard errors in parentheses, and the number of observations as an extra row.
summary = summary_col(
    [M1b_result, M1c_result],
    float_format='%0.4f',
    stars=True,
    model_names=['Model Q1(b)', 'Model Q1(c)'],
    info_dict={'sample size': lambda res: "{0:d}".format(int(res.nobs))},
)
print(summary)


               Model Q1(b) Model Q1(c)
--------------------------------------
R-squared      0.6430      0.7728     
R-squared Adj. 0.6302      0.7619     
bdrms          0.0370      0.0338     
               (0.0275)    (0.0221)   
const          -1.2970**   0.2637     
               (0.6513)    (0.5697)   
lassess                    1.0431***  
                           (0.1514)   
llotsize       0.1680***   0.0074     
               (0.0383)    (0.0386)   
lsqrft         0.7002***   -0.1032    
               (0.0929)    (0.1384)   
sample size    88          88         
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


---

# the changes in the coefficients after including $\mathrm{lassess}$ are due to the sample correlation between $\mathrm{lassess}$ and the other X variables
# the larger the absolute value of the correlation, the larger the change
# you can check the sample correlation as follows

In [16]:
# Pairwise sample correlations (Pearson, the pandas default) between lassess and the
# regressors already in model Q1(b); read the 'lassess' row/column against the
# coefficient changes in the comparison table.
df[['lassess', 'bdrms', 'llotsize', 'lsqrft']].corr()

Unnamed: 0,lassess,bdrms,llotsize,lsqrft
lassess,1.0,0.458744,0.557735,0.864664
bdrms,0.458744,1.0,0.16949,0.519579
llotsize,0.557735,0.16949,1.0,0.311299
lsqrft,0.864664,0.519579,0.311299,1.0


# export the notebook to an HTML file

In [18]:
!rm -rf T7_Python.html
!jupyter nbconvert --to html T7_Python.ipynb

[NbConvertApp] Converting notebook T7_Python.ipynb to html
[NbConvertApp] Writing 607187 bytes to T7_Python.html
