# install all packages we need

In [2]:
# Wipe the workspace so the notebook starts from a clean state.
rm(list = ls(all.names = FALSE))

# Bring every installed package up to date without prompting for confirmation.
update.packages(ask = FALSE)

# Install any packages from `pkg` that are missing, then attach all of them.
#
# Args:
#   pkg: character vector of package names.
#
# Returns:
#   A logical vector named by `pkg`, with one element per package: TRUE when
#   the package was attached successfully, FALSE otherwise. An empty `pkg`
#   yields an empty logical vector.
ipak <- function(pkg) {
  # Requested packages that are not yet present in the library.
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg) > 0) {
    install.packages(new.pkg, dependencies = TRUE)
  }
  # vapply() always returns a logical vector, whereas sapply() falls back to a
  # list for empty input. require() is used on purpose here: its TRUE/FALSE
  # result is the per-package load status that this function reports.
  vapply(pkg, require, logical(1), character.only = TRUE)
}

# Packages this notebook relies on for reading data, regression and reporting.
packages <- c(
  "AER", "stargazer", "readr", "dplyr",
  "car", "Hmisc", "ggplot2", "lmtest"
)

# Install whatever is missing and attach everything; prints the load status.
ipak(packages)

# load all R regression packages

In [3]:
library(readr)
library(dplyr)
library(car)
library(Hmisc)
library(AER)
library(stargazer)
library(ggplot2)
library(lmtest)

# now we load the data into R

In [5]:
# Read the CSV from the working directory into a tibble named `data`.
data <- readr::read_csv(file = "hprice1.csv")

[1m[1mRows: [1m[22m[34m[34m88[34m[39m [1m[1mColumns: [1m[22m[34m[34m10[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m (10): price, assess, bdrms, lotsize, sqrft, colonial, lprice, lassess, l...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.



# to make sure the data is loaded correctly, check all the variable names in your file

In [23]:
# List the column names of the loaded data frame.
names(data)

# to make sure the data is loaded correctly, check the summary statistics of each variable

In [6]:
# Print per-column summary statistics (min, quartiles, median, mean, max) for `data`.
summary(data)

     price           assess          bdrms          lotsize          sqrft     
 Min.   :111.0   Min.   :198.7   Min.   :2.000   Min.   : 1000   Min.   :1171  
 1st Qu.:230.0   1st Qu.:253.9   1st Qu.:3.000   1st Qu.: 5733   1st Qu.:1660  
 Median :265.5   Median :290.2   Median :3.000   Median : 6430   Median :1845  
 Mean   :293.5   Mean   :315.7   Mean   :3.568   Mean   : 9020   Mean   :2014  
 3rd Qu.:326.2   3rd Qu.:352.1   3rd Qu.:4.000   3rd Qu.: 8583   3rd Qu.:2227  
 Max.   :725.0   Max.   :708.6   Max.   :7.000   Max.   :92681   Max.   :3880  
    colonial          lprice         lassess         llotsize     
 Min.   :0.0000   Min.   :4.710   Min.   :5.292   Min.   : 6.908  
 1st Qu.:0.0000   1st Qu.:5.438   1st Qu.:5.537   1st Qu.: 8.654  
 Median :1.0000   Median :5.582   Median :5.671   Median : 8.769  
 Mean   :0.6932   Mean   :5.633   Mean   :5.718   Mean   : 8.905  
 3rd Qu.:1.0000   3rd Qu.:5.788   3rd Qu.:5.864   3rd Qu.: 9.058  
 Max.   :1.0000   Max.   :6.586   Max.

# all good, now we can compute regression models

---

# Q1.a

# $$ \mathrm{price} = \beta_0 + \beta_1 \cdot \mathrm{bdrms} + \beta_2 \cdot \mathrm{lotsize} + \beta_3 \cdot \mathrm{sqrft} + u $$

In [7]:
# Fit the level-level model by OLS: price on bdrms, lotsize and sqrft.
M1_a <- lm(
  formula = price ~ bdrms + lotsize + sqrft,
  data = data
)

# Print the coefficient table and fit statistics.
summary(M1_a)


Call:
lm(formula = price ~ bdrms + lotsize + sqrft, data = data)

Residuals:
     Min       1Q   Median       3Q      Max 
-120.026  -38.530   -6.555   32.323  209.376 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2.177e+01  2.948e+01  -0.739  0.46221    
bdrms        1.385e+01  9.010e+00   1.537  0.12795    
lotsize      2.068e-03  6.421e-04   3.220  0.00182 ** 
sqrft        1.228e-01  1.324e-02   9.275 1.66e-14 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 59.83 on 84 degrees of freedom
Multiple R-squared:  0.6724,	Adjusted R-squared:  0.6607 
F-statistic: 57.46 on 3 and 84 DF,  p-value: < 2.2e-16


# the default R table is hard to read, so let's report the results in a nicer table using the "stargazer" package

In [13]:
# Render M1_a as a plain-text regression table.
stargazer(
  M1_a,
  type = "text",                          # console-friendly text output
  header = FALSE,
  digits = 4,                             # four decimal places
  intercept.bottom = TRUE,                # list the constant after the regressors
  model.names = FALSE,
  dep.var.caption = "Y variable: price",
  omit.stat = c("LL", "ser", "f")         # leave these statistics out of the footer
)


                  Y variable: price     
             ---------------------------
                        price           
----------------------------------------
bdrms                  13.8525          
                      (9.0101)          
                                        
lotsize               0.0021***         
                      (0.0006)          
                                        
sqrft                 0.1228***         
                      (0.0132)          
                                        
Constant              -21.7703          
                      (29.4750)         
                                        
----------------------------------------
Observations             88             
R2                     0.6724           
Adjusted R2            0.6607           
Note:        *p<0.1; **p<0.05; ***p<0.01


---

# Q1.b

# $$ \ln \left( \mathrm{price} \right) = \beta_0 + \beta_1 \cdot \mathrm{bdrms} + \beta_2 \cdot \ln \left( \mathrm{lotsize} \right) + \beta_3 \cdot \ln \left( \mathrm{sqrft} \right) + u $$

In [14]:
# Fit the log-log model by OLS: lprice on bdrms, llotsize and lsqrft.
M1_b <- lm(
  formula = lprice ~ bdrms + llotsize + lsqrft,
  data = data
)

# Print the coefficient table and fit statistics.
summary(M1_b)


Call:
lm(formula = lprice ~ bdrms + llotsize + lsqrft, data = data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.68422 -0.09178 -0.01584  0.11213  0.66899 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.29704    0.65128  -1.992   0.0497 *  
bdrms        0.03696    0.02753   1.342   0.1831    
llotsize     0.16797    0.03828   4.388 3.31e-05 ***
lsqrft       0.70023    0.09287   7.540 5.01e-11 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.1846 on 84 degrees of freedom
Multiple R-squared:  0.643,	Adjusted R-squared:  0.6302 
F-statistic: 50.42 on 3 and 84 DF,  p-value: < 2.2e-16


# the default R table is hard to read, so let's report the results in a nicer table using the "stargazer" package

In [16]:
# Render M1_b as a plain-text regression table.
stargazer(
  M1_b,
  type = "text",                          # console-friendly text output
  header = FALSE,
  digits = 4,                             # four decimal places
  intercept.bottom = TRUE,                # list the constant after the regressors
  model.names = FALSE,
  dep.var.caption = "Y variable: lprice",
  omit.stat = c("LL", "ser", "f")         # leave these statistics out of the footer
)


                 Y variable: lprice     
             ---------------------------
                       lprice           
----------------------------------------
bdrms                  0.0370           
                      (0.0275)          
                                        
llotsize              0.1680***         
                      (0.0383)          
                                        
lsqrft                0.7002***         
                      (0.0929)          
                                        
Constant              -1.2970**         
                      (0.6513)          
                                        
----------------------------------------
Observations             88             
R2                     0.6430           
Adjusted R2            0.6302           
Note:        *p<0.1; **p<0.05; ***p<0.01


---

# Q1.c

# $$ \ln \left( \mathrm{price} \right) = \beta_0 + \beta_1 \cdot \mathrm{bdrms} + \beta_2 \cdot \ln \left( \mathrm{lotsize} \right) + \beta_3 \cdot \ln \left( \mathrm{sqrft} \right) + \beta_4 \cdot \ln \left( \mathrm{assess} \right) + u $$

In [17]:
# Fit the log-log model extended with lassess as an additional regressor.
M1_c <- lm(
  formula = lprice ~ bdrms + llotsize + lsqrft + lassess,
  data = data
)

# Print the coefficient table and fit statistics.
summary(M1_c)


Call:
lm(formula = lprice ~ bdrms + llotsize + lsqrft + lassess, data = data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.53337 -0.06333  0.00686  0.07836  0.60825 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.263743   0.569665   0.463    0.645    
bdrms        0.033839   0.022098   1.531    0.129    
llotsize     0.007438   0.038561   0.193    0.848    
lsqrft      -0.103238   0.138430  -0.746    0.458    
lassess      1.043065   0.151446   6.887 1.01e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.1481 on 83 degrees of freedom
Multiple R-squared:  0.7728,	Adjusted R-squared:  0.7619 
F-statistic: 70.58 on 4 and 83 DF,  p-value: < 2.2e-16


# the default R table is hard to read, so let's report the results in a nicer table using the "stargazer" package

In [18]:
# Render M1_c as a plain-text regression table.
stargazer(
  M1_c,
  type = "text",                          # console-friendly text output
  header = FALSE,
  digits = 4,                             # four decimal places
  intercept.bottom = TRUE,                # list the constant after the regressors
  model.names = FALSE,
  dep.var.caption = "Y variable: lprice",
  omit.stat = c("LL", "ser", "f")         # leave these statistics out of the footer
)


                 Y variable: lprice     
             ---------------------------
                       lprice           
----------------------------------------
bdrms                  0.0338           
                      (0.0221)          
                                        
llotsize               0.0074           
                      (0.0386)          
                                        
lsqrft                 -0.1032          
                      (0.1384)          
                                        
lassess               1.0431***         
                      (0.1514)          
                                        
Constant               0.2637           
                      (0.5697)          
                                        
----------------------------------------
Observations             88             
R2                     0.7728           
Adjusted R2            0.7619           
Note:        *p<0.1; **p<0.05; ***p<0.01


---

# compare two models in the same regression table

In [22]:
# Show M1_b and M1_c side by side in one plain-text regression table.
# Both models use lprice as the dependent variable, so the caption names
# lprice (the earlier "lwage" caption did not match either model).
stargazer(M1_b, M1_c,
    header=FALSE, 
    type="text",  
    digits=4, 
    intercept.bottom=TRUE, 
    dep.var.caption="Dependent variable: lprice",
    model.names=FALSE,
    omit.stat=c("LL","ser","f"))


              Dependent variable: lwage  
             ----------------------------
                        lprice           
                  (1)            (2)     
-----------------------------------------
bdrms            0.0370        0.0338    
                (0.0275)      (0.0221)   
                                         
llotsize       0.1680***       0.0074    
                (0.0383)      (0.0386)   
                                         
lsqrft         0.7002***       -0.1032   
                (0.0929)      (0.1384)   
                                         
lassess                       1.0431***  
                              (0.1514)   
                                         
Constant       -1.2970**       0.2637    
                (0.6513)      (0.5697)   
                                         
-----------------------------------------
Observations       88            88      
R2               0.6430        0.7728    
Adjusted R2      0.6302        0.7619    

---

# the changes in the estimates after including $\mathrm{lassess}$ are due to the sample correlation between $\mathrm{lassess}$ and the other X variables
# the larger the absolute value of the correlation, the larger the change
# you can check the sample correlation as follows

In [39]:
# Sample correlation matrix of lassess with the other regressors of M1_c.
# Columns are selected by name rather than by position so the result does not
# depend on the column order of the CSV file.
cor(data[c("lassess", "bdrms", "llotsize", "lsqrft")])

Unnamed: 0,lassess,bdrms,llotsize,lsqrft
lassess,1.0,0.4587439,0.5577346,0.8646643
bdrms,0.4587439,1.0,0.1694903,0.5195793
llotsize,0.5577346,0.1694903,1.0,0.3112993
lsqrft,0.8646643,0.5195793,0.3112993,1.0
