In [35]:
library(tidyverse) ;
library(foreign) ; # stata data 
library(car) ; # companion to regressions
library(lmtest) ; # white test 
library(sandwich) ; # robust errors
library(MASS) ; # regression

# Model interpretation

In [4]:
ceosal1 <- read.dta("/Users/ama/OneDrive - usp.br/S3/Natalia/Lista 2/CEOSAL1.DTA")

“cannot read factor labels from Stata 5 files”


## CEOSAL1.RAW
The original model without the dummy variable reported the the `ros` variable as insignificant. Once we add the dummy variable `rosneg` to the model, both `ros` and `rosneg` appeared statistically significant.

In [5]:
# create the dummy variable
# where ros < 0 is 0
# and ros > 1 is 1

ceosal1 <- ceosal1 %>%
mutate(rosneg = ifelse(ros > 0, 1, 0))

head(ceosal1)

Unnamed: 0_level_0,salary,pcsalary,sales,roe,pcroe,ros,indus,finance,consprod,utility,lsalary,lsales,rosneg
Unnamed: 0_level_1,<int>,<int>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>
1,1095,20,27595.0,14.1,106.4,191,1,0,0,0,6.998509,10.225389,1
2,1001,32,9958.0,10.9,-30.6,13,1,0,0,0,6.908755,9.206132,1
3,1122,9,6125.9,23.5,-16.3,14,1,0,0,0,7.022868,8.720281,1
4,578,-9,16246.0,5.9,-25.7,-21,1,0,0,0,6.359574,9.695602,0
5,1368,7,21783.2,13.8,-3.0,56,1,0,0,0,7.221105,9.988894,1
6,1145,5,6021.4,20.0,1.0,55,1,0,0,0,7.04316,8.703075,1


In [6]:
# create new model with the dummy variable

ceosal_lm <- lm(log(salary) ~ log(sales) + roe + rosneg, data = ceosal1)

summary(ceosal_lm)


Call:
lm(formula = log(salary) ~ log(sales) + roe + rosneg, data = ceosal1)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.96372 -0.27290 -0.04733  0.21400  2.76903 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 4.071927   0.323716  12.579  < 2e-16 ***
log(sales)  0.288387   0.033617   8.579 2.37e-15 ***
roe         0.016657   0.003968   4.198 4.02e-05 ***
rosneg      0.225675   0.109338   2.064   0.0403 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4785 on 205 degrees of freedom
Multiple R-squared:  0.2966,	Adjusted R-squared:  0.2863 
F-statistic: 28.81 on 3 and 205 DF,  p-value: 1.375e-15


## LOANAPP.RAW

In [7]:
loanapp <- read.dta("/Users/ama/OneDrive - usp.br/S3/Natalia/Lista 2/loanapp.dta")

head(loanapp)

Unnamed: 0_level_0,occ,loanamt,action,msa,suffolk,appinc,typur,unit,married,dep,⋯,approve,mortno,mortperf,mortlat1,mortlat2,chist,multi,loanprc,thick,white
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>
1,1,89,1,1120,0,72,0,1,0,0,⋯,1,0,1,0,0,1,0,0.7542373,0,1
2,1,128,3,1120,0,74,0,1,1,1,⋯,0,0,1,0,0,1,0,0.8,1,1
3,1,128,1,1120,0,84,3,1,0,0,⋯,1,0,1,0,0,1,0,0.8951049,1,1
4,1,66,1,1120,0,36,0,1,1,0,⋯,1,0,1,0,0,0,0,0.6,0,1
5,1,120,1,1120,0,59,8,1,1,0,⋯,1,0,1,0,0,1,0,0.8955224,0,1
6,1,111,1,1120,0,63,9,1,0,0,⋯,1,0,1,0,0,0,0,0.8043478,0,1


### 
If there is discrimination against nonwhites, the sign of $\beta_1$ should be positive because it means they have a greater chance of getting approved for a loan.

### 

The coefficient of white in the model means that a loan application submitted by a white person in the sample was about 20% more likely than a nonwhite person to be approved. The small p-value (or large t statistic) indicate that this variable is very statistically significant. Moreover, 20% is a large percentual advantage, especially if considered on a national level, which we can do considering the sample size above 1000. 

In [8]:
# model loan approval with white or nonwhite variable
summary(
    
    lm(approve ~ white, 
       
       data = loanapp)
    
)


Call:
lm(formula = approve ~ white, data = loanapp)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.90839  0.09161  0.09161  0.09161  0.29221 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.70779    0.01824   38.81   <2e-16 ***
white        0.20060    0.01984   10.11   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3201 on 1987 degrees of freedom
Multiple R-squared:  0.04893,	Adjusted R-squared:  0.04845 
F-statistic: 102.2 on 1 and 1987 DF,  p-value: < 2.2e-16


### 

The added variables control for other differences within the population, which help explain the variation within the approval rates. However, even when controling for these differences, the white coefficient remains statistically significant and relatively large (around 13%). The next step in building this model would be to add interaction variables to see the advantages of certain groups of whites. 

In [9]:
summary(
    
    lm(approve ~ white + hrat + obrat + loanprc + unem + male + married + dep + sch + cosign + chist + pubrec + mortlat1 + mortlat2 + vr, 
       
       data = loanapp)

)


Call:
lm(formula = approve ~ white + hrat + obrat + loanprc + unem + 
    male + married + dep + sch + cosign + chist + pubrec + mortlat1 + 
    mortlat2 + vr, data = loanapp)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.06482  0.00781  0.06387  0.13673  0.71105 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.936731   0.052735  17.763  < 2e-16 ***
white        0.128820   0.019732   6.529 8.44e-11 ***
hrat         0.001833   0.001263   1.451   0.1469    
obrat       -0.005432   0.001102  -4.930 8.92e-07 ***
loanprc     -0.147300   0.037516  -3.926 8.92e-05 ***
unem        -0.007299   0.003198  -2.282   0.0226 *  
male        -0.004144   0.018864  -0.220   0.8261    
married      0.045824   0.016308   2.810   0.0050 ** 
dep         -0.006827   0.006701  -1.019   0.3084    
sch          0.001753   0.016650   0.105   0.9162    
cosign       0.009772   0.041139   0.238   0.8123    
chist        0.133027   0.019263   6.906 6.72e-12 ***
pu

### 

In adding the `white*obrat` interaction, we can see that loan applications by whites are less affected by having other income obligations. Although the advantage is slight, it is still statistically significant, as seen by the very small p-value of the interaction variable. 

In [10]:
summary( # adding the interaction term between race and percentage of income 
    
    lm(approve ~ white*obrat + hrat + loanprc + unem + male + married + dep + sch + cosign + chist + pubrec + mortlat1 + mortlat2 + vr, 
       
       data = loanapp)

)


Call:
lm(formula = approve ~ white * obrat + hrat + loanprc + unem + 
    male + married + dep + sch + cosign + chist + pubrec + mortlat1 + 
    mortlat2 + vr, data = loanapp)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.05523  0.01253  0.06320  0.12692  0.83284 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.180648   0.086808  13.601  < 2e-16 ***
white       -0.145975   0.080263  -1.819 0.069109 .  
obrat       -0.012226   0.002216  -5.518 3.88e-08 ***
hrat         0.001790   0.001260   1.421 0.155521    
loanprc     -0.152536   0.037436  -4.075 4.79e-05 ***
unem        -0.007528   0.003189  -2.360 0.018352 *  
male        -0.006015   0.018817  -0.320 0.749241    
married      0.045536   0.016260   2.800 0.005154 ** 
dep         -0.007630   0.006686  -1.141 0.253905    
sch          0.001777   0.016601   0.107 0.914787    
cosign       0.017709   0.041081   0.431 0.666458    
chist        0.129855   0.019227   6.754 1.90e-11 ***
pu

# Specification problems and proxy variables

## RESET

### 

The RESET test is to determine if a model has functional form misspecification. In this case, the test essentially tells us if the squared or cubed values of the variables are statistically significant and should be included in the model. The F Statistic here is 1.33 with a p-value of about 0.27, meaning that functional misspecification is not an issue in this model. 

In [11]:
# reset test 
resettest(ceosal_lm, power = 2:3, type = "fitted")


	RESET test

data:  ceosal_lm
RESET = 1.3335, df1 = 2, df2 = 203, p-value = 0.2658


### 

In [47]:
# unsure why these are the same results as the reset test with the normal standard errors
# the rlm gives the robust standard errors

ceosal_rlm <- rlm(log(salary) ~ log(sales) + roe + rosneg, data = ceosal1)

# these should be the robust standard errors?
summary(ceosal_rlm)


Call: rlm(formula = log(salary) ~ log(sales) + roe + rosneg, data = ceosal1)
Residuals:
     Min       1Q   Median       3Q      Max 
-0.92918 -0.23297 -0.01297  0.24630  2.80142 

Coefficients:
            Value   Std. Error t value
(Intercept)  4.1440  0.2689    15.4132
log(sales)   0.2812  0.0279    10.0698
roe          0.0167  0.0033     5.0673
rosneg       0.1758  0.0908     1.9363

Residual standard error: 0.3652 on 205 degrees of freedom

In [48]:
# reset test gives the same results though...
resettest(ceosal_rlm, power = 2:3, type = "fitted")


	RESET test

data:  ceosal_rlm
RESET = 1.3335, df1 = 2, df2 = 203, p-value = 0.2658


## WAGE2.RAW

In [13]:
wage2 <- read.dta("/Users/ama/OneDrive - usp.br/S3/Natalia/Lista 1/WAGE2.DTA")

head(wage2)

“cannot read factor labels from Stata 5 files”


Unnamed: 0_level_0,wage,hours,IQ,KWW,educ,exper,tenure,age,married,black,south,urban,sibs,brthord,meduc,feduc,lwage
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>
1,769,40,93,35,12,11,2,31,1,0,0,1,1,2.0,8,8.0,6.645091
2,808,50,119,41,18,11,16,37,1,0,0,1,1,,14,14.0,6.694562
3,825,40,108,46,14,11,9,33,1,0,0,1,1,2.0,14,14.0,6.715384
4,650,40,96,32,12,13,7,32,1,0,0,1,4,3.0,12,12.0,6.476973
5,562,40,74,27,11,14,5,34,1,0,0,1,10,6.0,6,11.0,6.331502
6,1400,40,116,43,16,14,2,35,1,1,0,1,1,2.0,8,,7.244227


### 

In [14]:
summary(
    
    lm(log(wage) ~ KWW + educ + exper + tenure + married + black + south + urban, data = wage2)

)


Call:
lm(formula = log(wage) ~ KWW + educ + exper + tenure + married + 
    black + south + urban, data = wage2)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.04494 -0.21931 -0.00048  0.24163  1.26464 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  5.358797   0.113600  47.172  < 2e-16 ***
KWW          0.005028   0.001819   2.764 0.005820 ** 
educ         0.057628   0.006838   8.428  < 2e-16 ***
exper        0.012228   0.003241   3.773 0.000172 ***
tenure       0.011072   0.002456   4.507 7.40e-06 ***
married      0.189461   0.039077   4.848 1.46e-06 ***
black       -0.164267   0.038530  -4.263 2.22e-05 ***
south       -0.091601   0.026156  -3.502 0.000484 ***
urban        0.175545   0.027032   6.494 1.36e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3642 on 926 degrees of freedom
Multiple R-squared:  0.2587,	Adjusted R-squared:  0.2523 
F-statistic: 40.39 on 8 and 926 DF,  p-valu

### 

In [15]:
wage_lm <- lm(log(wage) ~ KWW + IQ + educ + exper + tenure + married + black + south + urban, data = wage2)

summary(wage_lm)


Call:
lm(formula = log(wage) ~ KWW + IQ + educ + exper + tenure + married + 
    black + south + urban, data = wage2)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.05704 -0.21621  0.00824  0.23725  1.24895 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  5.175644   0.127776  40.506  < 2e-16 ***
KWW          0.003826   0.001852   2.066  0.03913 *  
IQ           0.003118   0.001013   3.079  0.00214 ** 
educ         0.049837   0.007262   6.863 1.24e-11 ***
exper        0.012752   0.003231   3.947 8.51e-05 ***
tenure       0.010925   0.002446   4.467 8.92e-06 ***
married      0.192145   0.038909   4.938 9.35e-07 ***
black       -0.130399   0.039901  -3.268  0.00112 ** 
south       -0.082029   0.026222  -3.128  0.00181 ** 
urban        0.175823   0.026910   6.534 1.06e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3625 on 925 degrees of freedom
Multiple R-squared:  0.2662,	Adjusted R-sq

### 

In [16]:
# testing joint significance

linearHypothesis(wage_lm, c("KWW", "IQ"))

Unnamed: 0_level_0,Res.Df,RSS,Df,Sum of Sq,F,Pr(>F)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,927,123.8185,,,,
2,925,121.5595,2.0,2.259033,8.594991,0.0002002182


# OLS and Probit

## PNTSPRD.RAW

In [17]:
pntsprd <- read.dta("/Users/ama/OneDrive - usp.br/S3/Natalia/Lista 2/PNTSPRD.DTA")

head(pntsprd)

“cannot read factor labels from Stata 5 files”


Unnamed: 0_level_0,favscr,undscr,spread,favhome,neutral,fav25,und25,fregion,uregion,scrdiff,sprdcvr,favwin
Unnamed: 0_level_1,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,72,61,7.0,0,0,1,0,3,4,11,1,1
2,82,74,7.0,1,0,0,0,3,1,8,1,1
3,87,57,17.0,1,0,0,0,3,3,30,1,1
4,69,70,9.0,1,0,0,0,3,3,-1,0,0
5,77,79,2.5,0,0,0,0,2,3,-2,0,0
6,91,65,9.0,0,1,1,0,3,4,26,1,1


### 
We would expect $\beta_0 = 0.5$ because if all the relevant information is equal to zero, then the team would have a 50% chance of winning. 

### 

In [18]:
pntsprd_lm <- lm(favwin ~ spread, data = pntsprd)

# the model
summary(pntsprd_lm)


Call:
lm(formula = favwin ~ spread, data = pntsprd)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.9836 -0.1192  0.1519  0.3069  0.4037 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.576949   0.028235  20.434  < 2e-16 ***
spread      0.019366   0.002339   8.281 9.32e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4017 on 551 degrees of freedom
Multiple R-squared:  0.1107,	Adjusted R-squared:  0.1091 
F-statistic: 68.57 on 1 and 551 DF,  p-value: 9.324e-16


In [19]:
# robust t test
coeftest(pntsprd_lm, vcov = vcovHC(pntsprd_lm, type = "HC0"))

# robust standard errors


t test of coefficients:

             Estimate Std. Error t value  Pr(>|t|)    
(Intercept) 0.5769492  0.0315995  18.258 < 2.2e-16 ***
spread      0.0193655  0.0019184  10.095 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


Testing $H_0 : \beta_0 = 0.5$ with standard errors and  heteroskedasticity-robust standard errors as calculated above. 

With the regular standard error, the t statistic for the null hypothesis is: 

In [20]:
(0.577 - .5) / 0.028

This leads us to reject the null hypothesis with a two-sided alternative at a 1% confidence level, as the absolute value of the t statistic is larger than the critical value 2.58. 

With the robust standard errors, the t statistic for the null hypothesis is: 

In [21]:
(0.577 - 0.5) / 0.032

Similarly, we can reject the null hypothesis with a two-sided alternative at the 2% confidence level, as the absolute value of the t statistic is above the criticual value 2.33. 

### 
Yes, _spread_ is statistically significant with normal or robust standard error values and a p-value less than 0.001 in both cases. 

In [22]:
# if spread = 10

intercept_pntsprd <- coef(pntsprd_lm)[1] 

coef_pntsprd <- coef(pntsprd_lm)[2]


intercept_pntsprd + coef_pntsprd * 10

### 

In [23]:
pntsprd_probit <- glm(favwin ~ spread, 
                      family = binomial(link = "probit"),
                      data = pntsprd)

coeftest(pntsprd_probit, vcov. = vcovHC, type = "HC1")


z test of coefficients:

             Estimate Std. Error z value  Pr(>|z|)    
(Intercept) -0.010592   0.101788 -0.1041    0.9171    
spread       0.092463   0.011612  7.9626 1.685e-15 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


In [24]:
# test the null hypothesis that the intercept is 0

### 

The predicted value when _spread_ = 10 is around 0.820, which is a bit higher than the LPM estimate. 

In [25]:
intercept_pntsprd <- coef(pntsprd_probit)[1]

coef1_pntsprd <- coef(pntsprd_probit)[2] 


# where pnorm is the cumulative standard normal distribution function 
pnorm( intercept_pntsprd + coef1_pntsprd * 10 )

###

When added variables are tested for joint significance, the log likelihood is about -262.6418, the likelihood ration statistic is 1.84, and the p-value is around 0.61, making the joint significance of these variables statistically insignificant. This means that once we add the variable `spread` to our model, the new variables do not help with making predictions. 

In [26]:
pntsprd_probit2 <- glm(favwin ~ spread + favhome + fav25 + und25, 
                      family = binomial(link = "probit"),
                      data = pntsprd)

coeftest(pntsprd_probit2, vcov. = vcovHC, type = "HC1")


z test of coefficients:

              Estimate Std. Error z value  Pr(>|z|)    
(Intercept) -0.0551800  0.1330021 -0.4149    0.6782    
spread       0.0878844  0.0116031  7.5742 3.613e-14 ***
favhome      0.1485755  0.1362605  1.0904    0.2755    
fav25        0.0030685  0.1613817  0.0190    0.9848    
und25       -0.2198086  0.2557421 -0.8595    0.3901    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


In [27]:
# test the joint significance of these added variables

lrtest(pntsprd_probit, pntsprd_probit2)

Unnamed: 0_level_0,#Df,LogLik,Df,Chisq,Pr(>Chisq)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2,-263.5622,,,
2,5,-262.6418,3.0,1.840843,0.6060875


# Probit and Logit

## LOANAPP.RAW

In [28]:
loanapp <- read.dta("/Users/ama/OneDrive - usp.br/S3/Natalia/Lista 2/loanapp.dta")

head(loanapp)

Unnamed: 0_level_0,occ,loanamt,action,msa,suffolk,appinc,typur,unit,married,dep,⋯,approve,mortno,mortperf,mortlat1,mortlat2,chist,multi,loanprc,thick,white
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>
1,1,89,1,1120,0,72,0,1,0,0,⋯,1,0,1,0,0,1,0,0.7542373,0,1
2,1,128,3,1120,0,74,0,1,1,1,⋯,0,0,1,0,0,1,0,0.8,1,1
3,1,128,1,1120,0,84,3,1,0,0,⋯,1,0,1,0,0,1,0,0.8951049,1,1
4,1,66,1,1120,0,36,0,1,1,0,⋯,1,0,1,0,0,0,0,0.6,0,1
5,1,120,1,1120,0,59,8,1,1,0,⋯,1,0,1,0,0,1,0,0.8955224,0,1
6,1,111,1,1120,0,63,9,1,0,0,⋯,1,0,1,0,0,0,0,0.8043478,0,1


### 

The estimated probability of loan approval for whites is the proporiton of loans approved for whites, which is 0.908; for nonwhites it is 0.708. 

In [29]:
 # number of loan approvals for whites 
1527 / 1681 # number of applications by whites

In [30]:
# number of loan approvals for nonwhites 
218 / 308 # number of applications by nonwhites

In [31]:
# estimate a probit model of approve on white 
loanapp_probit <- glm(approve ~ white, 
                     family = binomial(link = "probit"),
                     data = loanapp)

coeftest(loanapp_probit)


z test of coefficients:

            Estimate Std. Error z value  Pr(>|z|)    
(Intercept) 0.546946   0.075435  7.2506  4.15e-13 ***
white       0.783946   0.086711  9.0409 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


### 

With the new variables, loan approval for whites is 0.520, with a highly significant p-value, meaning there is evidence of discrimination against nonwhites. 

In [32]:
# adding new variables to the model
loanapp_probit2 <- glm(approve ~ white + hrat + obrat + loanprc + unem + male + married + dep + sch + cosign + chist + pubrec + mortlat1 + mortlat2 + vr, 
                     family = binomial(link = "probit"),
                     data = loanapp)

coeftest(loanapp_probit2)


z test of coefficients:

              Estimate Std. Error z value  Pr(>|z|)    
(Intercept)  2.0623295  0.3163084  6.5200 7.031e-11 ***
white        0.5202538  0.0968656  5.3709 7.835e-08 ***
hrat         0.0078763  0.0070259  1.1210  0.262271    
obrat       -0.0276926  0.0061474 -4.5048 6.644e-06 ***
loanprc     -1.0119556  0.2402648 -4.2118 2.533e-05 ***
unem        -0.0366854  0.0176800 -2.0750  0.037990 *  
male        -0.0370005  0.1098826 -0.3367  0.736322    
married      0.2657452  0.0947243  2.8055  0.005024 ** 
dep         -0.0495750  0.0390649 -1.2690  0.204427    
sch          0.0146483  0.0954148  0.1535  0.877987    
cosign       0.0860643  0.2408862  0.3573  0.720881    
chist        0.5852778  0.0956020  6.1220 9.240e-10 ***
pubrec      -0.7787419  0.1269841 -6.1326 8.646e-10 ***
mortlat1    -0.1876279  0.2571642 -0.7296  0.465633    
mortlat2    -0.4943544  0.3258825 -1.5170  0.129274    
vr          -0.2010617  0.0814776 -2.4677  0.013599 *  
---
Signif. codes:  0 

### 

In the logit model, the coefficient for loan approval for whites is now 0.938 and remains highly significant. 

In [33]:
# use logit instead of probit 
loanapp_logit <- glm(approve ~ white + hrat + obrat + loanprc + unem + male + married + dep + sch + cosign + chist + pubrec + mortlat1 + mortlat2 + vr, 
                     family = binomial(link = "logit"),
                     data = loanapp)

coeftest(loanapp_logit)


z test of coefficients:

             Estimate Std. Error z value  Pr(>|z|)    
(Intercept)  3.801710   0.594675  6.3929 1.627e-10 ***
white        0.937764   0.172901  5.4237 5.838e-08 ***
hrat         0.013263   0.012880  1.0298  0.303125    
obrat       -0.053034   0.011280 -4.7016 2.581e-06 ***
loanprc     -1.904951   0.460407 -4.1375 3.511e-05 ***
unem        -0.066579   0.032808 -2.0294  0.042421 *  
male        -0.066385   0.206423 -0.3216  0.747758    
married      0.503282   0.177993  2.8275  0.004691 ** 
dep         -0.090734   0.073332 -1.2373  0.215977    
sch          0.041229   0.178399  0.2311  0.817234    
cosign       0.132059   0.446080  0.2960  0.767197    
chist        1.066577   0.171208  6.2297 4.673e-10 ***
pubrec      -1.340665   0.217362 -6.1679 6.921e-10 ***
mortlat1    -0.309882   0.463510 -0.6686  0.503780    
mortlat2    -0.894675   0.568570 -1.5736  0.115591    
vr          -0.349828   0.153721 -2.2757  0.022862 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 

### 

To make probit and logit coefficients comprable, we can multiply the logit coefficient by 0.625, making the scaled coefficient of this model 0.586. This is similar to the 0.520 of the probit model. 

In [34]:
# white approval coefficient of the logit model scaled for comaprison with the probit model
0.937764 * 0.625