In [3]:
import os
import pandas as pd
import stata_setup
from numpy import percentile
from tensorboard.notebook import display

## Configure the pystata directory
# The Stata location is read from the STATA_SYSDIR environment variable.
# Fail early with an explicit message when it is missing, instead of handing
# None to stata_setup.config() and getting an obscure error from inside it.
stata_sysdir = os.getenv("STATA_SYSDIR")
if not stata_sysdir:
    raise OSError(
        "Environment variable STATA_SYSDIR is not set; "
        "set it to the Stata installation directory before running this notebook."
    )
# 'mp' selects the Stata edition passed to stata_setup.config().
stata_setup.config(stata_sysdir, 'mp')

In [18]:
from pystata import stata

## Load the data
# Open simulation.dta in Stata; `clear` drops whatever dataset is in memory.
stata.run(
    'use "simulation.dta", clear'
)

# Copy the Stata dataset into a pandas DataFrame and summarise its columns.
data = stata.pdataframe_from_data()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       10000 non-null  float64
 1   gender   10000 non-null  float64
 2   age      10000 non-null  float64
 3   age2     10000 non-null  float64
 4   hukou    10000 non-null  float64
 5   feduy    10000 non-null  float64
 6   meduy    10000 non-null  float64
 7   sibling  10000 non-null  float64
 8   luck1    10000 non-null  float64
 9   luck2    10000 non-null  float64
 10  luck3    10000 non-null  float64
 11  college  10000 non-null  float64
 12  lninc    10000 non-null  float64
dtypes: float64(13)
memory usage: 1015.8 KB


In [19]:
## Linear regression benchmarks: lninc on college alone, then with controls
# Step 1: bivariate regression without any control variables.
stata.run("reg lninc college")

# Separator printed between the two regression tables (runtime text unchanged).
print('\n========================= 增加 控制变量 ==================================\n')

# Step 2: add the control variables and request robust standard errors.
stata.run(
    "reg lninc college "
    "gender age age2 hukou feduy meduy sibling, robust"
)


      Source |       SS           df       MS      Number of obs   =    10,000
-------------+----------------------------------   F(1, 9998)      =   1249.62
       Model |  1768.49262         1  1768.49262   Prob > F        =    0.0000
    Residual |  14149.4062     9,998  1.41522366   R-squared       =    0.1111
-------------+----------------------------------   Adj R-squared   =    0.1110
       Total |  15917.8988     9,999  1.59194908   Root MSE        =    1.1896

------------------------------------------------------------------------------
       lninc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   1.223779   .0346189    35.35   0.000     1.155919    1.291639
       _cons |   8.338612   .0128043   651.23   0.000     8.313513    8.363711
------------------------------------------------------------------------------



Linear regression                              

In [20]:
## Suppose luck in the college entrance exam were observable: it could then be
## used as an instrumental variable to identify the effect of college on income.
# Correlation matrix of the three luck variables and college.
stata.run("corr luck1 luck2 luck3 college")

(obs=10,000)

             |    luck1    luck2    luck3  college
-------------+------------------------------------
       luck1 |   1.0000
       luck2 |  -0.0036   1.0000
       luck3 |   0.0031  -0.0087   1.0000
     college |   0.1841   0.1667   0.0232   1.0000



In [21]:
## Two-stage least squares (2SLS) with luck1 as the single instrument for college
# `first` also prints the first-stage regression; vce(robust) gives robust SEs.
stata.run(
    "ivregress 2sls lninc "
    "gender age age2 hukou feduy meduy sibling "
    "(college=luck1), vce(robust) first"
)


First-stage regressions
-----------------------

                                                        Number of obs = 10,000
                                                        F(8, 9991)    = 169.85
                                                        Prob > F      = 0.0000
                                                        R-squared     = 0.1564
                                                        Adj R-squared = 0.1557
                                                        Root MSE      = 0.3158

------------------------------------------------------------------------------
             |               Robust
     college | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
      gender |  -.0498054   .0064536    -7.72   0.000    -.0624558    -.037155
         age |   .0384085   .0030797    12.47   0.000     .0323716    .0444454
        age2 |  -.0005812   .0000491   -11.8

In [22]:
# 2SLS with luck2 as the single instrument for college; `first` prints the
# first-stage regression as well.
stata.run(
    "ivregress 2sls lninc "
    "gender age age2 hukou feduy meduy sibling "
    "(college=luck2), vce(robust) first"
)


First-stage regressions
-----------------------

                                                        Number of obs = 10,000
                                                        F(8, 9991)    = 167.65
                                                        Prob > F      = 0.0000
                                                        R-squared     = 0.1531
                                                        Adj R-squared = 0.1524
                                                        Root MSE      = 0.3164

------------------------------------------------------------------------------
             |               Robust
     college | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
      gender |  -.0517735    .006465    -8.01   0.000    -.0644462   -.0391008
         age |   .0392778   .0030852    12.73   0.000     .0332301    .0453255
        age2 |  -.0005956   .0000492   -12.1

In [23]:
# 2SLS with luck3 as the single instrument for college; `first` prints the
# first-stage regression as well.
stata.run(
    "ivregress 2sls lninc "
    "gender age age2 hukou feduy meduy sibling "
    "(college=luck3), vce(robust) first"
)


First-stage regressions
-----------------------

                                                        Number of obs = 10,000
                                                        F(8, 9991)    = 128.67
                                                        Prob > F      = 0.0000
                                                        R-squared     = 0.1254
                                                        Adj R-squared = 0.1247
                                                        Root MSE      = 0.3215

------------------------------------------------------------------------------
             |               Robust
     college | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
      gender |  -.0508021   .0065697    -7.73   0.000    -.0636801   -.0379241
         age |   .0386755   .0031176    12.41   0.000     .0325643    .0447867
        age2 |   -.000585   .0000497   -11.7

In [24]:
## Use all three instruments (luck1, luck2, luck3) at the same time
# 2SLS estimator with robust standard errors.
stata.run(
    "ivregress 2sls lninc "
    "gender age age2 hukou feduy meduy sibling "
    "(college=luck1 luck2 luck3), vce(robust)"
)


Instrumental variables 2SLS regression            Number of obs   =     10,000
                                                  Wald chi2(8)    =    4001.37
                                                  Prob > chi2     =     0.0000
                                                  R-squared       =     0.3135
                                                  Root MSE        =     1.0454

------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   .6703614    .120531     5.56   0.000     .4341249    .9065979
      gender |   .7589964   .0220328    34.45   0.000     .7158128    .8021799
         age |   .2821712   .0125903    22.41   0.000     .2574946    .3068478
        age2 |  -.0043363   .0001995   -21.73   0.000    -.0047273   -.0039452
       hukou |

In [25]:
## Generalized method of moments (GMM) estimator
# Same specification and instrument set as the 2SLS run with all three
# instruments; only the estimator keyword changes.
stata.run(
    "ivregress gmm lninc "
    "gender age age2 hukou feduy meduy sibling "
    "(college=luck1 luck2 luck3), vce(robust)"
)


Instrumental variables GMM regression             Number of obs   =     10,000
                                                  Wald chi2(8)    =    4001.06
                                                  Prob > chi2     =     0.0000
                                                  R-squared       =     0.3135
GMM weight matrix: Robust                         Root MSE        =     1.0454

------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   .6697812   .1205048     5.56   0.000     .4335963    .9059662
      gender |   .7590011   .0220292    34.45   0.000     .7158246    .8021777
         age |   .2821799    .012581    22.43   0.000     .2575215    .3068383
        age2 |  -.0043368   .0001994   -21.75   0.000    -.0047276    -.003946
       hukou |

In [26]:
## Limited-information maximum likelihood (LIML) estimator
# Same specification and instrument set as the 2SLS and GMM runs; only the
# estimator keyword changes.
stata.run(
    "ivregress liml lninc "
    "gender age age2 hukou feduy meduy sibling "
    "(college=luck1 luck2 luck3), vce(robust)"
)


Instrumental variables LIML regression            Number of obs   =     10,000
                                                  Wald chi2(8)    =    4001.31
                                                  Prob > chi2     =     0.0000
                                                  R-squared       =     0.3135
                                                  Root MSE        =     1.0454

------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   .6702121   .1206029     5.56   0.000     .4338348    .9065893
      gender |   .7589887   .0220339    34.45   0.000     .7158031    .8021744
         age |    .282177   .0125913    22.41   0.000     .2574985    .3068555
        age2 |  -.0043363   .0001995   -21.73   0.000    -.0047274   -.0039452
       hukou |

In [32]:
## Weak-instrument test: check the first-stage Robust F (or its Prob > F) and the minimum eigenvalue statistic
# Instrument under test: luck1. The 2SLS fit runs quietly (`qui`), so only the
# `estat firststage` diagnostics are displayed.
stata.run(
    """
    qui ivregress 2sls lninc gender age age2 hukou feduy meduy sibling (college=luck1), vce(robust)
    estat firststage, all forcenonrobust
    """
)


. 
.     qui ivregress 2sls lninc gender age age2 hukou feduy meduy sibling (colle
> ge=luck1), vce(robust)

.     estat firststage, all forcenonrobust

  First-stage regression summary statistics
  --------------------------------------------------------------------------
               |            Adjusted      Partial       Robust
      Variable |   R-sq.       R-sq.        R-sq.     F(1,9991)   Prob > F
  -------------+------------------------------------------------------------
       college |  0.1564      0.1557       0.0359       355.785    0.0000
  --------------------------------------------------------------------------


  Shea's partial R-squared
  --------------------------------------------------
               |     Shea's             Shea's
      Variable |  partial R-sq.   adj. partial R-sq.
  -------------+------------------------------------
       college |     0.0359             0.0352
  --------------------------------------------------


  Minimum eigenvalue s

In [33]:
# Weak-instrument diagnostics for luck2: quiet 2SLS fit with luck2 as the only
# instrument for college, followed by the first-stage summary statistics.
stata.run(
    """
    qui ivregress 2sls lninc gender age age2 hukou feduy meduy sibling (college=luck2), vce(robust)
    estat firststage, all forcenonrobust
    """
)


. 
.     qui ivregress 2sls lninc gender age age2 hukou feduy meduy sibling (colle
> ge=luck2), vce(robust)

.     estat firststage, all forcenonrobust

  First-stage regression summary statistics
  --------------------------------------------------------------------------
               |            Adjusted      Partial       Robust
      Variable |   R-sq.       R-sq.        R-sq.     F(1,9991)   Prob > F
  -------------+------------------------------------------------------------
       college |  0.1531      0.1524       0.0321        316.46    0.0000
  --------------------------------------------------------------------------


  Shea's partial R-squared
  --------------------------------------------------
               |     Shea's             Shea's
      Variable |  partial R-sq.   adj. partial R-sq.
  -------------+------------------------------------
       college |     0.0321             0.0314
  --------------------------------------------------


  Minimum eigenvalue s

In [34]:
# Weak-instrument diagnostics for luck3: quiet 2SLS fit with luck3 as the only
# instrument for college, followed by the first-stage summary statistics.
# NOTE(review): the recorded output shows a robust first-stage F of about 4.11
# and a partial R-sq. of 0.0004, which suggests luck3 is a weak instrument.
stata.run(
    """
    qui ivregress 2sls lninc gender age age2 hukou feduy meduy sibling (college=luck3), vce(robust)
    estat firststage, all forcenonrobust
    """
)


. 
.     qui ivregress 2sls lninc gender age age2 hukou feduy meduy sibling (colle
> ge=luck3), vce(robust)

.     estat firststage, all forcenonrobust

  First-stage regression summary statistics
  --------------------------------------------------------------------------
               |            Adjusted      Partial       Robust
      Variable |   R-sq.       R-sq.        R-sq.     F(1,9991)   Prob > F
  -------------+------------------------------------------------------------
       college |  0.1254      0.1247       0.0004       4.11179    0.0426
  --------------------------------------------------------------------------


  Shea's partial R-squared
  --------------------------------------------------
               |     Shea's             Shea's
      Variable |  partial R-sq.   adj. partial R-sq.
  -------------+------------------------------------
       college |     0.0004             -0.0003
  --------------------------------------------------


  Minimum eigenvalue 

In [35]:
## Overidentification test
# Quietly fit 2SLS with three instruments (luck1, luck2, luck3) for the single
# endogenous regressor college, then test the overidentifying restrictions.
stata.run(
    """
    qui ivregress 2sls lninc gender age age2 hukou feduy meduy sibling (college=luck1 luck2 luck3), vce(robust)
    estat overid
    """
)


. 
.     qui ivregress 2sls lninc gender age age2 hukou feduy meduy sibling (colle
> ge=luck1 luck2 luck3), vce(robust)

.     estat overid

  Test of overidentifying restrictions:

  Score chi2(2)          =  .439215  (p = 0.8028)

.     
. 


In [36]:
# Overidentification test for an alternative specification: age and age2 are
# moved out of the control variables and into the instrument list, next to
# luck1, luck2 and luck3.
# NOTE(review): `first` looks redundant here because the fit runs under `qui`
# (the recorded output contains no first-stage table) — confirm before removing.
stata.run(
    """
    qui ivregress 2sls lninc gender hukou feduy meduy sibling (college=luck1 luck2 luck3 age age2), robust first
    estat overid
    """
)


. 
.     qui ivregress 2sls lninc gender hukou feduy meduy sibling (college=luck1 
> luck2 luck3 age age2), robust first

.     estat overid

  Test of overidentifying restrictions:

  Score chi2(4)          =  419.319  (p = 0.0000)

.     
. 


In [37]:
## Hausman test
# Quietly fit 2SLS with all three instruments, then run `estat endogenous`;
# Stata titles the result "Tests of endogeneity" with H0: variables are exogenous.
stata.run(
    """
    qui ivregress 2sls lninc gender age age2 hukou feduy meduy sibling (college=luck1 luck2 luck3), vce(robust)
    estat endogenous
    """
)


. 
.     qui ivregress 2sls lninc gender age age2 hukou feduy meduy sibling (colle
> ge=luck1 luck2 luck3), vce(robust)

.     estat endogenous

  Tests of endogeneity
  H0: Variables are exogenous

  Robust score chi2(1)            =   4.1346  (p = 0.0420)
  Robust regression F(1,9990)     =  4.13972  (p = 0.0419)

.     
. 


In [44]:
## Re-estimate with the ivreg2 command
# NOTE(review): ivreg2 is presumably a community-contributed Stata command that
# has to be installed beforehand; the install step is not shown in this notebook.
stata.run(
    "ivreg2 lninc "
    "gender age age2 hukou feduy meduy sibling "
    "(college=luck1 luck2 luck3), robust"
)


IV (2SLS) estimation
--------------------

Estimates efficient for homoskedasticity only
Statistics robust to heteroskedasticity

                                                      Number of obs =    10000
                                                      F(  8,  9991) =   499.72
                                                      Prob > F      =   0.0000
Total (centered) SS     =  15917.89881                Centered R2   =   0.3135
Total (uncentered) SS   =    739442.52                Uncentered R2 =   0.9852
Residual SS             =  10927.87721                Root MSE      =    1.045

------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   .6703614    .120531     5.56   0.000     .4341249    .9065979
      gender |   .7589964   .0220328    34

In [46]:
## Test whether using age and age2 as instruments gives results that differ
## from those obtained with the other three instruments
# age and age2 are listed as instruments here (not as controls), and
# orthog(age age2) requests the test on that subset — see the ivreg2 help file.
stata.run(
    "ivreg2 lninc "
    "gender hukou feduy meduy sibling "
    "(college=luck1 luck2 luck3 age age2), "
    "robust orthog(age age2)"
)


IV (2SLS) estimation
--------------------

Estimates efficient for homoskedasticity only
Statistics robust to heteroskedasticity

                                                      Number of obs =    10000
                                                      F(  6,  9993) =   513.72
                                                      Prob > F      =   0.0000
Total (centered) SS     =  15917.89881                Centered R2   =   0.2389
Total (uncentered) SS   =    739442.52                Uncentered R2 =   0.9836
Residual SS             =  12115.73885                Root MSE      =    1.101

------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   1.770388   .1193018    14.84   0.000     1.536561    2.004215
      gender |   .8226807   .0229812    35

In [47]:
## Test whether luck3 is a redundant instrument
# redundant(luck3) asks ivreg2 for the redundancy test on that instrument.
stata.run(
    "ivreg2 lninc "
    "gender age age2 hukou feduy meduy sibling "
    "(college=luck1 luck2 luck3), "
    "robust redundant(luck3)"
)


IV (2SLS) estimation
--------------------

Estimates efficient for homoskedasticity only
Statistics robust to heteroskedasticity

                                                      Number of obs =    10000
                                                      F(  8,  9991) =   499.72
                                                      Prob > F      =   0.0000
Total (centered) SS     =  15917.89881                Centered R2   =   0.3135
Total (uncentered) SS   =    739442.52                Uncentered R2 =   0.9852
Residual SS             =  10927.87721                Root MSE      =    1.045

------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   .6703614    .120531     5.56   0.000     .4341249    .9065979
      gender |   .7589964   .0220328    34

In [48]:
## Hausman test
# endog(college) asks ivreg2 to test whether college is endogenous.
stata.run(
    "ivreg2 lninc "
    "gender age age2 hukou feduy meduy sibling "
    "(college=luck1 luck2 luck3), "
    "robust endog(college)"
)


IV (2SLS) estimation
--------------------

Estimates efficient for homoskedasticity only
Statistics robust to heteroskedasticity

                                                      Number of obs =    10000
                                                      F(  8,  9991) =   499.72
                                                      Prob > F      =   0.0000
Total (centered) SS     =  15917.89881                Centered R2   =   0.3135
Total (uncentered) SS   =    739442.52                Uncentered R2 =   0.9852
Residual SS             =  10927.87721                Root MSE      =    1.045

------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   .6703614    .120531     5.56   0.000     .4341249    .9065979
      gender |   .7589964   .0220328    34