In [2]:
import os
# NOTE(review): `describe` is not referenced by any Python code visible in this
# notebook (Stata's `describe` command is unrelated). This looks like an
# accidental IDE auto-import — confirm and remove.
from pydoc import describe

import pandas as pd
import numpy as np
import stata_setup

## Configure the pystata installation directory.
# The Stata install location is read from the STATA_SYSDIR environment variable.
# os.getenv returns None when the variable is unset, so stop here with an
# actionable message instead of handing None to stata_setup.config.
stata_sysdir = os.getenv("STATA_SYSDIR")
if not stata_sysdir:
    raise OSError(
        "STATA_SYSDIR is not set; point it at the Stata installation directory "
        "before running this notebook."
    )

# Initialise pystata against that installation, requesting the 'mp' edition.
stata_setup.config(stata_sysdir, 'mp')

In [9]:
%%stata

// Load cfps2010.dta (path is relative to Stata's working directory),
// replacing whatever data set is currently in memory.
use "cfps2010.dta", clear
// List the variables with their storage types, formats and labels
describe


. 
. use "cfps2010.dta", clear

. // 查看变量
. describe

Contains data from cfps2010.dta
 Observations:         4,137                  
    Variables:            12                  5 Aug 2022 19:25
-------------------------------------------------------------------------------
Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
pid             double  %12.0g     pid        个人id
provcd          double  %24.0g     provcd     省国标码
gender          double  %12.0g     gender     性别
age             float   %9.0g                 年龄
age2            float   %9.0g                 年龄平方
age3            float   %9.0g                 年龄三次方
lninc           float   %9.0g                 收入对数
college         double  %9.0g      yesorno    是否上大学
hukou           double  %12.0g     hukou      3岁时户口性质
sibling         float   %9.0g      yesorno    是否独生子女
race            double  %9.0

In [None]:
# NOTE(review): per the pystata documentation the package is importable only after
# stata_setup.config(...) has run, which is presumably why this import is not in
# the first cell — confirm before moving it.
from pystata import stata

## Simple (bivariate) linear regression of log income on the college dummy,
## run through the pystata Python API rather than the %%stata magic
stata.run("reg lninc college")

In [12]:
%%stata

// Heteroskedasticity-robust standard errors
reg lninc college, vce(robust)
// Cluster-robust standard errors, clustered on the province code (provcd)
reg lninc college, vce(cluster provcd)


. 
. // 异方差稳健标准误
. reg lninc college, vce(robust)

Linear regression                               Number of obs     =      4,137
                                                F(1, 4135)        =     582.05
                                                Prob > F          =     0.0000
                                                R-squared         =     0.1095
                                                Root MSE          =     1.1498

------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |    .823612   .0341385    24.13   0.000     .7566823    .8905418
       _cons |   9.353189   .0256882   364.10   0.000     9.302826    9.403552
------------------------------------------------------------------------------

. //聚类稳健标准误
. reg lninc college, vce(clus

一元线性回归无法满足严格的可忽略性假定（该假定要求“是否上大学”在研究对象中相当于随机分配），因此不能据此进行因果推断。
上述结果仅反映上大学与未上大学的人在收入对数上的平均差异（即相关关系），不能解释为上大学对收入的因果影响。

In [14]:
%%stata
// Multiple linear regression: add hukou, which is treated here as a confounder
reg lninc college hukou, vce(cluster provcd)


. // 多元线性回归（户口是一个混杂变量）
. reg lninc college hukou, vce(cluster provcd)

Linear regression                               Number of obs     =      4,137
                                                F(2, 24)          =     219.69
                                                Prob > F          =     0.0000
                                                R-squared         =     0.1169
                                                Root MSE          =     1.1451

                                (Std. err. adjusted for 25 clusters in provcd)
------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |    .798281   .0451727    17.67   0.000     .7050491     .891513
       hukou |   .2155333   .0611855     3.52   0.002     .0892526    .3418141
       _cons |    9.27

In [15]:
%%stata

// Frisch-Waugh-Lovell demonstration: partial hukou out of college, then
// regress log income on what is left of college.

// Step 1: regress the treatment (college) on the control (hukou) and keep the residuals.
reg college hukou, vce(cluster provcd)
// Make the cell re-runnable: -predict- aborts if e already exists.
// exact makes the check match only a variable literally named e, so the drop
// can never hit another variable that e merely abbreviates.
capture confirm variable e, exact
if !_rc drop e
predict e, residuals

// Step 2: residual regression. The slope on e should equal the college
// coefficient from -reg lninc college hukou-.
reg lninc e, vce(cluster provcd)


. 
. // 获取残差
. reg college hukou, vce(cluster provcd)

Linear regression                               Number of obs     =      4,137
                                                F(1, 24)          =      27.84
                                                Prob > F          =     0.0000
                                                R-squared         =     0.0137
                                                Root MSE          =     .48606

                                (Std. err. adjusted for 25 clusters in provcd)
------------------------------------------------------------------------------
             |               Robust
     college | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
       hukou |    .116539   .0220861     5.28   0.000     .0709555    .1621224
       _cons |   .3496532   .0161816    21.61   0.000     .3162561    .3830504
--------------------------------------

In [17]:
# Omitted-variable bias of the short regression: college coefficient without
# hukou (.823612) minus the coefficient once hukou is controlled for (.798281).
%stata dis .823612-.798281

.025331


In [19]:
# Auxiliary regression of the omitted variable (hukou) on college; its slope is
# one of the two factors in the omitted-variable-bias formula.
%stata reg hukou college, vce(cluster provcd)


Linear regression                               Number of obs     =      4,137
                                                F(1, 24)          =      33.17
                                                Prob > F          =     0.0000
                                                R-squared         =     0.0137
                                                Root MSE          =     .48812

                                (Std. err. adjusted for 25 clusters in provcd)
------------------------------------------------------------------------------
             |               Robust
       hukou | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   .1175271   .0204066     5.76   0.000     .0754099    .1596443
       _cons |   .3608661   .0559186     6.45   0.000     .2454558    .4762764
------------------------------------------------------------------------------


In [21]:
# Omitted-variable-bias formula: slope of hukou on college (.1175271) times the
# hukou coefficient in the long regression (.2155333). The product equals the
# .025331 gap between the short- and long-regression college coefficients.
%stata dis .1175271*.2155333

.025331


In [22]:
%%stata

// Adding more controls calls for a saturated specification: hukou##i.age expands
// to the hukou and age main effects plus every hukou-by-age interaction.
reg lninc college hukou##i.age, vce(cluster provcd)
// Information criteria (AIC/BIC) for this specification
estat ic


. 
. // 纳入更多控制变量，需要进行饱和回归
. reg lninc college hukou##i.age, vce(cluster provcd)

Linear regression                               Number of obs     =      4,137
                                                F(23, 24)         =          .
                                                Prob > F          =          .
                                                R-squared         =     0.1501
                                                Root MSE          =     1.1316

                                (Std. err. adjusted for 25 clusters in provcd)
------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   .7477749   .0544757    13.73   0.000     .6353425    .8602073
             |
       hukou |
   城镇户口  |  -.0787305   .1418798    -0.55   0.584    -.371555

In [23]:
%%stata
// The saturated model has too many parameters to estimate; restrict the age
// profile to a polynomial (age, age2 = age squared, age3 = age cubed) that is
// still allowed to differ by hukou.
reg lninc college hukou##(c.age c.age2 c.age3), vce(cluster provcd)
// Information criteria (AIC/BIC) for this specification
estat ic


. // 解决饱和回归待估参数过多的问题，可以使用幂函数对参数进行限定
. reg lninc college hukou##(c.age c.age2 c.age3), vce(cluster provcd)

Linear regression                               Number of obs     =      4,137
                                                F(8, 24)          =      88.50
                                                Prob > F          =     0.0000
                                                R-squared         =     0.1331
                                                Root MSE          =     1.1354

                                (Std. err. adjusted for 25 clusters in provcd)
------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   .7547539   .0501103    15.06   0.000     .6513314    .8581763
             |
       hukou |
   城镇户口  |  -.5628695   3.164092   

In [24]:
%%stata
// Same cubic age terms, but without the hukou interactions: hukou and the age
// polynomial enter additively.
reg lninc college hukou age age2 age3, vce(cluster provcd)
// Information criteria (AIC/BIC) for this specification
estat ic


. reg lninc college hukou age age2 age3, vce(cluster provcd)

Linear regression                               Number of obs     =      4,137
                                                F(5, 24)          =     130.17
                                                Prob > F          =     0.0000
                                                R-squared         =     0.1304
                                                Root MSE          =     1.1367

                                (Std. err. adjusted for 25 clusters in provcd)
------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   .7483616   .0474037    15.79   0.000     .6505252    .8461981
       hukou |   .2233219   .0657661     3.40   0.002     .0875874    .3590565
         age |   .6884785    .1

In [25]:
## Fuller set of observed controls; ability is still not accounted for
%stata reg lninc college hukou age age2 age3 gender race sibling i.fmedu, vce(cluster provcd)


Linear regression                               Number of obs     =      4,137
                                                F(10, 24)         =     241.99
                                                Prob > F          =     0.0000
                                                R-squared         =     0.1601
                                                Root MSE          =     1.1178

                                (Std. err. adjusted for 25 clusters in provcd)
------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   .7418545   .0426899    17.38   0.000     .6537468    .8299622
       hukou |   .2232857   .0579839     3.85   0.001     .1036129    .3429585
         age |   .6743558    .164432     4.10   0.000     .3349849    1.013727
        age2 |

In [27]:
%%stata

// Split the sample by hukou to illustrate heterogeneity in the causal effect:
// hukou==0 (rural, 2,451 obs) first, then hukou==1 (urban, 1,686 obs).
reg lninc college if hukou==0, vce(cluster provcd)
reg lninc college if hukou==1, vce(cluster provcd)


. 
. // 考虑户口变量,演示因果效应的异质性
. reg lninc college if hukou==0, vce(cluster provcd)

Linear regression                               Number of obs     =      2,451
                                                F(1, 24)          =     477.21
                                                Prob > F          =     0.0000
                                                R-squared         =     0.1018
                                                Root MSE          =      1.238

                                (Std. err. adjusted for 25 clusters in provcd)
------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |    .873513   .0399865    21.85   0.000     .7909849    .9560412
       _cons |   9.249105   .1050262    88.06   0.000     9.032342    9.465869
-------------

In [28]:
%%stata

// Sample share of each hukou group
tab hukou
// Share attending college within each hukou group (hukou==0, then hukou==1)
tab college if hukou==0
tab college if hukou==1


. 
. tab hukou

    3岁时户 |
     口性质 |      Freq.     Percent        Cum.
------------+-----------------------------------
   农村户口 |      2,451       59.25       59.25
   城镇户口 |      1,686       40.75      100.00
------------+-----------------------------------
      Total |      4,137      100.00

. tab college if hukou==0

     是否上 |
       大学 |      Freq.     Percent        Cum.
------------+-----------------------------------
         否 |      1,594       65.03       65.03
         是 |        857       34.97      100.00
------------+-----------------------------------
      Total |      2,451      100.00

. tab college if hukou==1

     是否上 |
       大学 |      Freq.     Percent        Cum.
------------+-----------------------------------
         否 |        900       53.38       53.38
         是 |        786       46.62      100.00
------------+-----------------------------------
      Total |      1,686      100.00

. 


In [31]:
# Unnormalised weight of the hukou==0 stratum: within-stratum college shares
# (34.97% yes x 65.03% no, proportional to the variance of college) times the
# stratum's sample share (59.25%). All three figures come from the tabulations above.
%stata dis 34.97*65.03*59.25

134740.37


In [33]:
# Unnormalised weight of the hukou==1 stratum: within-stratum college shares
# (53.38% no x 46.62% yes, proportional to the variance of college) times the
# stratum's sample share (40.75%). All three figures come from the tabulations above.
%stata dis 53.38*46.62*40.75

101409.46


In [None]:
%%stata
// Without the %%stata cell magic these -dis- lines are parsed as Python and fail.

// Normalise the two stratum weights (134740.37 for hukou==0, 101409.46 for
// hukou==1) so that they sum to one.
dis 134740.37/(134740.37+101409.46)
dis 101409.46/(134740.37+101409.46)
// Variance-weighted average of the stratum-specific college coefficients:
// .873513 is the hukou==0 estimate; .6983452 is presumably the hukou==1
// estimate (its output is truncated in this notebook — confirm).
// The result should be close to the college coefficient from
// -reg lninc college hukou- (.798281).
dis .57057153*.873513+.6983452*.42942847

In [35]:
%%stata
// Regression-adjustment estimator: outcome model for lninc, treatment variable college.
// NOTE(review): these use vce(robust) while the regressions above cluster on provcd — confirm this is intended.
// 1) Default estimand: average treatment effect (ATE).
teffects ra (lninc hukou age age2 age3 gender race sibling i.fmedu) (college), vce(robust)
// 2) atet: average treatment effect on the treated.
teffects ra (lninc hukou age age2 age3 gender race sibling i.fmedu) (college), vce(robust) atet
// 3) Roles swapped: college==1 is declared the control level and college==0 the
//    treatment level, so atet now averages over the college==0 group. Presumably
//    used to obtain the effect on the untreated (with reversed sign) — confirm.
teffects ra (lninc hukou age age2 age3 gender race sibling i.fmedu) (college), vce(robust) control(1) tlevel(0) atet


. // 回归调整估计量
. teffects ra (lninc hukou age age2 age3 gender race sibling i.fmedu) (college)
> , vce(robust)

Iteration 0:   EE criterion =  2.620e-24  
Iteration 1:   EE criterion =  9.251e-30  

Treatment-effects estimation                    Number of obs     =      4,137
Estimator      : regression adjustment
Outcome model  : linear
Treatment model: none
------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
ATE          |
     college |
 (是 vs 否)  |   .7887353   .0373127    21.14   0.000     .7156038    .8618667
-------------+----------------------------------------------------------------
POmean       |
     college |
         否  |   9.413724   .0271812   346.33   0.000      9.36045    9.466998
-----------------------------------------------------------------------

In [36]:
%%stata

// Manual regression adjustment, step 1: fit the outcome model on the
// college==0 observations only.
reg lninc hukou age age2 age3 gender race sibling i.fmedu if college==0
// Make the cell re-runnable: -predict- aborts if y0hat already exists.
// exact makes the check match only a variable literally named y0hat.
capture confirm variable y0hat, exact
if !_rc drop y0hat
// Predicted no-college outcome (the counterfactual) for the college==1 observations.
predict y0hat if college==1


. 
. // 手动实现会回归调整估计量
. reg lninc hukou age age2 age3 gender race sibling i.fmedu if college==0

      Source |       SS           df       MS      Number of obs   =     2,494
-------------+----------------------------------   F(9, 2484)      =     21.72
       Model |  299.291704         9  33.2546338   Prob > F        =    0.0000
    Residual |  3803.21617     2,484  1.53108541   R-squared       =    0.0730
-------------+----------------------------------   Adj R-squared   =    0.0696
       Total |  4102.50787     2,493  1.64561086   Root MSE        =    1.2374

------------------------------------------------------------------------------
       lninc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
       hukou |   .3400725   .0536769     6.34   0.000     .2348163    .4453286
         age |   .5876334   .2255975     2.60   0.009     .1452547    1.030012
        age2 |  -.0140172   .0056

In [38]:
%%stata
// Manual regression adjustment, step 2: fit the outcome model on the
// college==1 observations only.
reg lninc hukou age age2 age3 gender race sibling i.fmedu if college==1
// Make the cell re-runnable: -predict- aborts if y1hat already exists.
// exact makes the check match only a variable literally named y1hat.
capture confirm variable y1hat, exact
if !_rc drop y1hat
// Predicted college outcome (the counterfactual) for the college==0 observations.
predict y1hat if college==0


. reg lninc hukou age age2 age3 gender race sibling i.fmedu if college==1

      Source |       SS           df       MS      Number of obs   =     1,643
-------------+----------------------------------   F(9, 1633)      =     12.17
       Model |  85.7486412         9   9.5276268   Prob > F        =    0.0000
    Residual |  1278.30719     1,633  .782796812   R-squared       =    0.0629
-------------+----------------------------------   Adj R-squared   =    0.0577
       Total |  1364.05584     1,642   .83072828   Root MSE        =    .88476

------------------------------------------------------------------------------
       lninc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
       hukou |   .0484465   .0485173     1.00   0.318    -.0467162    .1436091
         age |   .6205456   .1870398     3.32   0.001     .2536824    .9874087
        age2 |  -.0143115   .0049184    -2.91   0.004 

In [40]:
%%stata

// Make the cell re-runnable: -gen- aborts if the variable already exists.
// exact is essential here: without it, y0 / y1 would match y0hat / y1hat by
// abbreviation and a plain -capture drop- would delete the predictions.
foreach v in y0 y1 effect {
    capture confirm variable `v', exact
    if !_rc drop `v'
}

// Potential outcome without college: observed for college==0, predicted for college==1.
gen y0=lninc if college==0
replace y0=y0hat if college==1

// Potential outcome with college: observed for college==1, predicted for college==0.
gen y1=lninc if college==1
replace y1=y1hat if college==0

// Unit-level effect. Its mean over college==1 is the effect on the treated,
// over college==0 the effect on the untreated, and over everyone the ATE
// (.78873525, matching the -teffects ra- ATE of .7887353).
gen effect=y1-y0
tab college, sum(effect)


. 
. gen y0=lninc if college==0
(1,643 missing values generated)

. replace y0=y0hat if college==1
(1,643 real changes made)

. 
. gen y1=lninc if college==1
(2,494 missing values generated)

. replace y1=y1hat if college==0
(2,494 real changes made)

. 
. gen effect=y1-y0

. tab college, sum(effect)

     是否上 |          Summary of effect
       大学 |        Mean   Std. dev.       Freq.
------------+------------------------------------
         否 |   .86617296   1.2711443       2,494
         是 |    .6711883   .92538643       1,643
------------+------------------------------------
      Total |   .78873525   1.1502225       4,137

. 
