In [146]:
import os
import pandas as pd
import stata_setup
from numpy import percentile
from tensorboard.notebook import display

## 设置pystata目录
stata_setup.config(os.getenv("STATA_SYSDIR"), 'mp')

In [35]:
%%stata
use "cfps2010.dta", clear
describe


. 
. use "cfps2010.dta", clear

. describe

Contains data from cfps2010.dta
 Observations:         4,137                  
    Variables:            12                  5 Aug 2022 19:25
-------------------------------------------------------------------------------
Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
pid             double  %12.0g     pid        个人id
provcd          double  %24.0g     provcd     省国标码
gender          double  %12.0g     gender     性别
age             float   %9.0g                 年龄
age2            float   %9.0g                 年龄平方
age3            float   %9.0g                 年龄三次方
lninc           float   %9.0g                 收入对数
college         double  %9.0g      yesorno    是否上大学
hukou           double  %12.0g     hukou      3岁时户口性质
sibling         float   %9.0g      yesorno    是否独生子女
race            double  %9.0g      yes

In [149]:
from sfi import Data

college = Data.get(var='college')
pd.DataFrame(college).value_counts()

0.0    2494
1.0    1643
dtype: int64

In [110]:
from pystata import stata

cfps = stata.pdataframe_from_data()
print(cfps.shape)
cfps.head()

(4137, 12)


Unnamed: 0,pid,provcd,gender,age,age2,age3,lninc,college,hukou,sibling,race,fmedu
0,110003101.0,11.0,1.0,39.0,1521.0,59319.0,11.512925,0.0,0.0,0.0,1.0,2.0
1,110005102.0,11.0,1.0,48.0,2304.0,110592.0,9.392662,0.0,1.0,0.0,1.0,0.0
2,110006101.0,11.0,1.0,50.0,2500.0,125000.0,10.308952,0.0,0.0,0.0,1.0,2.0
3,110009102.0,11.0,1.0,53.0,2809.0,148877.0,9.392662,0.0,1.0,0.0,1.0,0.0
4,110009105.0,11.0,1.0,26.0,676.0,17576.0,8.699514,1.0,1.0,1.0,1.0,1.0


In [72]:
%%stata
## 使用线性回归，不控制任何变量
reg lninc college, vce(cluster provcd)


. ## 使用线性回归，不控制任何变量
Unknown #command
. reg lninc college, vce(cluster provcd)

Linear regression                               Number of obs     =      4,137
                                                F(1, 24)          =     271.17
                                                Prob > F          =     0.0000
                                                R-squared         =     0.1095
                                                Root MSE          =     1.1498

                                (Std. err. adjusted for 25 clusters in provcd)
------------------------------------------------------------------------------
             |               Robust
       lninc | Coefficient  std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |    .823612   .0500155    16.47   0.000     .7203851     .926839
       _cons |   9.353189   .1084703    86.23   0.000     9.129317    9.577061
--------------

In [111]:
## 查其他协变量在匹配上的不平衡程度
cfps[['college', 'hukou', 'sibling','fmedu']].groupby('college').mean().unstack(level=1)

         college
hukou    0.0        0.360866
         1.0        0.478393
sibling  0.0        0.097835
         1.0        0.230676
fmedu    0.0        0.650361
         1.0        0.669507
dtype: float64

In [113]:
## 精确匹配（协变量是户口）， ate 减少了
stata.run('teffects nnmatch (lninc) (college), ate ematch(hukou) nneighbor(1) level(95)')


Treatment-effects estimation                   Number of obs      =      4,137
Estimator      : nearest-neighbor matching     Matches: requested =          1
Outcome model  : matching                                     min =        786
Distance metric: Mahalanobis                                  max =       1594
------------------------------------------------------------------------------
             |              AI robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
ATE          |
     college |
 (是 vs 否)  |   .8021249   .0337823    23.74   0.000     .7359127     .868337
------------------------------------------------------------------------------


In [122]:
## 增加协变量 ematch(hukou age gender race sibling fmedu), 连续性变量 age 会导致一些个案找不到匹配个案
stata.run('teffects nnmatch (lninc) (college), ematch(hukou age gender race sibling fmedu) osample(unmathch)')
stata.run('tab unmathch')


    overlap |
  violation |
  indicator |      Freq.     Percent        Cum.
------------+-----------------------------------
          0 |      3,544       85.67       85.67
          1 |        593       14.33      100.00
------------+-----------------------------------
      Total |      4,137      100.00


Exception in thread Stata:
Traceback (most recent call last):
  File "D:\ProgramData\anaconda3\envs\pystata\lib\threading.py", line 926, in _bootstrap_inner
    self.run()
  File "D:\ProgramData\stata17\utilities\pystata\core\stout.py", line 169, in run
    raise SystemError(output)
SystemError: invalid osample(newvarname) specification; variable unmathch already exists
r(110);




In [125]:
## 精确匹配易遭到维度匹配，导致匹配后的样本量大下降。为了避免损失过多样本，尝试对 age、race和siblling 实施马氏匹配，其他变量依然采用精确匹配
stata.run('teffects nnmatch (lninc age race sibling) (college), ematch(hukou gender fmedu)')


Treatment-effects estimation                   Number of obs      =      4,137
Estimator      : nearest-neighbor matching     Matches: requested =          1
Outcome model  : matching                                     min =          1
Distance metric: Mahalanobis                                  max =         50
------------------------------------------------------------------------------
             |              AI robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
ATE          |
     college |
 (是 vs 否)  |   .7699847    .041446    18.58   0.000     .6887521    .8512174
------------------------------------------------------------------------------


In [126]:
## 将马氏匹配默认的 1对1匹配 换成 1对4匹配，同时使用 1对4 的稳健标准误
stata.run('teffects nnmatch (lninc age race sibling) (college), ematch(hukou gender fmedu) nneighbor(4) vce(robust, nn(4))')


Treatment-effects estimation                   Number of obs      =      4,137
Estimator      : nearest-neighbor matching     Matches: requested =          4
Outcome model  : matching                                     min =          4
Distance metric: Mahalanobis                                  max =         50
------------------------------------------------------------------------------
             |              AI robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
ATE          |
     college |
 (是 vs 否)  |   .7936157   .0392848    20.20   0.000     .7166189    .8706125
------------------------------------------------------------------------------


In [129]:
## 使用偏差估计校正协变量差异
stata.run('teffects nnmatch (lninc age race sibling) (college), ematch(hukou gender fmedu) nneighbor(4) vce(robust, nn(4)) \
          biasadj(age race sibling)')


Treatment-effects estimation                   Number of obs      =      4,137
Estimator      : nearest-neighbor matching     Matches: requested =          4
Outcome model  : matching                                     min =          4
Distance metric: Mahalanobis                                  max =         50
------------------------------------------------------------------------------
             |              AI robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
ATE          |
     college |
 (是 vs 否)  |   .7949957   .0392914    20.23   0.000      .717986    .8720054
------------------------------------------------------------------------------


In [135]:
stata.run('teffects nnmatch (lninc age race sibling) (college), ematch(hukou gender fmedu) nneighbor(4) vce(robust, nn(4)) \
          biasadj(age race sibling) atet')


Treatment-effects estimation                   Number of obs      =      4,137
Estimator      : nearest-neighbor matching     Matches: requested =          4
Outcome model  : matching                                     min =          4
Distance metric: Mahalanobis                                  max =         50
------------------------------------------------------------------------------
             |              AI robust
       lninc | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
ATET         |
     college |
 (是 vs 否)  |   .6993424   .0475923    14.69   0.000     .6060633    .7926215
------------------------------------------------------------------------------


In [139]:
## 计算 干预组和控制组在所有协变量上的不平衡性
%stata imb hukou age gender race sibling fmedu, treatment(college)

(using the scott break method for L1 distance)

Multivariate L1 distance: .42932057

Univariate imbalance:

              L1     mean      min      25%      50%      75%      max
  hukou   .11753   .11753        0        0        0        0        0
    age   .32978  -6.5436        0       -6       -9       -9        0
 gender   .04055  -.04055        0        0        0        0        0
   race   .01072  -.01072        0        0        0        0        0
sibling   .13284   .13284        0        0        0        0        0
  fmedu   .22349   .01915        0        0        1       -1        0


In [155]:
%%stata
// age 分为等间距的 6 类, fmedu不处理, 其他变量按照算法自动分类
cem hukou age (30 35 40 45 50) gender race sibling fmedu(#0), treatment(college)


. // age 分为6类, fmedu不处理, 其他变量按照算法自动分类
. cem hukou age (30 35 40 45 50) gender race sibling fmedu(#0), treatment(colle
> ge)
(using the scott break method for imbalance)

Matching Summary:
-----------------
Number of strata: 206
Number of matched strata: 133

              0     1
      All  2494  1643
  Matched  2409  1583
Unmatched    85    60


Multivariate L1 distance: .20630897

Univariate imbalance:

               L1      mean       min       25%       50%       75%       max
  hukou   8.3e-16   1.1e-16         0         0         0         0         0
    age    .07527    -.1677         0         0         1         0         0
 gender   1.4e-15   6.7e-16         0         0         0         0         0
   race   9.0e-16   1.8e-15         0         0         0         0         0
sibling   2.4e-15  -1.9e-16         0         0         0         0         0
  fmedu   3.7e-16   2.8e-15         0         0         0         0         0

. 


In [160]:
%%stata
// age 改为等规模的 6 分类变量
cem hukou age (#6) gender race sibling fmedu (#0), treatment(college)


. // age 改为等规模的 6 分类变量
. cem hukou age (#6) gender race sibling fmedu (#0), treatment(college)
(using the scott break method for imbalance)

Matching Summary:
-----------------
Number of strata: 179
Number of matched strata: 118

              0     1
      All  2494  1643
  Matched  2421  1579
Unmatched    73    64


Multivariate L1 distance: .22223609

Univariate imbalance:

              L1     mean      min      25%      50%      75%      max
  hukou  5.3e-15  4.6e-15        0        0        0        0        0
    age   .08636  -.24076        0        0        0        0        0
 gender  6.0e-15  7.2e-15        0        0        0        0        0
   race  1.2e-15  2.1e-15        0        0        0        0        0
sibling  4.1e-15  1.7e-15        0        0        0        0        0
  fmedu  7.4e-15  5.3e-15        0        0        0        0        0

. 


In [165]:
## 使用自动化粗化算法, 损失的样本量更多，但是不平衡指数却比之前两种方法有较大幅度的下降
## 这体现了粗化匹配过程的常见现象：排除在外的极端样本越多，匹配样本的平衡性通常月能够得到保证
stata.run('cem hukou age gender race sibling fmedu (#0), treatment(college)')

(using the scott break method for imbalance)

Matching Summary:
-----------------
Number of strata: 369
Number of matched strata: 214

              0     1
      All  2494  1643
  Matched  2330  1523
Unmatched   164   120


Multivariate L1 distance: .09076546

Univariate imbalance:

               L1      mean       min       25%       50%       75%       max
  hukou   1.1e-15  -6.7e-16         0         0         0         0         0
    age    .02518   -.02531         0         0         0         0         0
 gender   1.8e-15  -2.3e-15         0         0         0         0         0
   race   8.7e-18         0         0         0         0         0         0
sibling   1.3e-15  -3.1e-16         0         0         0         0         0
  fmedu   1.6e-15   1.2e-15         0         0         0         0         0


In [167]:
## 使用生成的样本权重信息进行线性回归
%stata reg lninc college [iw=cem_weights]


      Source |       SS           df       MS      Number of obs   =     3,852
-------------+----------------------------------   F(1, 3850)      =    333.41
       Model |  450.866614         1  450.866614   Prob > F        =    0.0000
    Residual |  5207.69062     3,850  1.35264691   R-squared       =    0.0797
-------------+----------------------------------   Adj R-squared   =    0.0797
       Total |  5658.55723     3,851  1.46937347   Root MSE        =    1.1629

------------------------------------------------------------------------------
       lninc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |   .6996738   .0383184    18.26   0.000     .6245475    .7748001
       _cons |    9.47577   .0240912   393.33   0.000     9.428537    9.523003
------------------------------------------------------------------------------


In [188]:
## 加入协变量后，系数没有发生什么变化，主要是因为：粗化精确匹配已经在很大程度上消除了两组在原始协变量上的不平衡性
%stata reg lninc college hukou age gender race sibling i.fmedu [iw=cem_weights]


      Source |       SS           df       MS      Number of obs   =     3,853
-------------+----------------------------------   F(8, 3844)      =     59.25
       Model |  621.146148         8  77.6432685   Prob > F        =    0.0000
    Residual |  5037.41109     3,844  1.31046074   R-squared       =    0.1098
-------------+----------------------------------   Adj R-squared   =    0.1079
       Total |  5658.55723     3,852  1.46899201   Root MSE        =    1.1448

------------------------------------------------------------------------------
       lninc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     college |    .699765   .0377211    18.55   0.000     .6258097    .7737203
       hukou |   .1384669   .0417004     3.32   0.001     .0567099    .2202238
         age |   .0036032    .002706     1.33   0.183    -.0017021    .0089086
      gender |   .3631285   .0378722     9.59   0.

In [242]:
%%stata
use "birthweight.dta", clear
summarize


. use "birthweight.dta", clear
(Excerpt from Cattaneo (2010) Journal of Econometrics 155: 138-154)

. summarize

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
     bweight |      4,642     3361.68    578.8196        340       5500
    mmarried |      4,642    .6996984    .4584385          0          1
       mhisp |      4,642    .0340371    .1813439          0          1
       fhisp |      4,642     .037053     .188912          0          1
     foreign |      4,642    .0534252    .2249042          0          1
-------------+---------------------------------------------------------
     alcohol |      4,642    .0323137    .1768508          0          1
    deadkids |      4,642     .259371    .4383367          0          1
        mage |      4,642    26.50452    5.619026         13         45
        medu |      4,642    12.68957    2.520661          0         17
        fage |      4,

In [243]:
%stata describe bweight mbsmoke mrace


Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
bweight         int     %9.0g                 infant birthweight (grams)
mbsmoke         byte    %9.0g      mbsmoke    1 if mother smoked
mrace           byte    %9.0g                 1 if mother is white


In [244]:
%%stata
// 以mrace为协变量实施精确匹配
teffects nnmatch (bweight) (mbsmoke), ematch(mrace)
teffects nnmatch (bweight) (mbsmoke), ematch(mrace) atet


. // 以mrace为协变量实施精确匹配
. teffects nnmatch (bweight) (mbsmoke), ematch(mrace)

Treatment-effects estimation                   Number of obs      =      4,642
Estimator      : nearest-neighbor matching     Matches: requested =          1
Outcome model  : matching                                     min =        165
Distance metric: Mahalanobis                                  max =       3203
------------------------------------------------------------------------------
             |              AI robust
     bweight | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
ATE          |
     mbsmoke |
    (smoker  |
         vs  |
 nonsmoker)  |  -266.0654    20.9053   -12.73   0.000     -307.039   -225.0918
------------------------------------------------------------------------------

. teffects nnmatch (bweight) (mbsmoke), ematch(mrace) atet

Treatment-effects estimation                   Numbe

In [245]:
%%stata
// 以mage、medu、mrace、nprenatal、mmarried、deadkids、fbaby为协变量实施精确匹配，出现维度诅咒问题
teffects nnmatch (bweight) (mbsmoke), ematch(mage medu mrace nprenatal mmarried deadkids fbaby) osample(overlap)


. // 以mage、medu、mrace、nprenatal、mmarried、deadkids、fbaby为协变量实施精确
> 匹配，出现维度诅咒问题
. teffects nnmatch (bweight) (mbsmoke), ematch(mage medu mrace nprenatal mmarri
> ed deadkids fbaby) osample(overlap)


Exception in thread Stata:
Traceback (most recent call last):
  File "D:\ProgramData\anaconda3\envs\pystata\lib\threading.py", line 926, in _bootstrap_inner
    self.run()
  File "D:\ProgramData\stata17\utilities\pystata\core\stout.py", line 176, in run
    raise SystemError(output)
SystemError: 3678 observations have no exact matches; they are identified in the osample()
variable
r(459);
r(459);




In [281]:
## 3678 observations have no exact matches
%stata tab overlap


    overlap |
  violation |
  indicator |      Freq.     Percent        Cum.
------------+-----------------------------------
          0 |        964       20.77       20.77
          1 |      3,678       79.23      100.00
------------+-----------------------------------
      Total |      4,642      100.00


In [212]:
%%stata
// 以mage、medu、mrace、nprenatal、mmarried、deadkids、fbaby为协变量实施马氏匹配
teffects nnmatch (bweight mage medu mrace nprenatal mmarried deadkids fbaby) (mbsmoke)
teffects nnmatch (bweight mage medu mrace nprenatal mmarried deadkids fbaby) (mbsmoke), atet


. // 以mage、medu、mrace、nprenatal、mmarried、deadkids、fbaby为协变量实施马氏
> 匹配
. teffects nnmatch (bweight mage medu mrace nprenatal mmarried deadkids fbaby) 
> (mbsmoke)

Treatment-effects estimation                   Number of obs      =      4,642
Estimator      : nearest-neighbor matching     Matches: requested =          1
Outcome model  : matching                                     min =          1
Distance metric: Mahalanobis                                  max =         18
------------------------------------------------------------------------------
             |              AI robust
     bweight | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
ATE          |
     mbsmoke |
    (smoker  |
         vs  |
 nonsmoker)  |  -204.7886   28.22883    -7.25   0.000    -260.1161   -149.4611
------------------------------------------------------------------------------

. teffects nnmatch (bweight

In [217]:
%%stata

// 1对4马氏匹配，偏差矫正
teffects nnmatch (bweight mage medu mrace nprenatal mmarried deadkids fbaby) (mbsmoke), nneighbor(4) vce(robust, nn(4)) biasadj(mage medu mrace nprenatal mmarried deadkids fbaby)

teffects nnmatch (bweight mage medu mrace nprenatal mmarried deadkids fbaby) (mbsmoke), nneighbor(4) vce(robust, nn(4)) biasadj(mage medu mrace nprenatal mmarried deadkids fbaby) atet


. 
. // 1对4马氏匹配，偏差矫正
. teffects nnmatch (bweight mage medu mrace nprenatal mmarried deadkids fbaby) 
> (mbsmoke), nneighbor(4) vce(robust, nn(4)) biasadj(mage medu mrace nprenatal 
> mmarried deadkids fbaby)

Treatment-effects estimation                   Number of obs      =      4,642
Estimator      : nearest-neighbor matching     Matches: requested =          4
Outcome model  : matching                                     min =          4
Distance metric: Mahalanobis                                  max =         19
------------------------------------------------------------------------------
             |              AI robust
     bweight | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
ATE          |
     mbsmoke |
    (smoker  |
         vs  |
 nonsmoker)  |  -209.3193   25.22532    -8.30   0.000      -258.76   -159.8786
------------------------------------------------------------

In [218]:
%%stata
// 计算不平衡指数L1
imb mage medu mrace nprenatal mmarried deadkids fbaby, treatment(mbsmoke)


. // 计算不平衡指数L1
. imb mage medu mrace nprenatal mmarried deadkids fbaby, treatment(mbsmoke)

Multivariate L1 distance: .82423889

Univariate imbalance:

                L1     mean      min      25%      50%      75%      max
     mage   .15622  -1.6438        1       -2       -2       -2       -2
     medu   .25757   -1.291        0       -1        0       -3        0
    mrace   .03878  -.03878        0        0        0        0        0
nprenatal   .13299  -1.1007        0       -1       -1       -1        7
 mmarried   .27808  -.27808        0       -1       -1        0        0
 deadkids   .07239   .07239        0        0        0        1        0
    fbaby   .08162  -.08162        0        0        0        0        0

. 


In [219]:
%%stata
// 粗化精确匹配
cem mage medu mrace nprenatal mmarried deadkids fbaby, treatment(mbsmoke)
cem mage medu (9 12) mrace nprenatal mmarried deadkids fbaby, treatment(mbsmoke)


. // 粗化精确匹配
. cem mage medu mrace nprenatal mmarried deadkids fbaby, treatment(mbsmoke)
(using the scott break method for imbalance)

Matching Summary:
-----------------
Number of strata: 1420
Number of matched strata: 314

              0     1
      All  3778   864
  Matched  2075   610
Unmatched  1703   254


Multivariate L1 distance: .69435965

Univariate imbalance:

                 L1      mean       min       25%       50%       75%
     mage    .05551    -.0264         0         0         1         0
     medu    .04097   -.04097         0         0         0         0
    mrace   1.2e-15  -1.7e-15         0         0         0         0
nprenatal    .06863    -.0344         0        -1        -1         0
 mmarried   3.2e-15  -3.8e-15         0         0         0         0
 deadkids   1.6e-15  -1.2e-15         0         0         0         0
    fbaby   3.2e-15  -1.9e-15         0         0         0         0

                max
     mage         0
     medu         0
    

In [227]:
%stata ereturn list


scalars:
                  e(N) =  4642
                 e(n0) =  3778
                 e(n1) =  864
            e(treated) =  1
            e(control) =  0
        e(k_nneighbor) =  4
            e(k_nnmin) =  4
            e(k_nnmax) =  19
           e(k_robust) =  4
           e(k_levels) =  2

macros:
                e(cmd) : "teffects"
            e(cmdline) : "teffects nnmatch (bweight mage medu mrace nprenat.."
            e(predict) : "teffects_p"
          e(estat_cmd) : "teffects_estat"
       e(marginsnotok) : "_ALL"
            e(vcetype) : "AI robust"
                e(vce) : "robust"
             e(depvar) : "bweight"
               e(tvar) : "mbsmoke"
             e(subcmd) : "nnmatch"
           e(mvarlist) : "mage medu mrace nprenatal mmarried deadkids fbaby"
          e(bavarlist) : "mage medu mrace nprenatal mmarried deadkids fbaby"
             e(metric) : "mahalanobis"
               e(stat) : "atet"
              e(title) : "Treatment-effects estimation"
        

In [228]:
%%stata
// 统计分析
reg bweight mbsmoke [iw=cem_weights]
reg bweight mbsmoke mage medu mrace nprenatal mmarried deadkids fbaby [iw=cem_weights]


. // 统计分析
. reg bweight mbsmoke [iw=cem_weights]

      Source |       SS           df       MS      Number of obs   =     3,128
-------------+----------------------------------   F(1, 3126)      =     58.87
       Model |  22246846.3         1  22246846.3   Prob > F        =    0.0000
    Residual |  1.1813e+09     3,126  377905.029   R-squared       =    0.0185
-------------+----------------------------------   Adj R-squared   =    0.0182
       Total |  1.2036e+09     3,127  384898.614   Root MSE        =    614.74

------------------------------------------------------------------------------
     bweight | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     mbsmoke |  -200.3425    26.1114    -7.67   0.000    -251.5397   -149.1453
       _cons |    3363.09   12.52746   268.46   0.000     3338.527    3387.653
------------------------------------------------------------------------------
