## 配置stata

In [8]:
import os
import stata_setup

## 设置pystata目录
stata_setup.config(os.getenv("STATA_SYSDIR"), 'mp')

## 使用魔法指令

### cell magic 基本指令

In [9]:
%%stata
## 导入数据
sysuse auto, clear
list


. ## 导入数据
Unknown #command
. sysuse auto, clear
(1978 automobile data)

. list

     +----------------------------------------------------------------------+
  1. | make              |  price | mpg | rep78 | headroom | trunk | weight |
     | AMC Concord       |  4,099 |  22 |     3 |      2.5 |    11 |  2,930 |
     |----------------------------------------------------------------------|
     |   length   |   turn   |   displa~t   |   gear_r~o    |    foreign    |
     |      186   |     40   |        121   |       3.58    |   Domestic    |
     +----------------------------------------------------------------------+

     +----------------------------------------------------------------------+
  2. | make              |  price | mpg | rep78 | headroom | trunk | weight |
     | AMC Pacer         |  4,749 |  17 |     3 |      3.0 |    11 |  3,350 |
     |----------------------------------------------------------------------|
     |   length   |   turn   |   displa~t   |   gear_r~o    

#### cell magic 指令的参数功能

In [7]:
%%stata -eret myeret -ret myret -sret mysret 
## 将Stata 语句中的e() r()和 s()传递到 Python变量 myeret myret和mysret中
reg mpg price i.foreign


. // 将Stata 语句中的e() r()和 s()传递到 Python变量 myeret myret和mysret中
. reg mpg price i.foreign

      Source |       SS           df       MS      Number of obs   =        74
-------------+----------------------------------   F(2, 71)        =     23.01
       Model |  960.866305         2  480.433152   Prob > F        =    0.0000
    Residual |  1482.59315        71  20.8815937   R-squared       =    0.3932
-------------+----------------------------------   Adj R-squared   =    0.3761
       Total |  2443.45946        73  33.4720474   Root MSE        =    4.5696

------------------------------------------------------------------------------
         mpg | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
       price |   -.000959   .0001815    -5.28   0.000     -.001321    -.000597
             |
     foreign |
    Foreign  |   5.245271   1.163592     4.51   0.000     2.925135    7.565407
       _co

In [10]:
myeret

{'e(N)': 74.0,
 'e(df_m)': 2.0,
 'e(df_r)': 71.0,
 'e(F)': 23.007494485746342,
 'e(r2)': 0.39324012569622946,
 'e(rmse)': 4.569638248831391,
 'e(mss)': 960.8663049714787,
 'e(rss)': 1482.5931544879809,
 'e(r2_a)': 0.3761482982510528,
 'e(ll)': -215.90831771275379,
 'e(ll_0)': -234.39433764823468,
 'e(rank)': 3.0,
 'e(cmdline)': 'regress mpg price i.foreign',
 'e(title)': 'Linear regression',
 'e(marginsprop)': 'minus',
 'e(marginsok)': 'XB default',
 'e(vce)': 'ols',
 'e(depvar)': 'mpg',
 'e(cmd)': 'regress',
 'e(properties)': 'b V',
 'e(predict)': 'regres_p',
 'e(model)': 'ols',
 'e(estat_cmd)': 'regress_estat',
 'e(b)': array([[-9.59034169e-04,  0.00000000e+00,  5.24527100e+00,
          2.56505843e+01]]),
 'e(V)': array([[ 3.29592449e-08,  0.00000000e+00, -1.02918123e-05,
         -2.00142479e-04],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         -0.00000000e+00],
        [-1.02918123e-05,  0.00000000e+00,  1.35394617e+00,
         -3.39072871e-01],
        [-2.0

In [11]:
myret

{'r(PT_has_legend)': 0.0,
 'r(PT_has_cnotes)': 0.0,
 'r(PT_k_ctitles)': 1.0,
 'r(level)': 95.0,
 'r(PT_rseps)': '`""\' `""\' `""\' `""\' `""\'',
 'r(PT_rnotes)': '`""\' `""\' `""\' `""\' `""\'',
 'r(PT_raligns)': '`"right"\' `"right"\' `"right"\' `"level"\' `"right"\'',
 'r(PT_rtitles)': '`"price"\' `""\' `"foreign"\' `"Foreign"\' `"_cons"\'',
 'r(PT_cformats)': '`"%9.0g"\' `"%9.0g"\' `"%8.2f"\' `"%5.3f"\' `"%9.0g"\' `"%9.0g"\'',
 'r(PT_cspans1)': '`"1"\' `"1"\' `"1"\' `"1"\' `"2"\' `"0"\'',
 'r(PT_ctitles1)': '`"Coefficient"\' `"Std. err."\' `"t"\' `"P>|t|"\' `"[95% conf. interval]"\' `""\'',
 'r(PT_corner1)': 'mpg',
 'r(put_tables)': 'PT',
 'r(citype)': 'normal',
 'r(label2)': '(base)',
 'r(_collect_prefix_get)': 'ignore',
 'r(PT)': array([[-9.59034169e-004,  1.81546812e-004, -5.28257235e+000,
          1.33306581e-006, -1.32102823e-003, -5.97040108e-004],
        [ 8.99285457e+307,  8.99285457e+307,  8.99285457e+307,
          8.99285457e+307,  8.99285457e+307,  8.99285457e+307],
  

In [12]:
mysret

{'s(width_col1)': '13', 's(width)': '78'}

In [13]:
# 该指令默认获取和传递的数据将以字典列表的形式储存，我们可以指定索引直接调用其中的任意一项数据。
e_b = myeret['e(b)']
e_b

array([[-9.59034169e-04,  0.00000000e+00,  5.24527100e+00,
         2.56505843e+01]])

In [8]:
%%stata -eret myeret -ret myret -sret mysret
ds

make        rep78         weight        displacement
price         headroom      length        gear_ratio
mpg           trunk         turn          foreign


In [9]:
from pystata import stata
# 把stata数据集全部输出到pd.df
var_list = myret['r(varlist)']
df_stata=stata.pdataframe_from_data(var_list)
df_stata

Unnamed: 0,make,price,mpg,rep78,headroom,trunk,weight,length,turn,displacement,gear_ratio,foreign
0,AMC Concord,4099,22,3.000000e+00,2.5,11,2930,186,40,121,3.58,0
1,AMC Pacer,4749,17,3.000000e+00,3.0,11,3350,173,40,258,2.53,0
2,AMC Spirit,3799,22,8.988466e+307,3.0,12,2640,168,35,121,3.08,0
3,Buick Century,4816,20,3.000000e+00,4.5,16,3250,196,40,196,2.93,0
4,Buick Electra,7827,15,4.000000e+00,4.0,20,4080,222,43,350,2.41,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69,VW Dasher,7140,23,4.000000e+00,2.5,12,2160,172,36,97,3.74,1
70,VW Diesel,5397,41,5.000000e+00,3.0,15,2040,155,35,90,3.78,1
71,VW Rabbit,4697,25,4.000000e+00,3.0,15,1930,155,35,89,3.78,1
72,VW Scirocco,6850,25,4.000000e+00,2.0,16,1990,156,36,97,3.78,1


#### -d（加载外部数据到 Stata ）

In [11]:
# 导入一个外部数据到 Python 中
from sklearn import datasets
import pandas as pd
bos = datasets.load_boston()
boston = pd.DataFrame(bos.data)
boston.columns = bos.feature_names
boston['MEDV'] = bos.target
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [12]:
%%stata -d boston -force
describe

{'pknum': True, 'pkpand': True}

Contains data
 Observations:           506                  
    Variables:            14                  
-------------------------------------------------------------------------------
Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
CRIM            double  %10.0g                
ZN              double  %10.0g                
INDUS           double  %10.0g                
CHAS            double  %10.0g                
NOX             double  %10.0g                
RM              double  %10.0g                
AGE             double  %10.0g                
DIS             double  %10.0g                
RAD             double  %10.0g                
TAX             double  %10.0g                
PTRATIO         double  %10.0g                
B               double  %10.0g                
LSTAT           double  %10.0g   

#### -f（加载多个外部数据到 Stata 中的不同 Frame）

In [13]:
CHAS0 = boston[boston['CHAS']==0]
CHAS1 = boston[boston['CHAS']==1]

CHAS0
CHAS0

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [16]:
%%stata -f CHAS0,CHAS1 -force
frames dir

* CHAS0    471 x 14
* CHAS1    35 x 14
* default  506 x 14

Note: Frames marked with * contain unsaved data.


### line magic 基本指令

In [17]:
%stata sysuse auto, clear

(1978 automobile data)


## 使用 pystata.stata 函数实现交互

In [18]:
from pystata import stata
stata.run('sysuse auto, clear')

(1978 automobile data)


 ### 多行 stata 指令运算

In [19]:
stata.run('''
summarize
reg mpg price i.foreign
ereturn list
''')


. 
. summarize

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
        make |          0
       price |         74    6165.257    2949.496       3291      15906
         mpg |         74     21.2973    5.785503         12         41
       rep78 |         69    3.405797    .9899323          1          5
    headroom |         74    2.993243    .8459948        1.5          5
-------------+---------------------------------------------------------
       trunk |         74    13.75676    4.277404          5         23
      weight |         74    3019.459    777.1936       1760       4840
      length |         74    187.9324    22.26634        142        233
        turn |         74    39.64865    4.399354         31         51
displacement |         74    197.2973    91.83722         79        425
-------------+---------------------------------------------------------
  gear_ratio |       

### pystata.stata 中的交互函数

#### get_return（）、get_ereturn（）和 get_sreturn（）（参数传递）

In [20]:
myeret = stata.get_ereturn()
mysret = stata.get_sreturn()

df_myeret = pd.DataFrame.from_dict(myeret,orient='index')
df_myret = pd.DataFrame.from_dict(myret,orient='index')
df_mysret = pd.DataFrame.from_dict(mysret,orient='index')

df_myeret
df_myret
df_mysret

Unnamed: 0,0
s(width_col1),13
s(width),78


#### stata.pdataframe_from_data()（导出 Stata 当前数据集到 Python）

In [21]:
# 1、默认传递当前 stata 所使用的整个数据集
myauto = stata.pdataframe_from_data()
myauto.head()

Unnamed: 0,make,price,mpg,rep78,headroom,trunk,weight,length,turn,displacement,gear_ratio,foreign
0,AMC Concord,4099,22,3.0,2.5,11,2930,186,40,121,3.58,0
1,AMC Pacer,4749,17,3.0,3.0,11,3350,173,40,258,2.53,0
2,AMC Spirit,3799,22,8.988466e+307,3.0,12,2640,168,35,121,3.08,0
3,Buick Century,4816,20,3.0,4.5,16,3250,196,40,196,2.93,0
4,Buick Electra,7827,15,4.0,4.0,20,4080,222,43,350,2.41,0


In [22]:
# 2、指定参数选择当前 stata 所使用数据集的子集
# 例如，我们可将变量 mpg 和 price 的前10个观测值存储到 pandas DataFrame 中
myauto1 = stata.pdataframe_from_data('mpg price', range(10))
myauto1

Unnamed: 0,mpg,price
0,22,4099
1,17,4749
2,22,3799
3,20,4816
4,15,7827
5,18,5788
6,26,4453
7,20,5189
8,16,10372
9,19,4082


#### stata.pdataframe_to_data()（加载外部数据到 Stata ）

In [23]:
stata.pdataframe_to_data(myauto, force=True)
stata.run('list in 1/3')

{'pknum': True, 'pkpand': True}

     +------------------------------------------------------------------------+
  1. |        make | price | mpg | rep78 | headroom | trunk | weight | length |
     | AMC Concord |  4099 |  22 |     3 |      2.5 |    11 |   2930 |    186 |
     |------------------------------------------------------------------------|
     |     turn     |     displa~t     |     gear_ra~o     |     foreign      |
     |       40     |          121     |     3.5799999     |           0      |
     +------------------------------------------------------------------------+

     +------------------------------------------------------------------------+
  2. |        make | price | mpg | rep78 | headroom | trunk | weight | length |
     |   AMC Pacer |  4749 |  17 |     3 |        3 |    11 |   3350 |    173 |
     |------------------------------------------------------------------------|
     |     turn     |     displa~t     |     gear_ra~o     |     foreign      |
     |

#### 其他 pystata.stata 函数介绍

## Stata Function Interface（sfi） 简介

### sfi 接口函数简介

Stata 的 Python API 官方文档： <https://www.stata.com/python/api17/index.html>

sfi 模块允许用户将 Python 的功能与 Stata 的核心功能进行交互。该模块可以交互使用，也可以在 do 文件和 ado 文件中使用.
在模块中，定义了 Class 用来访问 Stata 特征、当前数据集、Frame、日期和时间、宏、标量、矩阵、值标签、全局Mata矩阵、缺失值等。

### 常用 sfi.Data 函数实示例

In [24]:
%stata sysuse auto, clear
from sfi import Data
price = Data.get(var='price')
price

price1 = pd.Series(price)
price1

Data.renameVar('mpg', 'MileagePerGallon')
%stata list

Data.dropVar("make")
%stata list

Data.keepVar("MileagePerGallon")

%stata list

(1978 automobile data)

     +------------------------------------------------------------------+
  1. | make              |  price | Mileag~n | rep78 | headroom | trunk |
     | AMC Concord       |  4,099 |       22 |     3 |      2.5 |    11 |
     |-------------------+----------------------------------------------|
     | weight  | length  | turn  |  displa~t  |  gear_r~o  |   foreign  |
     |  2,930  |    186  |   40  |       121  |      3.58  |  Domestic  |
     +------------------------------------------------------------------+

     +------------------------------------------------------------------+
  2. | make              |  price | Mileag~n | rep78 | headroom | trunk |
     | AMC Pacer         |  4,749 |       17 |     3 |      3.0 |    11 |
     |-------------------+----------------------------------------------|
     | weight  | length  | turn  |  displa~t  |  gear_r~o  |   foreign  |
     |  3,350  |    173  |   40  |       258  |      2.53  |  Domestic  |
     +-------