In [12]:
# import libraries
import pandas as pd
import statsmodels.api as sm

In [13]:
# Fetch the "Wages" dataset that ships with the R package "plm"
dataset = sm.datasets.get_rdataset("Wages", package="plm")

In [14]:
# Inspect what get_rdataset returned: a statsmodels Dataset wrapper, not the table itself
dataset

<class 'statsmodels.datasets.utils.Dataset'>

In [15]:
# Unwrap the pandas DataFrame held by the statsmodels Dataset wrapper.
# The name `dataset` is rebound here, so guard the unwrap: once `dataset` is
# already a DataFrame, re-running this cell is a no-op instead of raising
# AttributeError on the missing `.data` attribute.
if not isinstance(dataset, pd.DataFrame):
    dataset = dataset.data

In [16]:
# Display the data table (the rendered output elides the middle rows)
dataset

Unnamed: 0,exp,wks,bluecol,ind,south,smsa,married,sex,union,ed,black,lwage
0,3,32,no,0,yes,no,yes,male,no,9,no,5.56068
1,4,43,no,0,yes,no,yes,male,no,9,no,5.72031
2,5,40,no,0,yes,no,yes,male,no,9,no,5.99645
3,6,39,no,0,yes,no,yes,male,no,9,no,5.99645
4,7,42,no,1,yes,no,yes,male,no,9,no,6.06146
...,...,...,...,...,...,...,...,...,...,...,...,...
4160,3,50,no,0,no,yes,no,female,no,12,no,5.95324
4161,4,49,no,0,no,yes,no,female,no,12,no,6.06379
4162,5,50,no,0,no,yes,no,female,no,12,no,6.21461
4163,6,50,no,0,no,yes,no,female,no,12,no,6.29157


##### Description

Name of dataset: Panel Data of Individual Wages

A panel of 595 individuals from 1976 to 1982, taken from the Panel Study of Income Dynamics (PSID).

Dataframe columns explanation:

exp: years of full-time work experience

wks: weeks worked

bluecol: blue collar?

ind: works in a manufacturing industry?

south: resides in the south?

smsa: resides in a standard metropolitan statistical area?

married: married?

sex: a factor with levels "male" and "female"

union: individual wage set by a union contract?

ed: years of education

black: is the individual black?

lwage: logarithm of wage

In [17]:
# Encode each categorical column as 0/1 integer indicators; drop_first keeps
# a single indicator per two-level factor (e.g. sex -> sex_male)
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset

Unnamed: 0,exp,wks,ind,ed,lwage,bluecol_yes,south_yes,smsa_yes,married_yes,sex_male,union_yes,black_yes
0,3,32,0,9,5.56068,0,1,0,1,1,0,0
1,4,43,0,9,5.72031,0,1,0,1,1,0,0
2,5,40,0,9,5.99645,0,1,0,1,1,0,0
3,6,39,0,9,5.99645,0,1,0,1,1,0,0
4,7,42,1,9,6.06146,0,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4160,3,50,0,12,5.95324,0,0,1,0,0,0,0
4161,4,49,0,12,6.06379,0,0,1,0,0,0,0
4162,5,50,0,12,6.21461,0,0,1,0,0,0,0
4163,6,50,0,12,6.29157,0,0,1,0,0,0,0


In [19]:
# Pull out NumPy arrays for the outcome (Y), the treatment indicator (X)
# and the remaining columns, which are used as confounders
Y = dataset["lwage"].to_numpy()
X = dataset["sex_male"].to_numpy()
confounders = dataset.drop(columns=["lwage", "sex_male"]).to_numpy()

In [20]:
# One-time setup: uncomment to install the matching library into the kernel's environment
# %pip install CausalInference==0.1.3

Collecting CausalInference
  Downloading CausalInference-0.1.3-py3-none-any.whl (51 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 KB[0m [31m624.0 kB/s[0m eta [36m0:00:00[0mMB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: CausalInference
Successfully installed CausalInference-0.1.3


In [21]:
# matching estimator (CausalModel.est_via_matching pairs units on their covariates, not on an estimated propensity score)
from causalinference import CausalModel

In [22]:
# Build the causal model: outcome Y, treatment D (the notebook's `X`),
# covariates X (the notebook's `confounders`)
model = CausalModel(Y=Y, D=X, X=confounders)

# Estimate treatment effects by matching, with bias adjustment switched on
model.est_via_matching(bias_adj=True)

# Print the table of estimated effects
print(model.estimates)


Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE      0.276      0.065      4.215      0.000      0.148      0.404
           ATC      0.421      0.062      6.789      0.000      0.299      0.542
           ATT      0.258      0.072      3.566      0.000      0.116      0.399



  return np.linalg.lstsq(X, Y)[0][1:]  # don't need intercept coef
