In [1]:
import numpy as np
import pandas as pd
import math
import dowhy
from dowhy import CausalModel

from plotnine import ggplot, aes, geom_boxplot, geom_hline, ylim, xlab, ylab, theme_bw, theme, element_text, ggsave

import logging
logging.getLogger().setLevel(logging.CRITICAL)

In [147]:
def linear_dataset(num_samples,
                      beta, variance_eps, 
                      prob_treatment,
                      prob_outliers,
                      sampling_distribution,
                      square_treatment=False):
    # Sampling mechanism
    if sampling_distribution == "uniform":
        w = np.concatenate((np.random.uniform(0,0.1, int(num_samples/2)), np.random.uniform(0.9,1, int(num_samples/2))))
        w2 = np.random.uniform(0,1, num_samples)
        w3 = np.random.uniform(0,1, num_samples)
        w4 = np.random.uniform(0,1, num_samples)
        w=w2+w3 +np.random.uniform(0,0.2, num_samples)
    else:
        w = np.random.beta(1, 3, num_samples)
    #x = np.random.binomial(n=1, p=w*w2, size=num_samples)     
    x = w+w2 +w3+w4*w + np.random.uniform(0, 0.5, num_samples)
    #w3=10*x
    eps = np.random.normal(0, math.sqrt(variance_eps), num_samples)
    y = (beta)*(x) - 10*w + 10*w2*w3 - 10*w3*w + 10*w4+  eps
    
    df = pd.DataFrame({'treatment':x,"confounder":w, "w2":w2,"w3":w3, "w4":w4, "outcome":y})
    #df["treatment"] = df["treatment"].astype('bool')
    return df

In [172]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
def estimate_effect(model, df):
    df.loc[:, 'treatment'] = 1
    y1 = model.predict(df)
    df.loc[:,'treatment']=0
    y0=model.predict(df)
    return np.mean(y1-y0)

In [173]:
num_simulations=2
num_data_samples=10000
beta=10
var_eps=100
prob_treatment=None
prob_outliers=None
sampling_distr="uniform"
square_treatment=None
estimates = []
for i in range(num_simulations):
    data = linear_dataset(num_samples = num_data_samples,
                             beta=beta,
                             variance_eps=var_eps,
                             prob_treatment = prob_treatment,
                             prob_outliers=prob_outliers,
                             sampling_distribution=sampling_distr,
                             square_treatment=square_treatment)
    print(data)
    
    model= RandomForestRegressor().fit(data[["treatment", "confounder", "w2", "w3", "w4"]], data["outcome"])
    print(estimate_effect(model, data[["treatment", "confounder", "w2", "w3", "w4"]]))
    break
    cmodel = CausalModel(data,
                    treatment="treatment",
                    outcome="outcome",
                    common_causes=['confounder','w2','w3','w4'])
    identified_estimand = cmodel.identify_effect(proceed_when_unidentifiable=True)
    estimate = cmodel.estimate_effect(identified_estimand, method_name = "backdoor.linear_regression")
    print(estimate.estimator.model.summary())
    estimates.append(estimate.value)
print(estimates)

      treatment  confounder        w2        w3        w4    outcome
0      4.002741    1.688478  0.672459  0.951274  0.397823  28.334294
1      4.447327    1.662236  0.748267  0.740127  0.643867  30.456343
2      4.692952    1.474242  0.712632  0.748723  0.941381  20.216946
3      2.381844    0.941237  0.393392  0.389862  0.511976  28.508415
4      2.819261    1.133730  0.592248  0.353053  0.292743  20.662350
...         ...         ...       ...       ...       ...        ...
9995   1.162159    0.419964  0.107169  0.235419  0.594514  16.181771
9996   3.699581    1.287213  0.547282  0.566073  0.885880  15.384491
9997   3.073512    1.324327  0.336897  0.932815  0.039565  20.801911
9998   2.104396    0.905148  0.710852  0.065974  0.410212   0.283615
9999   1.865562    0.679431  0.660775  0.008469  0.306505   4.588520

[10000 rows x 6 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


1.1255522154444633
[]
