In [54]:
import dowhy.datasets
import numpy as np
import pandas as pd

# 1. Syntethic Data for linear problems:

# With this we generate a syntethic dataset, in order to see wheater our methods work well or not

# The real model is:
# y=b0+b1*v0+b2*w0+b3*w1+b4*w2+b5*w3+b6*w4+[b7@x1[True=1]+b8*x1[True=2]+b9*x1[True=3]+b10*x0 + the inetreaction of all the x's
# with v0] + u

# Basically, here we assume that we observe all factors that might be realated with the treatment and with the outcome
# and that they affect (specificaty problem) linearly the outcome

df = dowhy.datasets.linear_dataset(
    beta=3,  # This is the real coefficient of the treament in the lineal model, effect of the treamtment on the outcome
    # Number of variables that affect both the treatment (are in the error e) and the outcome (cov(v0,Wi) not 0).
    # I guess is the ATE
    num_common_causes=5,
    # This are basically ommited variables, hence if you do y=b0+b1*v0+b2*w0+b3*w1+b4*w2+b5*w3+b6*w4+u via OLS, you got  that
    # Now the b1 consistely estimate b1!!! (which is the beta).
    # All W's are independent from one another
    # Causality :w->v0,w->y.
    # I beleive this are exogenous variables, i.e., once inside the regression, they are not related anymore with the error term...
    num_discrete_common_causes=1,
    # This says one of the W's is discrete
    num_instruments=5,
    # Number of correct instrumental variables for the treatment: Cov(v0, Zi)!=0, Cov(e, zi)=0. Hence, in the regression
    # y=b0+b1*v0+e, this variables are related in covariance with v0, but are in no way related in covariance with e (where in the e we got the W's)
    # Causality :z->v0->Y0. Notice that in standard regression the problem always is: error->v0->Y0<-error, hence now we find some
    # Z that just affects v0 and is not realeted to error...
    # num_frontdoor_variables=1,
    # Basically, this are kind of like IV, but with the difference that now you got the FD0 as a mechanism though which
    # v0 causes Y0. This are endogenous withing the model itself!!!
    # Hence, now you estimate: v0->FD0->Y0.They have their own regression and stuff. Appers to be good for some autoselection problems, though perhaps less than IV. Check:The Book of Why
    num_treatments=1,
    # This is the number of  treatment variable we are intrested in estimate.
    num_samples=1000,
    # num_effect_modifiers=2,
    # This are effect modifiers. Esentially, this is the case when the effect of the treatment depend on specific caracteristics
    # of the individual. Say, if one is a Man or a Woman. This is why, to estimate the B1 correctly, you must include the X's and the
    # interaction of the X's with the treatment v0 (not with W's, since they are exogenous once inside the equation and not realted
    # witn X never)
    # num_discrete_effect_modifiers=1,
    # This will make the effect modifiers take on categorical values, like {0,1,2,3}, hence, in the regresion due to perfect
    # multicolinearity problems, just include indicators variables for 3 of the cases. Patsy or Fomulaic will do this automatically for you
    treatment_is_binary=True,
    # Traeat binarity
    outcome_is_binary=False,
    # Outcome binarity
)

data = df["df"]
y = 33*data.v0+abs(5*data.W0**2+6*np.log(abs(data.W1))-0.9*data.W3**5/(data.W2+5)
                   )**abs(data.W1)-(5*data.W4.astype(float)-data.W2**6+6*data.W1*data.W2)/5


def flatten_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 3 * IQR
    upper_bound = Q3 + 3 * IQR
    df.loc[df[column] < lower_bound, column] = df[column].loc[(
        df[column] >= lower_bound) & (df[column] <= upper_bound)].iloc[0]
    df.loc[df[column] > upper_bound, column] = df[column].loc[(
        df[column] >= lower_bound) & (df[column] <= upper_bound)].iloc[-1]
    return df

# Example usage


y = flatten_outliers(pd.DataFrame(y), 0)

y.describe()
Z0 = data.Z0
Z1 = 33*data.Z1
new_data = data
new_data["y"] = y
new_data["Z0"] = Z0
new_data["Z1"] = Z1
df["df"] = new_data
data = df["df"]
data["v0"] = data["v0"].astype("int")
# Note: The real param Now is beta=33!!!
data.head(), data.v0.describe()

(   Z0      Z1      Z2      Z3     Z4      W0        W1        W2        W3     \
 0  1.0   3.782067  0.0  0.799931  1.0  0.300954  0.318778  0.950390 -0.465867   
 1  1.0  32.989599  0.0  0.421178  1.0 -0.678573 -1.163724  0.677711  0.319513   
 2  0.0  32.715236  0.0  0.348849  1.0  0.745811  0.920205  0.756560 -1.711820   
 3  0.0  12.876439  1.0  0.808515  1.0  0.651129 -0.525222  0.028904 -0.749755   
 4  1.0  29.343564  0.0  0.969060  1.0 -1.018517 -0.803864  1.558629 -0.225619   
 
   W4  v0      y      
 0  3   1  31.591271  
 1  2   1  35.853303  
 2  1   1  35.258663  
 3  1   1  33.340179  
 4  1   1  39.343054  ,
 count    1000.000000
 mean        0.991000
 std         0.094488
 min         0.000000
 25%         1.000000
 50%         1.000000
 75%         1.000000
 max         1.000000
 Name: v0, dtype: float64)

In [57]:
# Probit to take probs

import statsmodels.api as sm
from formulaic import model_matrix
treatment, controls = model_matrix("v0~W0+W1+W2+W3+W4", data)
probit = sm.Probit(treatment, controls).fit()
# predictions
pscore = probit.predict()
# add to previous data
new_data = pd.concat([data, pd.DataFrame(pscore)], axis=1)
new_data.columns.values[-1] = "pscore"

Optimization terminated successfully.
         Current function value: 0.038701
         Iterations 10


In [58]:
import dtale
from dataprep.eda import create_report
# create_report(data).show_browser()
# This works best on the web web. Is good, though i guess the 2 backwards is better

d = dtale.show(new_data.dropna())

tmp = d.data.copy()
tmp['d'] = 4
d.open_browser()
d._data_id  # The process's data identifier.
d._url  # The url to access the process.
# Returns a new reference to the instance running at that data_id.
d2 = dtale.get_instance(d._data_id)
# Prints a list of all ids & urls of running D-Tale sessions.
dtale.instances()

To gain access to an instance object simply pass the value from 'ID' to dtale.get_instance(ID)

ID Name                    URL                   
1       http://DESKTOP-UR5COVK:40000/dtale/main/1
        http://DESKTOP-UR5COVK:40000/dtale/main/1
2       http://DESKTOP-UR5COVK:40000/dtale/main/2
        http://DESKTOP-UR5COVK:40000/dtale/main/2


In [66]:
print(sm.OLS.from_formula("y~v0+pscore",data).fit(cov_type="hc3").summary())
print(sm.OLS.from_formula("y~v0+W0+W1+W2+W3+W4",data).fit(cov_type="hc3").summary())
print(sm.OLS.from_formula("y~v0+pscore+W0+W1+W2+W3+W4",data).fit(cov_type="hc3").summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.033
Model:                            OLS   Adj. R-squared:                  0.031
Method:                 Least Squares   F-statistic:                     12.14
Date:                Mon, 11 Mar 2024   Prob (F-statistic):           6.15e-06
Time:                        07:11:08   Log-Likelihood:                -4114.1
No. Observations:                1000   AIC:                             8234.
Df Residuals:                     997   BIC:                             8249.
Df Model:                           2                                         
Covariance Type:                  hc3                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     78.4866     30.968      2.534      0.0