In [1]:
import pickle
from pathlib import Path

import networkx as nx
import pandas as pd
from dowhy import CausalModel

from structure_data import choose_columns, preprocess_data

In [2]:
# --- Notebook configuration -------------------------------------------------
# All paths are relative to ROOT, i.e. the directory the kernel was started in.
ROOT = Path(".")

# Input/metadata locations.
# NOTE(review): RAW_DATA and DICT_PATH are not referenced by any code visible
# in this notebook — confirm whether another cell or module still needs them.
RAW_DATA = ROOT / "raw_data" / "csv" / "eqls_2007and2011.csv"
DICT_PATH = ROOT / "data" / "dictionary.json"

# Pickled networkx graph read by load_graph().
GRAPH_PATH = ROOT / "graphs" / "full_causal.gpickle"

# Column names of the treatment and outcome variables used by the causal model.
TREATMENT = "Y11_Q57"
OUTCOME = "Y11_MWIndex"
def _prep_ordinals_inplace(df: pd.DataFrame, ord_cols, known_orders=None):
    """
    Map ordinal levels to 0..m-1 and scale to [0,1] so L1 equals Gower per-feature.
    known_orders: optional dict {col: [lowest,...,highest]} to control ordering.
    """
    for c in ord_cols:
        if known_orders and c in known_orders:
            order = {v:i for i, v in enumerate(known_orders[c])}
            df[c] = df[c].map(order).astype(float)
            m = max(order.values()) if order else 1
            df[c] = df[c] / (m if m > 0 else 1.0)
        else:
            # infer: if numeric-coded, scale min–max; else sort unique labels
            if pd.api.types.is_numeric_dtype(df[c]):
                lo, hi = df[c].min(), df[c].max()
                rng = (hi - lo) if hi > lo else 1.0
                df[c] = (df[c] - lo) / rng
            else:
                levels = sorted(df[c].dropna().unique().tolist())
                mapping = {v:i for i, v in enumerate(levels)}
                m = max(mapping.values()) if mapping else 1
                df[c] = df[c].map(mapping).astype(float) / (m if m > 0 else 1.0)


def load_data(output_path="data/eqls_processed.csv") -> pd.DataFrame:
    """Load and preprocess the raw EQLS data, saving the result as CSV.

    The frame returned by ``choose_columns()`` is passed through
    ``preprocess_data`` with a fixed set of backdoor variables plus the
    treatment and outcome columns, then written to ``output_path``.

    Parameters
    ----------
    output_path : str or Path, optional
        Destination of the processed CSV. Defaults to the path the notebook
        has always used, relative to the working directory.

    Returns
    -------
    pd.DataFrame
        The preprocessed frame (the same object that was written to disk).
    """
    bdvs = [
        'Y11_EmploymentStatus', 'Y11_HHstructure', 'Y11_HHsize',
        'Y11_Agecategory', 'Y11_Q7', 'Y11_Q31', 'Y11_Country', 'Y11_Q32',
        'Y11_HH2a', TREATMENT, OUTCOME,
    ]
    df = choose_columns()
    df = preprocess_data(
        df,
        na_threshold=0.5,
        impute_strategy="drop",
        treatment_dichotomize_value="median",
        treatment_column=TREATMENT,
        backdoor_variables=bdvs
    )
    # DataFrame.to_csv does not create missing directories; make sure the
    # target folder exists so a fresh checkout does not fail after the
    # (expensive) preprocessing has already run.
    out_path = Path(output_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False)
    return df

def get_schema() -> dict:
    """Return the covariate schema used for matching.

    Returns
    -------
    dict
        ``{'cat': [...], 'ord': [...]}`` — column names treated as pure
        categoricals and as ordinals respectively. A fresh dict (with fresh
        lists) is built on every call.
    """
    # NOTE(review): some 'ord' entries (e.g. Y11_Country, Y11_EmploymentStatus,
    # Y11_HHstructure) look nominal rather than ordinal by name — confirm.
    schema = {}
    schema['cat'] = ['Y11_Q32', 'Y11_Q7']
    schema['ord'] = [
        'Y11_Agecategory',
        'Y11_Country',
        'Y11_EmploymentStatus',
        'Y11_HH2a',
        'Y11_HHsize',
        'Y11_HHstructure',
        'Y11_Q31',
    ]
    return schema

def load_graph() -> nx.DiGraph:
    """Read the pickled causal graph stored at ``GRAPH_PATH``.

    Returns
    -------
    nx.DiGraph
        Whatever object the pickle contains; it is annotated (not checked)
        as a directed graph.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only point GRAPH_PATH at files produced by this project.
    with open(GRAPH_PATH, "rb") as handle:
        causal_graph = pickle.load(handle)
    return causal_graph


In [None]:

# Estimator names passed to CausalModel.estimate_effect, in the order they
# are run and reported.
METHODS = [
    "backdoor.propensity_score_matching",
    "backdoor.propensity_score_weighting",
    "backdoor.propensity_score_stratification",
    "backdoor.linear_regression",
    "backdoor.distance_matching",
]

# Extra keyword arguments per estimator; methods without an entry are called
# with no extra arguments.
DEFAULT_KWARGS = {
    "backdoor.distance_matching": {
        "target_units": "ate",  # alternatives: "att" / "atc"
        "method_params": {
            # Minkowski with p=1 is the L1 (Manhattan) distance.
            "distance_metric": "minkowski",
            "p": 1,
            # Raise for m:1 matching.
            "num_matches_per_unit": 1,
            # Pure categoricals must match exactly instead of by distance.
            "exact_match_cols": get_schema()['cat'],
        },
    },
}


def estimate_effects(model: CausalModel, df: pd.DataFrame, graph, methods: list = METHODS, kwargs: dict = DEFAULT_KWARGS) -> dict:
    """Estimate the causal effect with each of the requested methods.

    Parameters
    ----------
    model : CausalModel
        Model on which the effect is identified and estimated.
    df : pd.DataFrame
        Frame whose ordinal columns (per ``get_schema()['ord']``) are rescaled
        IN PLACE before estimation.
        NOTE(review): this only influences the estimates if ``model`` reads
        from this same DataFrame object — confirm against CausalModel.
    graph
        Not used by this function; kept for call compatibility.
    methods : list
        Estimator names handed to ``model.estimate_effect``.
    kwargs : dict
        Optional per-method keyword arguments, keyed by estimator name.

    Returns
    -------
    dict
        ``{method_name: estimate}`` as floats; a method that raises is
        reported on stdout and recorded as NaN instead of aborting the run.
    """
    ordinal_columns = get_schema()['ord']
    _prep_ordinals_inplace(df, ordinal_columns)

    identified = model.identify_effect()

    effects = {}
    for m in methods:
        try:
            estimate = model.estimate_effect(
                identified,
                method_name=m,
                **kwargs.get(m, {}),
            )
            value = float(estimate.value)
        except Exception as e:
            # Best effort: one failing estimator must not stop the others.
            print(f"[!] Estimation with {m} failed: {e}")
            value = float("nan")
        effects[m] = value
    return effects


In [None]:
# Driver cell: preprocess the data, build the causal model and print one
# effect estimate per method.
df = load_data()
# Work on a copy: estimate_effects() rescales the ordinal columns of the
# frame it receives in place, and `df` should keep the preprocessed values.
df2 = df.copy()
graph = load_graph()
# The networkx graph is serialized to a DOT string (via pydot) for the model.
model = CausalModel(
        data=df2,
        treatment=TREATMENT,    # must be binary {0,1}
        outcome=OUTCOME,
        graph=nx.nx_pydot.to_pydot(graph).to_string(),
    )
# NOTE(review): the model is built from df2 BEFORE the ordinal rescaling that
# estimate_effects() applies to df2; the estimates only see the rescaled
# values if CausalModel keeps a reference to df2 rather than a copy — confirm.
results = estimate_effects(model, df2, graph)

# Report each estimate to four decimals; failed methods print as nan.
print("Estimation results (ATE):")
for name, val in results.items():
    print(f"  {name:<40} {val:.4f}")

[+] Treatment column: Y11_Q57
[+] Outcome column: Y11_MWIndex
[+] Covariate columns: ['Y11_Country', 'Y11_Q31', 'Y11_Agecategory', 'Y11_HH2a', 'Y11_HHstructure', 'Y11_Q42', 'Y11_Q18', 'Y11_Q44', 'Y11_Q32', 'Y11_HHsize', 'Y11_Accommproblems', 'Y11_Q40a', 'Y11_Q40b', 'Y11_Q40c', 'Y11_Q40d', 'Y11_Q40e', 'Y11_Q40f', 'Y11_Q40g', 'Y11_Q7', 'Y11_EmploymentStatus', 'Y11_RuralUrban', 'Y11_Strainbasedconflict', 'Y11_Q15', 'Y11_Q16', 'Y11_Q12a', 'Y11_Q50d', 'Y11_Q53a']
[+] Y11_Country: 0 NA values (0.00%)
[+] Y11_Q31: 514 NA values (0.65%)
[+] Y11_Agecategory: 0 NA values (0.00%)
[+] Y11_HH2a: 0 NA values (0.00%)
[+] Y11_HHstructure: 0 NA values (0.00%)
[+] Y11_Q42: 141 NA values (0.18%)
[+] Y11_Q18: 324 NA values (0.41%)
[+] Y11_Q44: 56769 NA values (71.61%)
[+] Y11_Q32: 501 NA values (0.63%)
[+] Y11_HHsize: 0 NA values (0.00%)
[+] Y11_Accommproblems: 748 NA values (0.94%)
[+] Y11_Q40a: 1181 NA values (1.49%)
[+] Y11_Q40b: 43159 NA values (54.45%)
[+] Y11_Q40c: 630 NA values (0.79%)
[+] Y11_Q40d

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  intercept_parameter = self.model.params[0]


Estimation results (ATE):
  backdoor.propensity_score_matching       6.3595
  backdoor.propensity_score_weighting      7.0995
  backdoor.propensity_score_stratification 6.8997
  backdoor.linear_regression               6.8726
  backdoor.distance_matching               6.2555
