In [45]:
import json
import pickle
from collections import defaultdict
from pathlib import Path

import networkx as nx
import pandas as pd
from dowhy import CausalModel

In [46]:
# Cluster-level causal graph as an adjacency mapping: each key is a source
# node and its value is the set of nodes it points to. "T" and "Y" are the
# nodes later handed to DoWhy as treatment and outcome.
graph_dict = {
    "Satisfaction": {"Y"},
    "Health": {"Education", "Employment", "Satisfaction", "Y"},
    "Living Environment": {"Satisfaction", "Y"},
    "Employment": {"Satisfaction", "Living Environment", "T"},
    "Education": {"Satisfaction", "Employment"},
    "Demographics": {
        "Satisfaction",
        "Health",
        "Living Environment",
        "Employment",
        "Education",
        "T",
        "Y",
    },
    "T": {"Y", "Living Environment", "Satisfaction"},
    "Y": set(),
}

all_nodes = set(graph_dict.keys())

# Build the NetworkX DiGraph.
# Every key is registered as a node first, so a cluster with an empty target
# set is still part of the graph even if nothing points to it.
G = nx.DiGraph()
G.add_nodes_from(graph_dict)
for src, targets in graph_dict.items():
    # Iterating a set of strings yields a different order on each kernel start
    # (string hashing is randomised); sorting keeps the insertion order of
    # edges, and everything derived from it, stable between runs.
    for tgt in sorted(targets):
        G.add_edge(src, tgt)

# Empty placeholder frame: no rows, one column per graph node, so the column
# names line up with the node names used in the graph.
df = pd.DataFrame(columns=list(G.nodes))

# Serialise the graph to a DOT string via pydot.
dot_str = nx.nx_pydot.to_pydot(G).to_string()

In [47]:
# Load the data dictionary: a JSON object keyed by variable name whose values
# are per-variable metadata mappings (only "variable_group" is read here).
# The file is opened in a context manager so the handle is closed right away,
# and the encoding is pinned instead of relying on the platform default.
# NOTE(review): this is an absolute, machine-specific path, while the other
# cells use paths relative to the notebook ("../figs", "../graphs") — consider
# making this one relative as well.
with open("/home/paperspace/EQLS-Causal-Project/data/dictionary.json", "r", encoding="utf-8") as dictionary_file:
    data_dictionary = json.load(dictionary_file)

# Invert the dictionary: cluster name -> list of variables assigned to it.
# Variables with a missing or empty "variable_group" belong to no cluster.
group_to_vars = defaultdict(list)
for var, info in data_dictionary.items():
    group = info.get("variable_group")
    if group:
        group_to_vars[group].append(var)

# Step 4: Build variable-level DAG according to cluster DAG
disconnected_within = nx.DiGraph()

# Every variable becomes a node, including those without a cluster.
disconnected_within.add_nodes_from(data_dictionary)

# For each edge Cluster_A -> Cluster_B in G, add var_A -> var_B for every
# var_A in Cluster_A and var_B in Cluster_B. Clusters with no variables in the
# dictionary contribute nothing; identical endpoints are skipped so no
# self-loop is ever created.
disconnected_within.add_edges_from(
    (var_a, var_b)
    for cluster_a, cluster_b in G.edges()
    for var_a in group_to_vars.get(cluster_a, [])
    for var_b in group_to_vars.get(cluster_b, [])
    if var_a != var_b
)

# These variables have missing values in the dataset, so they are dropped from
# the graph. remove_nodes_from silently ignores names that are not present.
vars_with_missing_values = [
    'Y11_Q44',
    'Y11_Q40b',
    'Y11_Strainbasedconflict',
    'Y11_Q15',
    'Y11_Q16',
    'Y11_Q12a',
]
disconnected_within.remove_nodes_from(vars_with_missing_values)

# Step 5: Verify acyclicity
assert nx.is_directed_acyclic_graph(disconnected_within), "Graph has cycles!"

In [48]:
# Wrap the cluster-level graph in a DoWhy CausalModel, using the placeholder
# frame and the DOT string, with "T" as treatment and "Y" as outcome.
model = CausalModel(
    data=df,
    treatment="T",
    outcome="Y",
    graph=dot_str,
)

# Run identification on the graph.
identified_estimand = model.identify_effect()

# Report the three variable sets exposed by the estimand, one per line.
for label, variables in (
    ("Backdoor variables:", identified_estimand.backdoor_variables['backdoor']),
    ("Instrumental variables:", identified_estimand.instrumental_variables),
    ("Frontdoor variables:", identified_estimand.frontdoor_variables),
):
    print(label, variables)

Backdoor variables: ['Demographics', 'Employment']
Instrumental variables: []
Frontdoor variables: []


In [49]:
# Visualize the graph
G_viz = nx.nx_agraph.to_agraph(G)
G_viz.draw("../figs/conceptual_causal_model.svg", prog='dot')

In [53]:
# Plot the variable-level graph using pygraphviz and render it as SVG with the
# "dot" layout program.
full_svg_path = Path("../figs/full_causal_model.svg")
# Create the output folder if needed so drawing does not fail on a fresh
# checkout where "../figs" does not exist yet.
full_svg_path.parent.mkdir(parents=True, exist_ok=True)

disconnected_within_viz = nx.nx_agraph.to_agraph(disconnected_within)
# The args string is forwarded to the layout program as command-line flags
# (graph size and ratio, node font size).
disconnected_within_viz.draw(str(full_svg_path), args='-Gsize=5 -Gratio=1.5 -Nfontsize=50', prog='dot')

In [51]:
# Repeat the identification on the variable-level graph, with "Y11_Q57" as
# treatment and "Y11_MWIndex" as outcome. The data frame is an empty
# placeholder with one column per node of the graph.
variable_level_data = pd.DataFrame(columns=disconnected_within.nodes)
variable_level_dot = nx.nx_pydot.to_pydot(disconnected_within).to_string()

variable_level_model = CausalModel(
    data=variable_level_data,
    treatment="Y11_Q57",
    outcome="Y11_MWIndex",
    graph=variable_level_dot,
)
variable_level_estimand = variable_level_model.identify_effect()

# Last expression of the cell: the backdoor variable list is shown as output.
variable_level_estimand.backdoor_variables['backdoor']

['Y11_EmploymentStatus',
 'Y11_HHstructure',
 'Y11_HHsize',
 'Y11_Agecategory',
 'Y11_Q7',
 'Y11_Q31',
 'Y11_Country',
 'Y11_Q32',
 'Y11_HH2a']

In [52]:
# Persist the variable-level graph with pickle so it can be reloaded later.
# NOTE: pickle files can execute arbitrary code when loaded — only unpickle
# this file from a trusted location.
graph_pickle_path = Path('../graphs/full_causal.gpickle')
# Create the output folder if needed so the write does not fail on a fresh
# checkout where "../graphs" does not exist yet.
graph_pickle_path.parent.mkdir(parents=True, exist_ok=True)

with open(graph_pickle_path, 'wb') as f:
    pickle.dump(disconnected_within, f, pickle.HIGHEST_PROTOCOL)