In [None]:
import warnings
import os
from tqdm import tqdm
import pandas as pd
import random
import econml
from econml.dml import DML, LinearDML, SparseLinearDML, CausalForestDML
import numpy as np
import networkx as nx
from networkx.algorithms import tournament
import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import (Lasso, LassoCV, LogisticRegression,
                                  LogisticRegressionCV, LinearRegression,
                                  MultiTaskElasticNet, MultiTaskElasticNetCV)
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split
from dowhy import CausalModel
# import pgmpy
# from pgmpy.base import DAG
# from pgmpy.metrics import log_likelihood_score, correlation_score, structure_score
# from pgmpy.estimators import BDeuScore, BDsScore, BicScore, K2Score

import myUtils
from myUtils import *


In [None]:
def compute_ate(parent, child, data_df, ref_df, dg, T0, T1, random_state=None):
    """Estimate the average effect of ``parent`` on ``child`` with a LinearDML model.

    The adjustment set is the union of the graph predecessors of ``parent`` and
    of ``child`` (with ``parent`` and ``child`` themselves excluded). The model
    is fitted on ``data_df`` and the average treatment effect of moving the
    treatment from ``T0`` to ``T1`` is evaluated over the rows of ``ref_df``.

    Parameters
    ----------
    parent : hashable
        Column of ``data_df`` used as the treatment; also a node of ``dg``.
    child : hashable
        Column of ``data_df`` used as the outcome; also a node of ``dg``.
    data_df : pandas.DataFrame
        Data used to fit the estimator.
    ref_df : pandas.DataFrame
        Rows over which the effect is averaged. Not consulted when the
        adjustment set is empty, because the fitted effect is then constant.
    dg : graph object exposing ``predecessors(node)``
        Graph from which the adjustment columns are read.
    T0, T1 : scalar
        Baseline and target treatment values.
    random_state : int or None, optional
        Forwarded to ``LinearDML`` so the cross-fitting splits can be made
        reproducible. ``None`` (the default) keeps the previous behaviour.

    Returns
    -------
    The value returned by ``LinearDML.ate``.
    """
    candidates = list(dg.predecessors(parent)) + list(dg.predecessors(child))
    # dict.fromkeys de-duplicates while keeping first-seen order, so the column
    # order is stable across runs (a plain set() orders strings by their
    # per-process hash). The treatment and outcome must never be controls.
    X_cols = [col for col in dict.fromkeys(candidates) if col not in (parent, child)]

    # With no adjustment columns, hand the estimator X=None rather than a
    # zero-column frame.
    X = data_df[X_cols] if X_cols else None
    X_ref = ref_df[X_cols] if X_cols else None
    T = data_df[parent]
    Y = data_df[child]

    est = LinearDML(
        model_y=LinearRegression(),
        model_t=LinearRegression(),
        random_state=random_state,
    )
    est.fit(Y, T, X=X)
    return est.ate(X=X_ref, T0=T0, T1=T1)


In [None]:
# Seed for the row shuffle below so that re-running the notebook yields the
# same row order.
SHUFFLE_SEED = 0

# NOTE(review): os.path.join() is called with no path components, which raises
# TypeError as written -- presumably the graph file location was stripped and
# has to be filled in before this cell can run.
graph_path = os.path.join(
)
with open(graph_path, "r") as fin:
    # One text line per entry; parsed into graph edges by a later cell.
    origin_gfile = fin.read().splitlines()

# NOTE(review): pd.read_csv() likewise has no file argument here -- supply the
# path of the collected-data CSV.
collected_df = pd.read_csv(
)
# Treat +/-inf as missing, drop every row that has a missing value, then
# shuffle the rows and renumber the index. (The former fillna(0) ran after
# dropna() and therefore could never change anything; it has been removed.)
collected_df = (
    collected_df
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
print(f"Collected data shape: {collected_df.shape}")
print(f"Collected data columns: {collected_df.columns}")
# Work on a copy so collected_df stays untouched by later cells.
data_df = collected_df.copy()

# Base names of the three prompt-attribute groups.
instruction_cols = [
    "long", "short", "formal",
    "fluent", "technical", "logical",
]
role_cols = ["student", "programmer", "competitor"]
scenario_cols = ["clearer", "improve", "specify"]

# Names listed as code-metric columns.
cm_cols = [
    "semgrep", "black", "syntaxError_rate",
    "sta_codeBleu", "sta_Bleu",
    "sim_codeBleu", "sim_Bleu",
    "pass_rate", "error_rate", "timeout_rate",
]

# Prefixed names: every instruction, then every role, then every scenario.
meta_cols = [
    prefix + base_name
    for prefix, group in (
        ("Inst_", instruction_cols),
        ("Role_", role_cols),
        ("Scen_", scenario_cols),
    )
    for base_name in group
]

In [None]:
# Parse each "parent -> child" line of the graph file into a directed edge.
trace_list = []
for line_no, edge in enumerate(origin_gfile, start=1):
    if not edge.strip():
        # Tolerate blank lines (e.g. a trailing empty line) instead of failing
        # on tuple unpacking.
        continue
    endpoints = edge.split(' -> ')
    if len(endpoints) != 2:
        # Same exception type the bare unpacking raised, but pointing at the
        # offending line.
        raise ValueError(
            f"Malformed edge on line {line_no} of the graph file: {edge!r}"
        )
    # Strip stray whitespace so node names match the DataFrame column names.
    parent, child = (name.strip() for name in endpoints)
    trace_list.append((parent, child))

dg = nx.DiGraph()
dg.add_edges_from(trace_list)

# Nodes in reverse topological order (descendants before their ancestors).
sorted_nodes = list(nx.topological_sort(dg))
sorted_nodes.reverse()


In [None]:
# NOTE(review): both names are empty strings here -- presumably placeholders
# for a meta column (used as a 0/1 filter below) and an outcome node of `dg`.
meta_node = ""
cm_node = ""

# Ancestors of the outcome node, excluding every name listed in cm_cols.
all_anc = [node for node in nx.ancestors(dg, cm_node) if node not in cm_cols]

if len(all_anc) > 0:
    abs_ate = {}
    count = 0

    # These three values do not depend on the ancestor being evaluated, so they
    # are computed once rather than on every loop iteration.
    T0 = 0
    # NOTE(review): T1 is the mean of the outcome column (cm_node) over rows
    # with meta_node == 1 and is then used as the target *treatment* value for
    # every ancestor -- confirm this is intended rather than the mean of the
    # ancestor's own column.
    T1 = data_df.loc[data_df[meta_node] == 1, cm_node].mean()
    ref_df = data_df[data_df[meta_node] == 0]

    for node in tqdm(sorted_nodes):
        if node not in all_anc:
            continue
        try:
            ate = compute_ate(node, cm_node, data_df, ref_df, dg, T0, T1)
        except Exception as exc:
            # Keep the best-effort behaviour (skip this ancestor), but report
            # the failure instead of hiding it, and let KeyboardInterrupt /
            # SystemExit propagate.
            warnings.warn(
                f"Skipping {node} -> {cm_node}: ATE estimation failed ({exc!r})"
            )
            continue
        abs_ate[node] = ate
        count += 1

    # Largest absolute effect first.
    sorted_ate = sorted(abs_ate.items(), key=lambda kv: abs(kv[1]), reverse=True)

    for anc, ate in sorted_ate:
        print(f"{anc} -> {cm_node}: {ate}")