In [None]:
"""Check aggregation of marvin files is correct."""
import shutil
import tarfile

import pandas as pd
import plotly.graph_objects as go

from thesis.config import BLD, HPC_RES

# Id of the HPC job to check; its results are read from HPC_RES / f"{jobid}.tar.gz".
jobid = 17190151

In [None]:
# Extract HPC_RES / f"{jobid}.tar.gz" into a scratch directory, then combine all
# pickled pd.DataFrames found in it into a single pd.DataFrame (`data`).

# Scratch directory: wiped first so leftovers from an aborted run are never read.
tmpdir = HPC_RES / "tmp"

if tmpdir.exists():
    shutil.rmtree(tmpdir)

tmpdir.mkdir()

try:
    # The context manager closes the archive handle even if extraction fails.
    with tarfile.open(HPC_RES / f"{jobid}.tar.gz") as tar:
        # NOTE(review): extractall trusts the member paths stored in the archive.
        # Presumably acceptable because the archive is our own HPC output; consider
        # an extraction filter if the archive could come from elsewhere.
        tar.extractall(tmpdir)

    # Combine all pkl files. Sorting makes the row order of `data` independent of
    # the order in which the file system happens to list the files.
    dfs = []
    for pkl in sorted(tmpdir.rglob("*.pkl")):
        # NOTE(review): unpickling executes arbitrary code; only use on trusted files.
        _df = pd.read_pickle(pkl)
        # get iteration: int between the last "_" and ".pkl" in the file name
        _df["iteration"] = int(pkl.stem.split("_")[-1])
        dfs.append(_df)

    if not dfs:
        # pd.concat would raise a less helpful ValueError on an empty list.
        msg = f"No .pkl files found in {HPC_RES / f'{jobid}.tar.gz'}."
        raise ValueError(msg)

    data = pd.concat(dfs, ignore_index=True)
finally:
    # Always remove the scratch directory, also when reading a file failed.
    shutil.rmtree(tmpdir)

In [None]:
# Restrict the combined results to the rows matching every key/value pair below.
query_dict = {
    "confidence_interval": "subsampling",
    "shape_constraints": "none",
    "mte_monotone": "none",
    "monotone_response": "none",
}

# Build a pandas query string of the form "key == 'value' & key == 'value' & ...".
query_str = " & ".join([f"{k} == '{v}'" for k, v in query_dict.items()])

# Derive the column via .assign, which returns a new frame. Assigning a column
# directly onto the result of .query writes to a filtered slice and triggers
# pandas' SettingWithCopyWarning.
data_plot = data.query(query_str).assign(
    late_complier=lambda df: df["y1_c"] - df["y0_c"],
)

In [None]:
# Columns averaged within each (num_obs, late_complier) cell.
cols = ["sim_ci_lower", "sim_lower_bound", "true_lower_bound", "iteration"]

# Group means first, then turn the group keys back into ordinary columns.
_cell_means = data_plot.groupby(["num_obs", "late_complier"])[cols].mean()
data_grouped = _cell_means.reset_index()

In [None]:
# Plot sim_ci_lower, sim_lower_bound, true_lower_bound against late_complier,
# using the group means computed from the raw HPC results.
fig = go.Figure()

cols_to_plot = ["sim_ci_lower", "sim_lower_bound", "true_lower_bound"]

num_obs_to_plot = [1_000, 10_000]

# Line style encodes the sample size ...
num_obs_to_dash = {
    1_000: "solid",
    10_000: "dash",
}

# ... and color encodes the plotted quantity.
col_to_color = {
    "sim_ci_lower": "green",
    "sim_lower_bound": "blue",
    "true_lower_bound": "red",
}

for num_obs in num_obs_to_plot:
    # The subset depends only on num_obs, so select it once per sample size
    # instead of once per plotted column.
    data_to_plot = data_grouped.query(f"num_obs == {num_obs}")
    for col in cols_to_plot:
        fig.add_trace(
            go.Scatter(
                x=data_to_plot["late_complier"],
                y=data_to_plot[col],
                mode="lines+markers",
                line={"color": col_to_color[col], "dash": num_obs_to_dash[num_obs]},
                name=f"{col} ({num_obs})",
                legendgroup=f"{col}",
            ),
        )

fig.update_layout(
    title="Simulated vs True Lower Bound",
    xaxis_title="Late Complier",
    yaxis_title="Lower Bound",
)

fig.show()

In [None]:
# Calculate coverage for the true parameter: a row counts as covered when the
# simulated lower CI bound is at or below the true lower bound.
# .assign returns a new frame instead of writing into the filtered slice that
# data_plot was created from (which triggers pandas' SettingWithCopyWarning).
data_plot = data_plot.assign(
    coverage=data_plot["sim_ci_lower"] <= data_plot["true_lower_bound"],
)

# Share of covered rows per (num_obs, late_complier) cell.
# NOTE: this rebinds data_grouped, which previously held the mean bounds; re-run
# the grouping cell before re-running the bounds plot.
data_grouped = (
    data_plot.groupby(["num_obs", "late_complier"])["coverage"].mean().reset_index()
)

fig = go.Figure()

for num_obs in num_obs_to_plot:
    data_to_plot = data_grouped.query(f"num_obs == {num_obs}")
    fig.add_trace(
        go.Scatter(
            x=data_to_plot["late_complier"],
            y=data_to_plot["coverage"],
            mode="lines+markers",
            line={"dash": num_obs_to_dash[num_obs]},
            name=f"Coverage ({num_obs})",
        ),
    )

fig.update_layout(
    title="Coverage",
    xaxis_title="Late Complier",
    yaxis_title="Coverage",
)

fig.show()

In [None]:
# Load the aggregated file so the same plot can be drawn from it.
file = BLD / "data" / "pyvmte_simulations" / "combined.pkl"

# Same specification filter as for the raw results, additionally restricted to
# lp_tolerance == "1/n".
# NOTE: this rebinds `data`, which previously held the combined raw HPC results.
data = (
    pd.read_pickle(file)
    .query(query_str)
    .loc[lambda df: df["lp_tolerance"] == "1/n"]
)

In [None]:
# Plot sim_ci_lower, sim_lower_bound, true_lower_bound against late_complier,
# this time straight from the aggregated file (no groupby is applied here).
# NOTE(review): assumes `data` already holds one row per (num_obs, late_complier)
# and that rows are stored in the order the lines should be drawn - TODO confirm.
fig = go.Figure()

for num_obs in num_obs_to_plot:
    # The subset depends only on num_obs, so select it once per sample size
    # instead of once per plotted column.
    data_to_plot = data.query(f"num_obs == {num_obs}")
    for col in cols_to_plot:
        fig.add_trace(
            go.Scatter(
                x=data_to_plot["late_complier"],
                y=data_to_plot[col],
                mode="lines+markers",
                line={"color": col_to_color[col], "dash": num_obs_to_dash[num_obs]},
                name=f"{col} ({num_obs})",
                legendgroup=f"{col}",
            ),
        )

fig.update_layout(
    title="Simulated vs True Lower Bound",
    xaxis_title="Late Complier",
    yaxis_title="Lower Bound",
)

fig.show()