In [53]:
from pathlib import Path
from tempfile import TemporaryDirectory

import mlflow
import mlflow.artifacts
import numpy as np
import pandas as pd

import cate.dataset as cds
from cate.infra.mlflow import REMOTE_TRACKING_URI
from cate.utils import path_linker


In [52]:
# Point the MLflow client at the project's remote tracking server so that the
# artifact downloads later in this notebook resolve run IDs against it.
mlflow.set_tracking_uri(uri=REMOTE_TRACKING_URI)

In [35]:
# Accumulator for the summary table: maps a statistic name to a list holding
# one value per dataset.
stats = dict()

In [10]:
# Datasets to summarise. Each name is resolved through path_linker, and the
# dataset is loaded from the resulting `.mart` path. ds_list keeps the same
# order as dataset_names.
dataset_names = ("criteo", "hillstorm", "lenta", "megafon")
ds_list = []
for dataset_name in dataset_names:
    mart_path = path_linker(dataset_name).mart
    ds_list.append(cds.Dataset.load(mart_path))

In [36]:
# Basic per-dataset statistics; every list follows the order of ds_list.
# Number of records in each dataset.
stats["count"] = list(map(len, ds_list))
# Size of the second axis of X (presumably the number of explanatory variables).
stats["explanatory_count"] = [dataset.X.shape[1] for dataset in ds_list]
# Mean of y (presumably a binary conversion outcome -- confirm against Dataset).
stats["conversion_rate"] = [np.mean(dataset.y) for dataset in ds_list]
# Mean of w (presumably the treatment-group indicator -- confirm against Dataset).
stats["tg_rate"] = [np.mean(dataset.w) for dataset in ds_list]

In [37]:
# Difference in the mean of y between rows with w == 1 and rows with w == 0,
# computed once per dataset and stored under the "ate" key.
ates = [
    np.mean(dataset.y[dataset.w == 1]) - np.mean(dataset.y[dataset.w == 0])
    for dataset in ds_list
]
stats["ate"] = ates

In [42]:
# One row per dataset (indexed by its name), one column per collected statistic.
df = pd.DataFrame(data=stats, index=dataset_names)

In [47]:
# Persist the summary table as CSV. The output directory is created first so
# the cell also succeeds on a fresh workspace: pandas raises OSError when asked
# to write into a directory that does not exist.
base_stats_path = Path("/workspace/output/datasets_stats/base_stats.csv")
base_stats_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(base_stats_path)

In [None]:
# MLflow run IDs whose "outputs.json" artifacts should be collected.
# Left empty here; fill in the runs of interest before running the download.
run_ids = list()

In [None]:
# Collect the "outputs.json" artifact of every run in run_ids as a DataFrame.
# The downloaded files only live inside the temporary directory, so each one
# is parsed before the directory is removed on exit from the with-block.
dfs = []
with TemporaryDirectory() as tmp_dir:
    for run_id in run_ids:
        # download_artifacts returns the local path of the fetched artifact;
        # read from that instead of rebuilding the path by hand.
        outputs_path = mlflow.artifacts.download_artifacts(
            run_id=run_id, artifact_path="outputs.json", dst_path=tmp_dir
        )
        # Use a distinct name so the stats table bound to `df` earlier in the
        # notebook is not overwritten when this cell runs.
        run_outputs = pd.read_json(outputs_path)
        dfs.append(run_outputs)
