# Cases for optimization

This is a sample of 1000 studies with geometrically distributed numbers of prior-knowledge records for both inclusions and exclusions. The number of prior-knowledge records is always less than or equal to the number of inclusions or exclusions in the dataset minus 1. This ensures that the task is not already complete before it starts.

In [None]:
import synergy_dataset as sd
import numpy as np

In [None]:
# Number of studies to sample; also embedded in the output file name.
N_STUDIES = 1_000

In [None]:
# One tuple per dataset yielded by `sd.iter_datasets()`:
# (name, n_included, n_excluded, index values of included rows,
#  index values of excluded rows).
synergy_stats = []

for dataset in sd.iter_datasets():
    # A fresh default index makes the index values below plain row numbers.
    df = dataset.to_frame().reset_index()

    # Record counts come from the dataset metadata, not from the frame.
    meta = dataset.metadata["data"]
    n_included = meta["n_records_included"]
    n_excluded = meta["n_records"] - n_included

    labels = df["label_included"]
    included_idx = labels[labels == 1].index.values
    excluded_idx = labels[labels == 0].index.values

    synergy_stats.append(
        (dataset.name, n_included, n_excluded, included_idx, excluded_idx)
    )

# Bare expression: displays the collected statistics as the cell output.
synergy_stats

In [None]:
# Fixed seed so the sampled studies are reproducible.
rng = np.random.default_rng(535165)


def _sample_priors(candidates, n_available):
    """Pick a geometrically distributed number of distinct prior records.

    The number of priors is drawn from a geometric distribution (p=0.5) and
    capped at ``n_available - 1``; the records are then sampled from
    ``candidates`` without replacement and returned as a plain list.
    """
    n_priors = min(rng.geometric(p=0.5, size=1)[0], n_available - 1)
    return rng.choice(candidates, size=n_priors, replace=False).tolist()


def _build_study(stats):
    """Create one study dict from a single ``synergy_stats`` entry."""
    name, n_included, n_excluded, included_idx, excluded_idx = stats
    # Inclusions are drawn before exclusions; the order of the random draws
    # determines the result for a given seed.
    return {
        "dataset_id": name,
        "prior_inclusions": _sample_priors(included_idx, n_included),
        "prior_exclusions": _sample_priors(excluded_idx, n_excluded),
    }


# Choose (with replacement) which dataset each of the N_STUDIES studies uses.
dataset_picks = rng.choice(np.arange(len(synergy_stats)), N_STUDIES)

studies = [_build_study(synergy_stats[s]) for s in dataset_picks]

In [None]:
# Dump the sampled studies to a JSON Lines file: one JSON object per line.

import json

output_path = f"synergy_studies_{N_STUDIES}.jsonl"

with open(output_path, "w") as handle:
    handle.writelines(json.dumps(study) + "\n" for study in studies)
