# Cases for optimization

This is a sample of 1000 studies in which the number of prior-knowledge records is drawn from a geometric distribution (p = 0.5), separately for inclusions and for exclusions. The number of prior records is always capped at the number of inclusions (or exclusions) in the dataset minus 1. This ensures that the task is not already complete before it starts.

In [1]:
import json

import synergy_dataset as sd
import numpy as np
import pandas as pd

In [2]:
# Scaling factor for the full study: that cell samples len(synergy_stats) * N studies.
N = 10

In [3]:
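# One-off preprocessing that created the corrected Moran dataset (kept commented out for reference).
# NOTE: this snippet writes "Moran_corrected.csv", while the loading cell reads
# "Moran_2021_corrected.csv" -- confirm which file name is the intended one.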

# import pandas as pd

# df_shuffle = pd.read_excel("Moran_corrected.xlsx").sample(frac=1, random_state=535)
# df_shuffle["record_id"] = np.arange(len(df_shuffle))
# df_shuffle.to_csv("Moran_corrected.csv", index=False)

In [4]:
# Map every dataset name to the row positions of its included (label 1) and
# excluded (label 0) records. The sampling cells draw prior knowledge from
# these arrays.
synergy_stats = {}


def _positions_by_label(labels):
    """Return the index values of the records labelled 1 and 0 in ``labels``."""
    return {
        "inclusions": labels[labels == 1].index.values,
        "exclusions": labels[labels == 0].index.values,
    }


for dataset in sd.iter_datasets():
    # corrections
    if dataset.name == "Chou_2004":
        # This dataset is left out of the sample.
        continue

    if dataset.name == "Moran_2021":
        # Use the locally corrected copy and register it under its own name.
        df_moran = pd.read_csv("Moran_2021_corrected.csv")
        # Select on the label column. Masking the whole frame
        # (``df_moran[df_moran == 1]``) keeps every row and only blanks out the
        # non-matching cells, so both lists would contain all records.
        # NOTE(review): assumes the corrected CSV has the same "label_included"
        # column as the other datasets -- confirm.
        synergy_stats["Moran_2021_corrected"] = _positions_by_label(
            df_moran["label_included"]
        )
        continue

    # reset_index() gives a positional 0..n-1 index, so the stored values are
    # row positions.
    df = dataset.to_frame().reset_index()

    synergy_stats[dataset.name] = _positions_by_label(df["label_included"])

## Sample the full study

In [5]:
# Full study: draw len(synergy_stats) * N dataset names (with replacement) and
# give every draw its own random prior inclusions and exclusions.
rng = np.random.default_rng(535)


def _draw_priors(generator, candidates):
    """Pick a geometric(0.5) number of records, capped at len(candidates) - 1."""
    n_priors = min(generator.geometric(p=0.5, size=1)[0], len(candidates) - 1)
    return generator.choice(candidates, size=n_priors, replace=False).tolist()


dataset_names = list(synergy_stats.keys())

studies = []
for s in rng.choice(dataset_names, len(synergy_stats) * N):
    # Inclusions are drawn before exclusions so the random stream is consumed
    # in a fixed order and the seeded result stays reproducible.
    drawn_inclusions = _draw_priors(rng, synergy_stats[s]["inclusions"])
    drawn_exclusions = _draw_priors(rng, synergy_stats[s]["exclusions"])
    studies.append(
        {
            "dataset_id": str(s),
            "prior_inclusions": drawn_inclusions,
            "prior_exclusions": drawn_exclusions,
        }
    )

# studies

In [6]:
# Persist the sampled studies as JSON Lines: one JSON object per line.

with open("synergy_studies_full.jsonl", "w") as f:
    f.writelines(json.dumps(study) + "\n" for study in studies)

## Sample the small study

In [7]:
# Small (demo) study: every dataset appears exactly twice, in dictionary order,
# each time with freshly sampled prior inclusions and exclusions.
rng = np.random.default_rng(165)


def _sample_prior_ids(generator, pool):
    """Pick a geometric(0.5) number of records from ``pool``, capped at len(pool) - 1."""
    how_many = min(generator.geometric(p=0.5, size=1)[0], len(pool) - 1)
    picked = generator.choice(pool, size=how_many, replace=False)
    return picked.tolist()


studies = []
for s in list(synergy_stats) * 2:
    stats = synergy_stats[s]
    # Inclusions first, then exclusions, so the seeded random stream is
    # consumed in a fixed order.
    prior_in = _sample_prior_ids(rng, stats["inclusions"])
    prior_out = _sample_prior_ids(rng, stats["exclusions"])
    studies.append(
        {
            "dataset_id": str(s),
            "prior_inclusions": prior_in,
            "prior_exclusions": prior_out,
        }
    )

In [8]:
# Persist the demo studies as JSON Lines: one JSON object per line.

with open("synergy_studies_demo.jsonl", "w") as f:
    f.writelines(json.dumps(study) + "\n" for study in studies)