In [None]:
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
from asreview import ASReviewData, ASReviewProject, open_state
from asreview.models.balance import DoubleBalance
from asreview.models.classifiers import NaiveBayesClassifier
from asreview.models.feature_extraction import Tfidf
from asreview.models.query import MaxQuery
from asreview.review import ReviewSimulate

In [None]:
def pad_labels(labels, num_priors, num_records):
    """Extend `labels` with trailing zeros and return the result as a Series.

    Zeros are appended until the result holds `num_records - num_priors`
    entries.

    Parameters
    ----------
    labels
        Object supporting `len()` and `.tolist()` (a pandas Series at the
        call site in this notebook).
    num_priors : int
        Count subtracted from `num_records` when sizing the padding.
    num_records : int
        Total count the padding is sized against.

    Returns
    -------
    pd.Series
        The values of `labels` followed by the zero padding.

    Notes
    -----
    The padding comes from `np.zeros`, so the appended entries are floats and
    `num_records` must be at least `len(labels) + num_priors`.
    """
    n_missing = num_records - len(labels) - num_priors
    padded = labels.tolist()
    padded.extend(np.zeros(n_missing).tolist())
    return pd.Series(padded)


def n_query_extreme(results, n_records):
    """Pick a step size that grows with the number of entries in `results`.

    Presumably the returned value is the number of records to query in the
    next iteration -- confirm against the caller.

    Parameters
    ----------
    results
        Any sized collection; only `len(results)` is used.
    n_records : int
        Selects the schedule: the large-dataset schedule applies from
        10000 records upwards.

    Returns
    -------
    int
        The value of the first schedule entry whose threshold is reached by
        `len(results)`, or 1 when no threshold is reached.
    """
    n_results = len(results)

    # (minimum len(results), returned value), checked from the highest
    # threshold down.
    if n_records >= 10000:
        schedule = (
            (10000, 10**5),  # finish the run
            (1000, 1000),
            (100, 25),
        )
    else:
        schedule = (
            (1000, 100),
            (100, 5),
        )

    for threshold, step in schedule:
        if n_results >= threshold:
            return step
    return 1

In [None]:
# Run an ASReview simulation for the first five studies of every dataset listed
# in the studies file and export each finished project as an `.asreview` file.
studies = pd.read_json("synergy_studies_full_val.jsonl", lines=True)
report_order = sorted(set(studies["dataset_id"]))

# Create the export directory up front; otherwise the first export fails on a
# fresh checkout where the folder does not exist yet.
export_dir = Path("asreview_old_files")
export_dir.mkdir(exist_ok=True, parents=True)

for dataset_name in report_order:
    dataset_studies = studies[studies["dataset_id"] == dataset_name]

    # Moran_2021_corrected is read from its own file; every other dataset is
    # read from the synergy_dataset folder.
    if dataset_name == "Moran_2021_corrected":
        file_path = "./datasets/Moran_2021_corrected_shuffled_raw.csv"
    else:
        file_path = f"./datasets/synergy_dataset/{dataset_name}.csv"
    data_obj = ASReviewData.from_file(file_path)

    dataset_studies = dataset_studies.head(5)
    for i, study in dataset_studies.iterrows():
        # Prior knowledge: the study's prior inclusions followed by its prior exclusions.
        priors = study["prior_inclusions"] + study["prior_exclusions"]

        project_path = Path("asreview_old_tmp", f"{dataset_name}-{i}")
        project_path.mkdir(exist_ok=True, parents=True)

        # Remove the temporary project folder even when the simulation or the
        # export fails, so a re-run does not trip over leftover project files.
        try:
            project = ASReviewProject.create(
                project_path=project_path / "api_simulation",
                project_id="api_example",
                project_mode="simulate",
                project_name="api_example",
            )

            # NOTE(review): the "../../../../" prefix presumably makes the path
            # resolve from inside the project's data folder back to the working
            # directory -- confirm against ASReviewProject.add_dataset.
            project.add_dataset("../../../../" + file_path)

            # Select models to use
            train_model = NaiveBayesClassifier()
            query_model = MaxQuery()
            balance_model = DoubleBalance()
            feature_model = Tfidf()

            # Initialize the simulation reviewer
            reviewer = ReviewSimulate(
                as_data=data_obj,
                model=train_model,
                query_model=query_model,
                balance_model=balance_model,
                feature_model=feature_model,
                n_instances=1,
                project=project,
                n_prior_included=len(study["prior_inclusions"]),
                n_prior_excluded=len(study["prior_exclusions"]),
                prior_indices=priors,
            )

            reviewer.review()
            # Finish and export the project
            project.export(f"{export_dir}/{dataset_name}-{i}.asreview")
        finally:
            # Cleanup files
            shutil.rmtree(project_path)

In [None]:
# For each exported simulation of the selected datasets, read the stored
# results, take the labels of the rows whose training_set is not negative,
# and collect their cumulative sum. All curves are written to a single CSV file.
studies = pd.read_json("synergy_studies_full_val.jsonl", lines=True)
report_order = sorted(set(studies["dataset_id"]))

# Only studies of these datasets contribute to the output file.
selected_datasets = ("Appenzeller-Herzog_2019", "Bos_2018")

recalls_old = []

for dataset_name in report_order:
    dataset_studies = studies[studies["dataset_id"] == dataset_name]
    dataset_studies = dataset_studies.head(5)

    for i, study in dataset_studies.iterrows():
        if study["dataset_id"] not in selected_datasets:
            continue

        priors = study["prior_inclusions"] + study["prior_exclusions"]
        state_file = f"asreview_old_files/{dataset_name}-{i}.asreview"

        with open_state(state_file) as state:
            df = state.get_dataset()
            # Row count is taken before any rows are removed.
            num_records = len(df)

            # Drop the rows with a negative training_set value
            # (presumably the prior records -- confirm against the state file).
            prior_rows = df.index[df["training_set"] < 0]
            df = df.drop(index=prior_rows)

            labels_old = pad_labels(
                df["label"].reset_index(drop=True),
                len(priors),
                num_records,
            )
            recalls_old.append(labels_old.cumsum())

pd.DataFrame(recalls_old).to_csv("recalls_old_1.6.csv", index=False)

0       0
1       0
2       0
3       1
4       0
       ..
2865    0
2866    0
2867    0
2868    0
2869    0
Length: 2870, dtype: int64
0       0
1       1
2       1
3       0
4       0
       ..
2861    0
2862    0
2863    0
2864    0
2865    0
Length: 2866, dtype: int64
0       1
1       0
2       0
3       1
4       0
       ..
2862    0
2863    0
2864    0
2865    0
2866    0
Length: 2867, dtype: int64
0       0
1       0
2       0
3       0
4       0
       ..
2864    0
2865    0
2866    0
2867    0
2868    0
Length: 2869, dtype: int64
0       0
1       0
2       1
3       0
4       1
       ..
2864    0
2865    0
2866    0
2867    0
2868    0
Length: 2869, dtype: int64
0       0
1       0
2       0
3       0
4       0
       ..
4868    0
4869    0
4870    0
4871    0
4872    0
Length: 4873, dtype: int64
0       0
1       0
2       0
3       0
4       0
       ..
4870    0
4871    0
4872    0
4873    0
4874    0
Length: 4875, dtype: int64
0       0
1       0
2       0
3       1
4