In [None]:
import os
import shutil
import json
from ast import literal_eval
import pandas as pd

In [None]:
def subset_df_to_remove_transfac(df):
    """Return a copy of ``df`` without the TRANSFAC distillation rows.

    Rows whose ``dataset_name`` equals ``"rf2na_distillation_transfac"`` are
    dropped. The remaining rows keep their original order and index labels,
    and the input frame itself is left untouched.
    """
    keep_mask = df["dataset_name"] != "rf2na_distillation_transfac"
    return df.loc[keep_mask].copy()

def subset_evaluation_set_to_outputs(df, output_df):
    """Return a copy of the rows of ``df`` that also occur in ``output_df``.

    Rows are matched on the ``structure_path`` column: a row of ``df`` is kept
    when its ``structure_path`` is among the ``structure_path`` values of
    ``output_df``. Row order and index labels of ``df`` are preserved.
    """
    output_paths = set(output_df["structure_path"])
    has_output = df["structure_path"].isin(output_paths)
    return df.loc[has_output].copy()

def _ppm_source_for(dataset_name, ppm_path):
    """Return the PPM database label for one PPM path of the given dataset.

    Raises:
        ValueError: If ``dataset_name`` is not a recognized dataset, or if a
            ``"rcsb_cif_na"`` path matches neither known database marker.
    """
    if dataset_name == "rf2na_distillation_cis_bp":
        return "CIS-BP"
    if dataset_name == "rcsb_cif_na":
        # The database is told apart by a marker substring in the path.
        if "jaspar" in ppm_path:
            return "JASPAR"
        if "H11MO" in ppm_path:
            return "HOCOMOCO"
        raise ValueError(f"Unknown PPM source for path {ppm_path}")
    # The dataset itself is unrecognized, so report it rather than blaming
    # the path.
    raise ValueError(
        f"Unknown dataset name {dataset_name!r} for PPM path {ppm_path}"
    )


def prepare_save_info_for_specificity(df):
    """Build the ``(id, PPM identifiers)`` listing for a specificity split.

    Args:
        df: DataFrame with the columns ``id``, ``dataset_name`` and
            ``ppm_paths``. Each ``ppm_paths`` value is the string form of a
            Python literal (parsed with ``ast.literal_eval``) holding a
            sequence of experimentally equivalent groups, each group being a
            sequence of PPM file paths.

    Returns:
        A list with one ``(id, ppm_ids)`` tuple per row, in row order.
        ``ppm_ids`` is a list with one tuple per experimentally equivalent
        group, and each group tuple holds ``(ppm_source, ppm_id)`` pairs.
        ``ppm_id`` is the PPM file name without its last extension.

    Raises:
        ValueError: If a PPM path belongs to an unrecognized dataset, or if
            the PPM database cannot be determined from the path.
    """
    save_info = []
    for _, row in df.iterrows():
        ppm_ids = [
            # One tuple per experimentally equivalent group of PPMs.
            tuple(
                (
                    _ppm_source_for(row.dataset_name, ppm_path),
                    # PPM ID: file name with its last extension removed.
                    os.path.splitext(os.path.basename(ppm_path))[0],
                )
                for ppm_path in ppm_paths_group
            )
            for ppm_paths_group in literal_eval(row.ppm_paths)
        ]
        save_info.append((row.id, ppm_ids))
    return save_info

def write_json_file(path, contents):
    """Serialize ``contents`` as JSON with a 4-space indent into ``path``.

    The file is opened in text write mode, so an existing file at ``path``
    is overwritten.
    """
    with open(path, "w") as json_file:
        json.dump(contents, json_file, indent=4)

### Design Sets

In [None]:
# Read the train / valid / test split CSVs of the design dataset (v2).
design_train_df = pd.read_csv("../data/datasets/design_dataset_v2/train.csv")
design_valid_df = pd.read_csv("../data/datasets/design_dataset_v2/valid.csv")
design_test_df = pd.read_csv("../data/datasets/design_dataset_v2/test.csv")

In [None]:
# Save the ``id`` column of each design split as a JSON list.
write_json_file(path="./design_train.json", contents=design_train_df["id"].tolist())
write_json_file(path="./design_valid.json", contents=design_valid_df["id"].tolist())
write_json_file(path="./design_test.json", contents=design_test_df["id"].tolist())

In [None]:
# For each design evaluation set, keep only the rows whose structure_path
# also occurs in the matching ``*_plot.csv`` evaluation summary.
design_evaluation_valid_df = subset_evaluation_set_to_outputs(
    df=pd.read_csv("../evaluation/evaluation_csvs/design_valid.csv"),
    output_df=pd.read_csv("../evaluation/evaluation_summaries/design_valid_plot.csv"),
)
design_evaluation_test_df = subset_evaluation_set_to_outputs(
    df=pd.read_csv("../evaluation/evaluation_csvs/design_test.csv"),
    output_df=pd.read_csv("../evaluation/evaluation_summaries/design_test_plot.csv"),
)
design_evaluation_rna_monomer_test_df = subset_evaluation_set_to_outputs(
    df=pd.read_csv("../evaluation/evaluation_csvs/design_rna_monomer_test.csv"),
    output_df=pd.read_csv(
        "../evaluation/evaluation_summaries/design_rna_monomer_test_plot.csv"
    ),
)
design_evaluation_pseudoknot_test_df = subset_evaluation_set_to_outputs(
    df=pd.read_csv("../evaluation/evaluation_csvs/design_pseudoknot_test.csv"),
    output_df=pd.read_csv(
        "../evaluation/evaluation_summaries/design_pseudoknot_test_plot.csv"
    ),
)

In [None]:
# Save the ``id`` column of each design evaluation subset as a JSON list.
write_json_file(
    path="design_evaluation_valid.json",
    contents=design_evaluation_valid_df["id"].tolist(),
)
write_json_file(
    path="design_evaluation_test.json",
    contents=design_evaluation_test_df["id"].tolist(),
)
write_json_file(
    path="design_evaluation_rna_monomer_test.json",
    contents=design_evaluation_rna_monomer_test_df["id"].tolist(),
)
write_json_file(
    path="design_evaluation_pseudoknot_test.json",
    contents=design_evaluation_pseudoknot_test_df["id"].tolist(),
)

### Specificity Sets

In [None]:
# Read the specificity dataset (v2) splits, dropping the TRANSFAC
# distillation rows from each one.
specificity_train_df = subset_df_to_remove_transfac(
    df=pd.read_csv("../data/datasets/specificity_dataset_v2/train.csv")
)
specificity_valid_df = subset_df_to_remove_transfac(
    df=pd.read_csv("../data/datasets/specificity_dataset_v2/valid.csv")
)
specificity_test_df = subset_df_to_remove_transfac(
    df=pd.read_csv("../data/datasets/specificity_dataset_v2/test.csv")
)

In [None]:
# Show which dataset names remain in each specificity split.
print(specificity_train_df["dataset_name"].unique())
print(specificity_valid_df["dataset_name"].unique())
print(specificity_test_df["dataset_name"].unique())

In [None]:
# Save the (id, PPM identifiers) listing of each specificity split as JSON.
write_json_file(
    path="./specificity_train.json",
    contents=prepare_save_info_for_specificity(specificity_train_df),
)
write_json_file(
    path="./specificity_valid.json",
    contents=prepare_save_info_for_specificity(specificity_valid_df),
)
write_json_file(
    path="./specificity_test.json",
    contents=prepare_save_info_for_specificity(specificity_test_df),
)

In [None]:
# For each specificity evaluation set, drop the TRANSFAC distillation rows,
# then keep only the rows whose structure_path also occurs in the matching
# ``*_plot.csv`` evaluation summary.
specificity_evaluation_valid_df = subset_evaluation_set_to_outputs(
    df=subset_df_to_remove_transfac(
        df=pd.read_csv("../evaluation/evaluation_csvs/specificity_valid.csv")
    ),
    output_df=pd.read_csv(
        "../evaluation/evaluation_summaries/specificity_valid_plot.csv"
    ),
)
specificity_evaluation_test_df = subset_evaluation_set_to_outputs(
    df=subset_df_to_remove_transfac(
        df=pd.read_csv("../evaluation/evaluation_csvs/specificity_test.csv")
    ),
    output_df=pd.read_csv(
        "../evaluation/evaluation_summaries/specificity_test_plot.csv"
    ),
)

In [None]:
# Show which dataset names remain in each specificity evaluation subset.
print(specificity_evaluation_valid_df["dataset_name"].unique())
print(specificity_evaluation_test_df["dataset_name"].unique())

In [None]:
# Save the (id, PPM identifiers) listing of each specificity evaluation
# subset as JSON.
write_json_file(
    path="./specificity_evaluation_valid.json",
    contents=prepare_save_info_for_specificity(specificity_evaluation_valid_df),
)
write_json_file(
    path="./specificity_evaluation_test.json",
    contents=prepare_save_info_for_specificity(specificity_evaluation_test_df),
)

In [None]:
# Copy the structure file of every CIS-BP distillation row of the specificity
# evaluation test subset into a local folder, keeping each file's base name.
test_distilation_structures_path = "./cis_bp_test_distillation_structures"
os.makedirs(test_distilation_structures_path, exist_ok=True)
for _, row in specificity_evaluation_test_df.iterrows():
    # Rows from any other dataset are skipped.
    if row.dataset_name != "rf2na_distillation_cis_bp":
        continue
    src = row.structure_path
    dst = os.path.join(test_distilation_structures_path, os.path.basename(src))
    shutil.copyfile(src, dst)