In [1]:
# Change the kernel's working directory to the parent folder (the cell output
# shows it ends up in .../readii_2_roqc, presumably the project root - confirm).
# NOTE: this is relative to the current directory, so re-running the cell in
# the same kernel moves up one more level each time; restart the kernel first.
%cd ..

/home/bhkuser/bhklab/katy/readii_2_roqc


In [None]:
import pandas as pd

from damply import dirs
from pathlib import Path
from readii_2_roqc.utils.loaders import load_dataset_config
from readii_2_roqc.utils.analysis import prediction_data_setup

In [None]:
def label_data_for_prediction(dataset:str,
                              signature:str,
                              split:str | None = None,
                              standard_event_label:str | None = None,
                              standard_time_label:str | None = None,
                              overwrite:bool = False
                              ) -> dict[str,pd.DataFrame]:
    """Set up and save out a dataframe of outcome labels and signature features for model prediction.
    """
    if not standard_event_label and not standard_time_label:
        message = "No outcome label specifications have been set. Must set either standard_event_label or standard_time_label to proceed."
        raise ValueError(message)

    # Load the dataset configuration file and set up the dataset name versions
    dataset_config, _dataset_name, full_data_name = load_dataset_config(dataset)

    # Get image types from results of feature extraction
    # two **/** in the pattern cover the feature type and image type processed
    feature_type = dataset_config['EXTRACTION']['METHOD']
    features = dataset_config['EXTRACTION']['CONFIG'].removesuffix('.yaml')
    image_type_feature_file_list = sorted(Path(dirs.RESULTS / full_data_name / "features").rglob(pattern = f"**/**/{features}/*_features.csv"))

    labelled_data = {}
    outcome_name = str(standard_event_label if standard_event_label else '') + str(f'_{standard_time_label}' if standard_time_label else '')
    for feature_file_path in image_type_feature_file_list:
        image_type = feature_file_path.name.removesuffix('_features.csv')
        
        feature_data, outcome_data = prediction_data_setup(dataset_config,
                                                           feature_file=feature_file_path,
                                                           signature_name=signature,
                                                           split=split,
                                                           standard_event_label=standard_event_label,
                                                           standard_time_label=standard_time_label)
        
        labels_and_features = pd.concat([outcome_data, feature_data], axis=1)

        # Set up path for and save out labelled feature data
        out_path = feature_file_path.parent.parent / "labelled" / f"{signature}_{outcome_name}" / f"{image_type}_labelled_features.csv"

        if not out_path.exists() or overwrite:
            out_path.parent.mkdir(parents=True, exist_ok=True)
            labels_and_features.to_csv(out_path, index=True)

        # Add this labelled data to the dictionary of image types
        labelled_data[image_type] = labels_and_features
    
    return labelled_data 

In [None]:
# Run settings for this notebook: dataset config to load, signature whose
# features are selected, and an optional split (None is passed through as is).
dataset = "RADCURE_windowed"
signature = "choi_opc_hpv_2020"
split = None

# Build (and save, if not already on disk) the labelled feature tables,
# using HPV status as the event outcome label.
labelled_data = label_data_for_prediction(
    dataset=dataset,
    signature=signature,
    split=split,
    standard_event_label="hpv_status",
)