In [1]:
%cd ..

/home/bhkuser/bhklab/katy/readii_2_roqc


In [54]:
from damply import dirs
import pandas as pd
from readii.process.subset import getPatientIntersectionDataframes, getOnlyPyradiomicsFeatures
from readii_2_roqc.analysis.predict import load_signature_config
from readii.io.loaders import loadFileToDataFrame
from readii_2_roqc.utils.loaders import load_dataset_config
from readii.process.label import (
    eventOutcomeColumnSetup,
    getPatientIdentifierLabel,
    timeOutcomeColumnSetup,
)

import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

In [61]:
dataset = "HEAD-NECK-RADIOMICS-HN1"
dataset_config, dataset_name, full_data_name = load_dataset_config(dataset)
extract_method = dataset_config['EXTRACTION']['METHOD']


clinical_data = loadFileToDataFrame(dirs.RAWDATA / full_data_name / "clinical" / dataset_config['CLINICAL']['FILE'])
feature_data = loadFileToDataFrame(dirs.RESULTS / full_data_name / "features" / dataset_config['EXTRACTION']['METHOD'] / Path(dataset_config['EXTRACTION']['CONFIG']).stem / "original_full_features.csv")
index_data = loadFileToDataFrame(dirs.PROCDATA / full_data_name / "features" / extract_method / f"{extract_method}_{dataset_name}_index.csv")

In [62]:
existing_pat_id = getPatientIdentifierLabel(clinical_data)

id_map = index_data["SampleID"]
id_map.index = [value[0:-5] for value in index_data["SampleID"]]
id_map = id_map.drop_duplicates()

clinical_data["SampleID"] = clinical_data[existing_pat_id].map(id_map)
# clinical_data["Status_bin"] = clinical_data["Status"].apply(lambda x: 1 if x == "Dead" else 0)

In [63]:
clinical_data.index = clinical_data['SampleID']
feature_data.index = feature_data['SampleID']
clinical_subset, feature_subset = getPatientIntersectionDataframes(clinical_data, feature_data, False, False)

# Volume prediction

In [64]:
from sksurv.linear_model import CoxPHSurvivalAnalysis

outcome_vars = dataset_config['CLINICAL']['OUTCOME_VARIABLES']

volumes = feature_data['original_shape_MeshVolume']
labels = pd.DataFrame()

event_status = clinical_subset[[outcome_vars['event_label']]]
event_time = clinical_subset[[outcome_vars['time_label']]]

labels['survival_event_bool'] = event_time.astype(bool)

if dataset_config['CLINICAL']['OUTCOME_VARIABLES']['convert_to_years']:
    labels['survival_time_years'] = np.round(event_status / 365, 4)



In [65]:
volumes

SampleID
HN1004_0000    121067.833333
HN1006_0001      4529.416667
HN1022_0002     35441.541667
HN1026_0003      4379.875000
HN1029_0004      7559.416667
                   ...      
HN1950_0132     18117.750000
HN1954_0133     29777.083333
HN1968_0134     12641.291667
HN1987_0135     77355.958333
HN1998_0136      6127.375000
Name: original_shape_MeshVolume, Length: 137, dtype: float64

In [66]:
X = volumes.to_numpy().reshape(-1,1)
y = labels.to_records(index=False)
model = CoxPHSurvivalAnalysis().fit(X,y).score(X, y)
model


np.float64(0.6803946803946804)

# HPV Filtering

In [67]:
opc_clinical_data = clinical_data[clinical_data["Ds Site"] == "Oropharynx"]

opc_hpv_clinical_data = opc_clinical_data.dropna(axis=0, subset=["SampleID", "HPV"])
opc_hpv_clinical_data = opc_hpv_clinical_data.set_index("SampleID", drop=True)


feature_data = feature_data.set_index("SampleID", drop=True)
feature_data = getOnlyPyradiomicsFeatures(feature_data)

KeyError: 'Ds Site'

In [None]:
clinical_subset, feature_subset = getPatientIntersectionDataframes(opc_hpv_clinical_data, feature_data, False, False)

In [None]:
outcome_series = clinical_subset['HPV'].apply(lambda x: 1 if x == "Yes, positive" else 0)

# Save out subset data

In [None]:
feature_subset.insert(0, "HPV_bin", outcome_series)

In [None]:
labelled_output_dir = Path(dirs.PROCDATA / "TCIA_RADCURE" / "features" / "labelled" / "HPV_pyradiomics")
labelled_output_dir.mkdir(parents=True, exist_ok=True)
feature_subset.to_csv( labelled_output_dir / "original_full_features.csv", index=True)

In [None]:
clinical_subset.to_csv(dirs.PROCDATA / "TCIA_RADCURE" / "clinical" / "RADCURE_Clinical_OPC_HPV_20250820.csv", index=True)

# Choi et al. signature

In [None]:
signature = load_signature_config(dirs.CONFIG / "signatures/choi_opc_hpv_2020.yaml")

In [None]:
signature

In [None]:
signature_feature_data = feature_subset[signature.index]

In [None]:
from sklearn.linear_model import LogisticRegression, SGD

model = LogisticRegression(solver='liblinear').fit(signature_feature_data, outcome_series)

In [None]:
model.score(signature_feature_data, outcome_series)

In [None]:
model.predict(signature_feature_data)