"""
Copyright 2026 Zsolt Bedőházi, András M. Biricz

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

### Load training set's biopsy information

In [None]:
# Goal: build a table that links every slide_id to the cancer stage of its biopsy.
slide_biop_df = pd.read_csv("<YOUR_PATH>/v2.1/biopsy-slides.csv")
outcomes_df = pd.read_csv("<YOUR_PATH>/v2.1/outcomes.csv")

# Keep only the slide identifiers and path, then attach the outcomes columns.
# The join is pandas' default inner join on biopsy_id, so slides whose
# biopsy_id has no outcomes row do not appear in the result.
slide_stage_df = pd.merge(
    slide_biop_df[['slide_id', 'biopsy_id', 'slide_path']],
    outcomes_df,
    on="biopsy_id",
)

# map cancer stage to 0 - 4:
# outcomes_df["stage"].unique()
#     ['IA', 'IIB', 'IIA', '0', nan, 'IIIC', 'IV', 'IIIA', 'IIIB', 'IB']
def stage_to_int(stage):
    """Collapse a textual cancer stage into an integer code from 0 to 4.

    Sub-stages share the code of their main stage: "0" -> 0, "IA"/"IB" -> 1,
    "IIA"/"IIB" -> 2, "IIIA"/"IIIB"/"IIIC" -> 3, "IV" -> 4.

    Any other value (including a missing/NaN stage) yields ``np.nan``.
    """
    # The position of each group in this tuple is the integer code it maps to.
    stage_groups = (
        ("0",),
        ("IA", "IB"),
        ("IIA", "IIB"),
        ("IIIA", "IIIB", "IIIC"),
        ("IV",),
    )
    for code, labels in enumerate(stage_groups):
        if stage in labels:
            return code
    return np.nan


# Replace the textual stage with its integer code (NaN where the stage is unknown).
slide_stage_df["stage"] = slide_stage_df["stage"].map(stage_to_int)

# Keep the identifier columns plus the stage, discard every row that has a
# missing value in any of them, and renumber the rows from 0.
labels_df = slide_stage_df[["patient_ngsci_id", "slide_id", "biopsy_id", "stage"]].copy()
labels_df = labels_df.dropna(how="any").reset_index(drop=True)
# With the NaN rows gone the stage codes can be stored as integers.
labels_df["stage"] = labels_df["stage"].astype(int)

# Order the rows by slide_id; the index was just reset, so positional
# selection picks the same rows as label-based selection would.
sort_idx = np.argsort(labels_df["slide_id"].values)
labels_df = labels_df.iloc[sort_idx].reset_index(drop=True)

labels_df.head(5)

In [None]:
np.unique( labels_df.patient_ngsci_id ).shape

In [None]:
np.unique( labels_df.biopsy_id ).shape

In [None]:
# Group the slide rows by biopsy, keeping biopsies in order of first appearance.
gby_temp = list(labels_df.groupby('biopsy_id', sort=False))

# One slot per biopsy, each holding the index labels of that biopsy's slides.
# The slots are filled one by one on purpose: handing the list of index arrays
# to np.array(..., dtype=object) produces a 2-D array (instead of a 1-D array
# of arrays) whenever every biopsy happens to have the same number of slides.
biopsy_idx_to_slide_idx = np.empty(len(gby_temp), dtype=object)
for biopsy_pos, (biopsy_key, biopsy_rows) in enumerate(gby_temp):
    biopsy_idx_to_slide_idx[biopsy_pos] = biopsy_rows.index.values

In [None]:
biopsy_idx_to_slide_idx[:3]

In [None]:
# One row per biopsy with the stage of its first slide (biopsies stay in
# order of first appearance because sort=False).
biopsy_df_1 = (
    labels_df
    .groupby(["biopsy_id"], sort=False)
    .agg({'stage': lambda stages: stages.tolist()[0]})
    .reset_index()
)
biopsy_df_1.head()

In [None]:
# One row per biopsy with the patient id of its first slide.
biopsy_df_2 = (
    labels_df
    .groupby(["biopsy_id"], sort=False)
    .agg({'patient_ngsci_id': lambda patients: patients.tolist()[0]})
    .reset_index()
)
biopsy_df_2.head()

In [None]:
# One row per biopsy with the list of all slide ids that belong to it.
biopsy_df_3 = (
    labels_df
    .groupby(["biopsy_id"], sort=False)
    .agg({'slide_id': lambda slides: slides.tolist()})
    .reset_index()
)
biopsy_df_3.head()

In [None]:
# Combine the per-biopsy stage with the per-biopsy patient id.
biopsy_df = pd.merge(biopsy_df_1, biopsy_df_2, on=["biopsy_id"])
biopsy_df.shape

In [None]:
# Add the per-biopsy list of slide ids.
biopsy_df = pd.merge(biopsy_df, biopsy_df_3, on=["biopsy_id"])
biopsy_df.shape

In [None]:
biopsy_df.head()

## Locate metadata files

In [None]:
root_dir = '<YOUR_PATH>/v2.1/'

In [None]:
# Alphabetically sorted listing of every entry in root_dir. Later cells pick
# files by position in this list (files_csv[1], files_csv[4], ...), so those
# indices are only valid for the directory contents the notebook was written
# against; check the printed names before trusting a load.
# NOTE(review): os.listdir returns all entries, not only CSV files, despite
# the variable name.
files_csv = sorted( os.listdir(root_dir) )
files_csv

## Cancer dx csv

In [None]:
# Load the second entry of the sorted directory listing; the printed name
# should be the cancer-diagnosis CSV (per this section's heading) — verify it.
print(files_csv[1])
# os.path.join yields the same path as root_dir+name when root_dir ends with a
# separator, and still builds a valid path when it does not.
cancer_dx_df = pd.read_csv(os.path.join(root_dir, files_csv[1]))
cancer_dx_df.shape

In [None]:
cancer_dx_df.isna().values.sum() # check for NaNs

In [None]:
cancer_dx_df.head()

In [None]:
# Per patient, collect the list of ICD-9 codes and the list of diagnosis dates
# (patients stay in order of first appearance because sort=False).
# Both list columns come from a single groupby/agg, which gives the same table
# as grouping twice and merging the two results on patient_ngsci_id.
cancer_dx_df_biopsy_1 = (
    cancer_dx_df
    .groupby(["patient_ngsci_id"], sort=False)
    .agg({'icd9': lambda x: x.tolist(), 'dx_dt': lambda x: x.tolist()})
    .reset_index()
)
cancer_dx_df_biopsy_1.head()

In [None]:
# Per patient, collect the list of ICD-10 codes and the list of diagnosis dates
# (patients stay in order of first appearance because sort=False).
# Both list columns come from a single groupby/agg, which gives the same table
# as grouping twice and merging the two results on patient_ngsci_id.
cancer_dx_df_biopsy_2 = (
    cancer_dx_df
    .groupby(["patient_ngsci_id"], sort=False)
    .agg({'icd10': lambda x: x.tolist(), 'dx_dt': lambda x: x.tolist()})
    .reset_index()
)
cancer_dx_df_biopsy_2.head()

In [None]:
# Add the ICD-10 lists to the ICD-9 / date table; only the icd10 column is
# taken from the second table, so dx_dt is not duplicated.
cancer_dx_df_biopsy = pd.merge(
    cancer_dx_df_biopsy_1,
    cancer_dx_df_biopsy_2[['patient_ngsci_id', 'icd10']],
    on=["patient_ngsci_id"],
)
cancer_dx_df_biopsy.shape

In [None]:
cancer_dx_df_biopsy.head()

In [None]:
np.array( cancer_dx_df_biopsy.iloc[0]['icd9'] ).shape, np.array( cancer_dx_df_biopsy.iloc[0]['dx_dt'] ).shape

## Comorbidities csv

In [None]:
# Load the fifth entry of the sorted directory listing; the printed name
# should be the comorbidities CSV (per this section's heading) — verify it.
print(files_csv[4])
# os.path.join yields the same path as root_dir+name when root_dir ends with a
# separator, and still builds a valid path when it does not.
comorbidities_df = pd.read_csv(os.path.join(root_dir, files_csv[4]))
comorbidities_df.shape

In [None]:
comorbidities_df.isna().values.sum() # check for NaNs

In [None]:
comorbidities_df.head(3)

#### Takeaways:

- `cancer` and `metastatic_cancer` can serve as target variables
- the other columns can be used as inputs for a trial fit, or taken into account to keep the CV splits homogeneous

## Demographics csv

In [None]:
# Load the sixth entry of the sorted directory listing; the printed name
# should be the demographics CSV (per this section's heading) — verify it.
print(files_csv[5])
# os.path.join yields the same path as root_dir+name when root_dir ends with a
# separator, and still builds a valid path when it does not.
demographics_df = pd.read_csv(os.path.join(root_dir, files_csv[5]))
demographics_df.shape

In [None]:
demographics_df.isna().values.sum() # total number of NaN cells in the whole table

In [None]:
plt.pcolormesh( demographics_df.isna().values )

In [None]:
demographics_df.iloc[:,:4].isna().values.sum() # check for NaNs

In [None]:
demographics_df.head()

In [None]:
uqs, cs = np.unique( demographics_df.ethnicity.values, return_counts=True )
uqs, cs

In [None]:
uqs, cs = np.unique( demographics_df.race.values, return_counts=True )
uqs, cs

In [None]:
uqs, cs = np.unique( demographics_df.sex.values, return_counts=True )
uqs, cs

#### Takeaways:

- These can be taken into account to keep the CV splits homogeneous

## Outcomes

In [None]:
# Load the eighth entry of the sorted directory listing; the printed name
# should be the outcomes CSV (per this section's heading) — verify it.
# This rebinds outcomes_df, which was already loaded at the top of the notebook.
print(files_csv[7])
# os.path.join yields the same path as root_dir+name when root_dir ends with a
# separator, and still builds a valid path when it does not.
outcomes_df = pd.read_csv(os.path.join(root_dir, files_csv[7]))
outcomes_df.shape

In [None]:
outcomes_df.head()

In [None]:
outcomes_df.isna().values.sum(0) # check for NaNs

In [None]:
outcomes_df.death_dt.isna().sum() # check for NaNs

In [None]:
outcomes_df.strict_metastatic_dx_dt.isna().sum() # check for NaNs

In [None]:
outcomes_df.stage.isna().values.sum(0) # check for NaNs

In [None]:
uqs, cs = np.unique( outcomes_df.strict_metastatic_dx.values, return_counts=True )
uqs, cs # when there is a metastatic dx then there is a date for that otherwise NaN

In [None]:
outcomes_df.strict_metastatic_dx_dt[ outcomes_df.strict_metastatic_dx.values > 0 ]

In [None]:
np.unique( outcomes_df.stage.values.astype(str) )

In [None]:
uqs, cs = np.unique( (outcomes_df.stage[ outcomes_df.strict_metastatic_dx.values > 0 ]).astype(str), return_counts=True )
uqs, cs

## Pathology items 

In [None]:
# Load the ninth entry of the sorted directory listing; the printed name
# should be the pathology CSV (per this section's heading) — verify it.
print(files_csv[8])
# os.path.join yields the same path as root_dir+name when root_dir ends with a
# separator, and still builds a valid path when it does not.
pathology_df = pd.read_csv(os.path.join(root_dir, files_csv[8]))
pathology_df.shape

In [None]:
pathology_df.head(2)

In [None]:
list(pathology_df), pathology_df.isna().values.sum(0) # check for NaNs 

In [None]:
pathology_df.grade_clinical.unique()

In [None]:
pathology_df.grade_pathological.unique()

In [None]:
uqs, cs = np.unique( pathology_df.er_summary.values, return_counts=True )
uqs, cs

In [None]:
uqs, cs = np.unique( pathology_df.pr_summary.values, return_counts=True )
uqs, cs

In [None]:
uqs, cs = np.unique( pathology_df.her2_summary.values, return_counts=True )
uqs, cs

In [None]:
uqs, cs = np.unique( pathology_df.multigene_signature_method.values, return_counts=True )
uqs, cs

In [None]:
pathology_df.multigene_signature_result.unique()

In [None]:
uqs, cs = np.unique( pathology_df.response_neoadjuv_therapy, return_counts=True )
uqs, cs

## Social determinants

In [None]:
# Load the tenth entry of the sorted directory listing; the printed name
# should be the social-determinants CSV (per this section's heading) — verify it.
print(files_csv[9])
# os.path.join yields the same path as root_dir+name when root_dir ends with a
# separator, and still builds a valid path when it does not.
social_df = pd.read_csv(os.path.join(root_dir, files_csv[9]))
social_df.shape

In [None]:
social_df.head()

In [None]:
social_df.isna().values.sum(0) # check for NaNs

## Treatments

In [None]:
# Load the eleventh entry of the sorted directory listing; the printed name
# should be the treatments CSV (per this section's heading) — verify it.
print(files_csv[10])
# os.path.join yields the same path as root_dir+name when root_dir ends with a
# separator, and still builds a valid path when it does not.
treatments_df = pd.read_csv(os.path.join(root_dir, files_csv[10]))
treatments_df.shape

In [None]:
treatments_df.head(3)

In [None]:
pd.DataFrame( treatments_df.isna().values.sum(0).reshape(1,-1), columns=list(treatments_df), ) # check for NaNs

In [None]:
uqs, cs = np.unique( treatments_df.most_definitive_surgical_procedure_cd, return_counts=True )
uqs, cs

In [None]:
uqs, cs = np.unique( treatments_df.most_definitive_radiation_modality_cd, return_counts=True )
uqs, cs

In [None]:
uqs, cs = np.unique( treatments_df.surgical_margin_cd, return_counts=True )
uqs, cs

In [None]:
uqs, cs = np.unique( treatments_df.radiation_summ_cd, return_counts=True )
uqs, cs

In [None]:
uqs, cs = np.unique( treatments_df.chemo_summ_cd, return_counts=True )
uqs, cs

In [None]:
uqs, cs = np.unique( treatments_df.immuno_therapy_cd, return_counts=True )
uqs, cs

In [None]:
uqs, cs = np.unique( treatments_df.hormone_summ_cd, return_counts=True )
uqs, cs

## Merge some info together

#### cancer dx

In [None]:
# Start the merged table from the per-biopsy frame and attach each patient's
# diagnosis lists. The left join keeps every biopsy, including those whose
# patient has no entry in the diagnosis table.
merged_df = pd.merge(
    biopsy_df,
    cancer_dx_df_biopsy,
    on='patient_ngsci_id',
    how='left',
)
merged_df.shape

In [None]:
merged_df.head(2)

#### comorbidities

In [None]:
# Attach the two cancer-related comorbidity columns
# (pandas' default inner join on biopsy_id).
merged_df = pd.merge(
    merged_df,
    comorbidities_df[['biopsy_id', 'cancer', 'metastatic_cancer']],
    on='biopsy_id',
)
merged_df.shape

In [None]:
merged_df.head(2)

#### demographics

In [None]:
# Attach race, ethnicity and birth date
# (pandas' default inner join on biopsy_id).
merged_df = pd.merge(
    merged_df,
    demographics_df[['biopsy_id', 'race', 'ethnicity', 'birth_dt']],
    on='biopsy_id',
)
merged_df.shape

In [None]:
merged_df.head(2)

#### outcomes

In [None]:
# Attach the outcome columns: case year, biopsy date, mortality and
# metastatic-diagnosis flags with their dates
# (pandas' default inner join on biopsy_id).
merged_df = pd.merge(
    merged_df,
    outcomes_df[[
        'biopsy_id',
        'case_year',
        'biopsy_dt',
        'mortality',
        'death_dt',
        'strict_metastatic_dx',
        'strict_metastatic_dx_dt',
    ]],
    on='biopsy_id',
)
merged_df.shape

In [None]:
merged_df.head(2)

In [None]:
# Attach the pathology fields: grades, receptor summaries, multigene
# signature and neoadjuvant-therapy response
# (pandas' default inner join on biopsy_id).
merged_df = pd.merge(
    merged_df,
    pathology_df[[
        'biopsy_id',
        'grade_clinical',
        'grade_pathological',
        'er_summary',
        'pr_summary',
        'her2_summary',
        'multigene_signature_method',
        'multigene_signature_result',
        'response_neoadjuv_therapy',
    ]],
    on='biopsy_id',
)
merged_df.shape

In [None]:
merged_df.head(2)

In [None]:
treatments_df.head(2)

In [None]:
merged_df.head(2)

In [None]:
# Order the rows by patient id and renumber them from 0.
merged_df = merged_df.sort_values('patient_ngsci_id').reset_index(drop=True)

In [None]:
merged_df.head(2)

In [None]:
# Attach the treatment codes and treatment dates
# (pandas' default inner join on biopsy_id).
merged_df = pd.merge(
    merged_df,
    treatments_df[[
        'biopsy_id',
        'cancer_registry_dx_dt',
        'most_definitive_surgical_procedure_cd',
        'most_definitive_radiation_modality_cd',
        'surgical_margin_cd',
        'radiation_summ_cd',
        'chemo_summ_cd',
        'immuno_therapy_cd',
        'hormone_summ_cd',
        'rx_dx_stg_proc_dt',
        'rx_mst_defn_srg_dt',
        'first_surgery_dt',
        'radiation_start_dt',
        'rx_chemo_dt',
        'rx_hormone_dt',
        'stg_dx_summ_cd',
    ]],
    on='biopsy_id',
)
merged_df.shape

In [None]:
# Attach BMI and tobacco use (pandas' default inner join on biopsy_id).
merged_df = pd.merge(
    merged_df,
    social_df[['biopsy_id', 'bmi', 'tobacco']],
    on='biopsy_id',
)
merged_df.shape

In [None]:
merged_df.head(2)

In [None]:
len(merged_df.loc[0].slide_id), len(merged_df.loc[0].icd10)

### extract age info: birth date -> age at case year (first diagnosis)

In [None]:
# Preview the birth dates converted to strings; the next cell treats the
# string 'nan' as the marker for a missing birth date.
birth_dates = merged_df.birth_dt.values.astype(str)
birth_dates

In [None]:
# --- birth year ---------------------------------------------------------
birth_dates = merged_df["birth_dt"].values.astype(str)
# Missing birth dates appear as the string 'nan'; give them the placeholder
# year 0100 so that they end up as birth year 0 after the subtraction below.
birth_dates[birth_dates == 'nan'] = '0100-00-00'
# Take the text before the first '-' as the year and subtract 100 from it.
# NOTE(review): the original comment calls this a "shift back"; presumably the
# dataset's dates carry a +100-year offset — confirm against the data docs.
birth_dates = np.array([date.partition('-')[0] for date in birth_dates]).astype(int) - 100

# --- age at case_year ---------------------------------------------------
age = merged_df["case_year"].values - birth_dates
# Rows with the placeholder birth year 0 get an implausibly large age; every
# age above 100 is therefore stored as 0, which marks "age unknown".
age_filt = age > 100
age = np.where(age_filt, 0, age)
merged_df['age'] = age
plt.hist(age, bins=100)
age.shape

In [None]:
# Write the merged per-biopsy table to the current working directory,
# omitting the index column.
# NOTE(review): list-valued columns (slide_id, icd9, icd10, dx_dt) are written
# as their string representation and need parsing when the file is read back.
merged_df.to_csv( 'merged_metadata_v2.1.csv', index=False )

In [None]:
merged_df.shape