In [1]:
#! python3
# Library of functions to load metadata from various places, and merge into unified patient, biosample tables.
# Usage: generate_biosample_table()
# Requires: 
# 
# Owen Chapman

import pandas as pd
import pathlib
import os

## Function to load metadata from the AmpliconClassifier results
## Get this file from /expanse/lustre/projects/csd677/collab/projects/pedpancan/AmpliconClassifier/batch/inputs
def get_pedpancan_biosamples_from_AC(include_x01=False,path='pedpancan_summary_map.txt'):
    path = pathlib.Path(path)
    df = pd.read_csv(path, sep='\t', header=None, index_col=0, names = ["biosample","file"])
    if not include_x01:
        df['firstletter']=df.index.map(lambda x: x[0])
        df = df[df.firstletter != 'P']
    return df.index
get_pedpancan_biosamples_from_AC()

Index(['BS_85Y70BHE', 'BS_A9S5HT6P', 'BS_1EQGGHHQ', 'BS_BR1X3R68',
       'BS_JDMM2XG4', 'BS_859AV1DB', 'BS_RCC7ZEC0', 'BS_9KDA3HG6',
       'BS_E0S2Y0TS', 'BS_1C4WCNQF',
       ...
       'SJOS001101_M1', 'SJOS001130_D1', 'SJOS010_D', 'SJOS012_D',
       'SJOS030876_D1', 'SJRHB012_D', 'SJST030043_D1', 'SJST030131_D1',
       'SJST030383_D1', 'SJST030890_D1'],
      dtype='object', name='biosample', length=1711)

In [2]:
## Functions to load metadata from the CAVATICA API. 
## See also 2023-11-27_cavatica-api/cavatica-api.ipynb
def import_x01_biosample_metadata(path="X01-biosample-metadata.tsv"):
    path = pathlib.Path(path)
    df = pd.read_csv(path, sep='\t',index_col=0)
    df["cohort"]="PBTA-X01"
    return df
def import_x00_biosample_metadata(path="X00-biosample-metadata.tsv"):
    df = import_x01_biosample_metadata(path)
    df["cohort"]="PBTA-X00"
    return df
def import_pnoc_biosample_metadata(path="PNOC-biosample-metadata.tsv"):
    df = import_x01_biosample_metadata(path)
    df["cohort"]="PNOC"
    return df
df = import_x01_biosample_metadata()

In [3]:
def clean_cavatica_biosample_metadata(df):
    '''
    Clean known errors in the x01 metadata, and unify ontologies.
    '''
    # remove suffix from pnoc sample ids
    df.sample_id = df.sample_id.map(lambda x: '-'.join(x.split('-')[:2]) if x.startswith("7316-") else x)
    
    df = df.replace({
        'Tumor Descriptor':{
            "initial CNS Tumor": "Diagnosis",
            "Not Applicable":None,
            "Unavailable":None,
            "Initial CNS Tumor": "Diagnosis",
            "Progressive Disease Post-Mortem":"Progressive",
        },
        'gender':{
            "Not Reported":None
        }
    })
    # Correct suspected errors
    df.loc["BS_6Z213H2V","Tumor Descriptor"] = "Progressive" # This tumor was resected 175 days after the initial tumor resection.
    df.loc["BS_1135HC0V","Tumor Descriptor"] = "Second Malignancy" # This dysplasia was diagnosed 574 days after the first tumor was resected, in a new location.
    df.loc["BS_ZS1QRMXS","Tumor Descriptor"] = "Progressive" # Tumor resected 128 days after previous resection.
    df.loc["BS_FVYBGMG1","Tumor Descriptor"] = "Progressive" # Tumor resected 107 days after previous resection.
    df.loc["BS_5J5VH3X0","Tumor Descriptor"] = "Progressive" # Biopsied 240 days after previous biopsy.
    df.loc["BS_E9M7TDB6","Tumor Descriptor"] = "Progressive" # Second resection in different location 112 days after previous resection.
    df.loc["BS_5XZP7F4Q","Tumor Descriptor"] = "Progressive" # Second resection 1845 days after previous
    df.loc["BS_EXTEGB51","Tumor Descriptor"] = "Progressive" # Third resection 1922 days after previous
    df.loc["BS_93BV8AY9","Tumor Descriptor"] = "Second Malignancy" # Second diagnosis 2975 days after initial.
    df.loc["BS_CRKBDAYZ","Tumor Descriptor"] = "Progressive" # Series of progressive diagnoses long after initial.
    df.loc["BS_B4DY7ET3","Tumor Descriptor"] = "Progressive" # Second resection 119 days after previous.
    return df 
df = clean_cavatica_biosample_metadata(df)

In [4]:
## Function to compile CAVATICA metatdata for all CBTN samples in our cohort.
def import_cbtn_biosample_metadata(include_X01=False):
    if include_X01:
        df = pd.concat([import_x00_biosample_metadata(),import_x01_biosample_metadata(),import_pnoc_biosample_metadata()])
    else:
        df = pd.concat([import_x00_biosample_metadata(),import_pnoc_biosample_metadata()])
    cohort = get_pedpancan_biosamples_from_AC()
    df = df[df.index.isin(cohort)]
    df = clean_cavatica_biosample_metadata(df)
    return df
import_cbtn_biosample_metadata()

Unnamed: 0_level_0,gender,race,ethnicity,Kids First Participant ID,disease_type,sample_id,Tumor Descriptor,primary_site,age_at_diagnosis,Kids First Biospecimen ID Normal,case_id,WGS_UUID,cohort
Kids First Biospecimen ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
BS_K07KNTFY,Male,White,Not Hispanic or Latino,PT_00G007DM,Other,7316-272,Diagnosis,Cerebellum/Posterior Fossa,464,,C27429,7b06bec0-c64a-4df4-8237-d07de0a5df42,PBTA-X00
BS_AQMKA8NC,Male,White,Not Hispanic or Latino,PT_00G007DM,Other,7316-2577,Recurrence,Cerebellum/Posterior Fossa,3178,,C27429,9ba06668-759e-440c-a22f-957df52dae06,PBTA-X00
BS_TE8QFF7T,Male,White,Not Hispanic or Latino,PT_01MZ62KG,Pineoblastoma,7316-447,Progressive,Cerebellum/Posterior Fossa,546,,C92004,bd68f356-781e-49d8-a5ff-ec6c12514dc1,PBTA-X00
BS_2VB8649B,Male,Black or African American,Not Available,PT_02J5CWN5,Low-grade glioma/astrocytoma (WHO grade I/II),7316-2989,Diagnosis,Suprasellar/Hypothalamic/Pituitary,2223,,C90159,3cdb9fca-94cf-4938-9d20-cd1b0c59f6a0,PBTA-X00
BS_5TT6TT4K,Male,Black or African American,Not Available,PT_02J5CWN5,Low-grade glioma/astrocytoma (WHO grade I/II),7316-898,Progressive,Suprasellar/Hypothalamic/Pituitary,3722,,C90159,2c620221-1561-4073-8558-f81e4b3c9dcc,PBTA-X00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
BS_VXDGXQKZ,Female,Reported Unknown,Reported Unknown,PT_VPEMAQBN,Anaplastic Astrocytoma,7316-3235,Diagnosis,Brain Stem,1709,,C3078444,7ad281cf-82c0-46b9-9a23-8348d623ba77,PNOC
BS_38CD519Z,Male,Asian,Hispanic or Latino,PT_W5GP3F6B,Diffuse midline glioma; H3K27M mutant; WHO gra...,7316-5003,Diagnosis,Brain Stem,2459,,C3092712,80fd23ab-de59-45da-80f6-49af162fe982,PNOC
BS_4DQAQFQH,Female,Reported Unknown,Hispanic or Latino,PT_WGVEF96B,Diffuse Astrocytoma,7316-4446,Diagnosis,Brain Stem,2853,,C3080043,3d9ec140-cf1f-48e0-b4bd-d9ae775b29b0,PNOC
BS_TQ0J7WJQ,Female,Reported Unknown,Hispanic or Latino,PT_WGVEF96B,Diffuse Astrocytoma,7316-3219,Diagnosis,Brain Stem,2853,,C3080043,c2e040bd-64b7-4928-8428-1da909cb2873,PNOC


In [5]:
## Functions to open & preprocess opentarget histology data.
## Get histologies.tsv from https://github.com/d3b-center/OpenPedCan-analysis/blob/dev/analyses/molecular-subtyping-integrate/results/histologies.tsv
df = pd.read_csv("pedpancan_histologies.tsv", sep='delimiter', header=None)
def clean_opentarget_histologies_files(df):
    cohort = import_cbtn_biosample_metadata()
    df = df[df.sample_id.isin(cohort.sample_id)]
    df = df[df.sample_type == 'Tumor'] # Drop normals
    df = df[df.composition != 'Derived Cell Line'] # Drop cell lines
    df = df[df.experimental_strategy != "Targeted Sequencing"] # these metadata are very different
    df = df.drop(["RNA_library","seq_center","pathology_free_text_diagnosis","gtex_group","gtex_subgroup","normal_fraction",
                  "cell_line_composition","cell_line_passage","tumor_fraction_RFpurify_ABSOLUTE",
                  "tumor_fraction_RFpurify_ESTIMATE","tumor_fraction_LUMP","dkfz_v12_methylation_mgmt_status",
                  "dkfz_v12_methylation_mgmt_estimated","integrated_diagnosis",
                  "tumor_fraction","tumor_ploidy","cohort"],axis=1) # drop columns we know we don't want
    df = df.replace({
        'composition':{
            "Not Available": None,
        },
        'extent_of_tumor_resection':{
            "Not Reported":None,
            'Unavailable':None,
            'Not Applicable':None
        },
    })
    
    # correct known errors
    df.loc["BS_K14VJ1E3","age_at_diagnosis_days"] = 2778
    df = df.drop(["BS_03G6PJKJ","BS_HJJPT3NR","BS_15R0SQRN","BS_GGDMSB26","BS_VQGR0D61"]) # lots of biosamples for the same tumor at the same timepoint
    
    # Propagate metadata from the same sample_id
    g = df.groupby('sample_id')
    df = []
    for name, group in g:
        columns = [col for col in group.columns if col not in ['sample_id','aliquot_id','experimental_strategy']]
        for column in columns:
            unique_values = group[column].dropna().unique()
            if len(unique_values) == 0:
                continue
            elif len(unique_values) == 1:
                non_na_value = unique_values[0]
                group[column].fillna(non_na_value, inplace=True)
            else:
                print(f"Warning: The column '{column}' for sample {name} differs between CAVATICA and opentarget annotations.")
        group=group.sort_values('experimental_strategy')
        df.append(group)
    df = pd.concat(df)
    
    # Add entries missing a KF biospecimen ID, but with a matching external biosample id.
    missing_bs = (cohort[~cohort.index.isin(df.index)]["sample_id"]).sort_values()
    print(f"{len(missing_bs)} KF biospecimens missing from the opentarget histologies table...")
    missing_bs = missing_bs[missing_bs.isin(df.sample_id)]
    print(f"found {len(missing_bs)} matching external sample IDs, adding to table...")
    newdf = []
    for biospecimen, sample in missing_bs.items():
        newentry=df[df.sample_id==sample].iloc[0]
        newentry.name = biospecimen
        newdf.append(newentry)
    newdf = pd.DataFrame(newdf)
    df = pd.concat([df,newdf])
    
    # Subset our cohort
    df = df[df.index.isin(cohort.index)]
    return df
clean_opentarget_histologies_files(df)

  df = pd.read_csv("pedpancan_histologies.tsv", sep='delimiter', header=None)


AttributeError: 'DataFrame' object has no attribute 'sample_id'

In [6]:
def import_opentarget_histologies_files(path='owen_histologies.tsv'):
    '''
    Get this file from /Users/ochapman/Library/CloudStorage/OneDrive-SanfordBurnhamPrebysMedicalDiscoveryInstitute/projects/2023-pedpancan/data/opentarget/histologies.tsv
    '''
    path = pathlib.Path(path)
    df = pd.read_csv(path,sep='\t',index_col=0)
    df = clean_opentarget_histologies_files(df)
    return df
import_opentarget_histologies_files()

  df = pd.read_csv(path,sep='\t',index_col=0)


53 KF biospecimens missing from the opentarget histologies table...
found 41 matching external sample IDs, adding to table...


Unnamed: 0,sample_id,aliquot_id,Kids_First_Participant_ID,experimental_strategy,sample_type,composition,tumor_descriptor,primary_site,reported_gender,race,...,dkfz_v11_methylation_subclass_score,dkfz_v12_methylation_subclass,dkfz_v12_methylation_subclass_score,molecular_subtype,Notes,harmonized_diagnosis,molecular_subtype_methyl,broad_histology,short_histology,cancer_group
BS_1RFBH1SP,7316-10,717017,PT_K8ZV7APT,WGS,Tumor,Solid Tissue,Initial CNS Tumor,Spinal Cord- Lumbar/Thecal Sac,Female,Black or African American,...,0.598,NFIB_PLEX,0.507953,,,Neurofibroma/Plexiform,,Tumor of cranial and paraspinal nerves,Neurofibroma,Neurofibroma/Plexiform
BS_SFZ3A07S,7316-100,67852,PT_6TZR2DH1,WGS,Tumor,Solid Tissue,Initial CNS Tumor,Suprasellar/Hypothalamic/Pituitary,Female,Black or African American,...,0.668,CPH_ADM,0.580744,"CRANIO, ADAM",Updated via OpenPedCan subtyping,Adamantinomatous craniopharyngioma,,Tumors of sellar region,Craniopharyngioma,Adamantinomatous Craniopharyngioma
BS_4RS1SC48,7316-101,588044,PT_CWD717Q0,WGS,Tumor,Solid Tissue,Initial CNS Tumor,Temporal Lobe,Male,Black or African American,...,0.479,PA_CORT,0.825647,"GNG, other MAPK",Updated via OpenPedCan subtyping,"Ganglioglioma, other MAPK","GNT, MAPK",Low-grade astrocytic tumor,Ganglioglioma,Ganglioglioma
BS_MRX1SPFB,7316-104,717047,PT_H024RK87,WGS,Tumor,Solid Tissue,Recurrence,Brain Stem- Midbrain/Tectum;Pineal Gland;Thala...,Male,Asian,...,0.230,GCT_TERA,0.972105,,,Non-germinomatous germ cell tumor;Teratoma,,Germ cell tumor,Germ cell tumor,
BS_A9S5HT6P,7316-1069,717042,PT_9XAA8PF6,WGS,Tumor,Solid Tissue,Initial CNS Tumor,Suprasellar/Hypothalamic/Pituitary,Female,White,...,,,,,,Adenoma,,Benign tumor,Adenoma,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BS_FJEZ3ASV,7316-913,1108019,PT_4347ZBEX,Methylation,Tumor,Solid Tissue,Second Malignancy,Parietal Lobe;Temporal Lobe,Female,White,...,0.999,pedHGG_RTK1C,0.982681,"HGG, H3 wildtype",Updated via OpenPedCan subtyping,"High-grade glioma, IDH-wildtype and H3-wildtype","HGG, H3 wildtype",Diffuse astrocytic and oligodendroglial tumor,HGAT,High-grade glioma
BS_5DHZ0T1A,7316-925,1112611,PT_3HRS5CWF,Methylation,Tumor,Solid Tissue,Initial CNS Tumor,Cerebellum/Posterior Fossa,Female,White,...,0.993,PA_INF,0.999797,"LGG, KIAA1549-BRAF",Updated via OpenPedCan subtyping,"Low-grade glioma, KIAA1549-BRAF","LGG, FGFR",Low-grade astrocytic tumor,LGAT,Low-grade glioma
BS_W37QBA12,7316-931,1112899,PT_XA98HG1C,Methylation,Tumor,Solid Tissue,Initial CNS Tumor,Cerebellum/Posterior Fossa,Male,White,...,0.906,MB_SHH_3,0.480522,"MB, SHH",Subtype based on prediction;Updated via OpenPe...,"Medulloblastoma, SHH-activated",,Embryonal tumor,Medulloblastoma,Medulloblastoma
BS_ZVM1B6BF,7316-95,608916,PT_9S6WMQ92,RNA-Seq,Tumor,Solid Tissue,Progressive,Cerebellum/Posterior Fossa,Female,White,...,,,,"MB, SHH",Subtype based on prediction;Updated via OpenPe...,"Medulloblastoma, SHH-activated",,Embryonal tumor,Medulloblastoma,Medulloblastoma


In [7]:
def import_pedcbioportal_metadata(path="openpbta-biosample-metadata.tsv"):
    path = pathlib.Path(path)
    df = pd.read_csv(path, sep='\t',index_col=0)
    return df
import_pedcbioportal_metadata()

Unnamed: 0_level_0,sampleId,CANCER_TYPE,CANCER_TYPE_DETAILED,CNS_REGION,EXPERIMENTAL_STRATEGY,EXTENT_OF_TUMOR_RESECTION,HISTOLOGY,MUTATION_COUNT,PATHOLOGY_FREE_TEXT_DIAGNOSIS,SAMPLE_TYPE,SPECIMEN_ID,TUMOR_FRACTION,TUMOR_PLOIDY,TUMOR_TISSUE_SITE,TUMOR_TYPE
patientId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
PT_00G007DM,7316-2577,Embryonal tumor,"CNS Embryonal tumor, NOS",Posterior fossa,WGS;RNA-Seq,Gross/Near total resection,Embryonal tumor with multilayer rosettes,2.0,ependymoblastoma,Solid Tissue,BS_AQMKA8NC;BS_FEPRNEXX,0.613054,4.0,Cerebellum/Posterior Fossa,recurrence
PT_00G007DM,7316-272,Embryonal tumor,"Embryonal tumor with multilayer rosettes, C19M...",Posterior fossa,WGS;RNA-Seq,Gross/Near total resection,Embryonal tumor with multilayer rosettes,4.0,embryonal tumor with abundant neuropil and tru...,Solid Tissue,BS_K07KNTFY;BS_QWNBZ9RJ,0.328185,4.0,Cerebellum/Posterior Fossa,primary
PT_01MZ62KG,7316-447,Embryonal tumor,"Embryonal tumor with multilayer rosettes, C19M...",Posterior fossa,RNA-Seq;WGS,Partial resection,Embryonal tumor with multilayer rosettes,11.0,pineoblastoma,Solid Tissue,BS_TE8QFF7T;BS_P39SQPTS,,,Cerebellum/Posterior Fossa,progression
PT_02J5CWN5,7316-2989,Low-grade astrocytic tumor,"Low-grade glioma/astrocytoma, FGFR",Suprasellar,WGS;RNA-Seq,Partial resection,Low-grade glioma astrocytoma,7.0,low-grade glioma,Solid Tissue,BS_2VB8649B;BS_M8WP5T16,1.000000,2.0,Suprasellar/Hypothalamic/Pituitary,primary
PT_02J5CWN5,7316-898,Low-grade astrocytic tumor,"Low-grade glioma/astrocytoma, wildtype",Suprasellar,RNA-Seq;WGS,Partial resection,Low-grade glioma astrocytoma,1.0,low-grade glioma,Solid Tissue,BS_5TT6TT4K;BS_1RXZ3BP7,,,Suprasellar/Hypothalamic/Pituitary,progression
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PT_ZZRBX5JT,7316-2661,Ependymal tumor,Ependymoma,Posterior fossa,RNA-Seq;WGS,Gross/Near total resection,Ependymoma,17.0,ependymoma,Solid Tissue,BS_JE1DF581;BS_HZ707C55,,,Cerebellum/Posterior Fossa,recurrence
PT_ZZRBX5JT,7316-3319,Ependymal tumor,Ependymoma,Spine,WGS;RNA-Seq,Gross/Near total resection,Ependymoma,5.0,ependymoma,Solid Tissue,BS_9N3B3HZB;BS_YE1MAQYJ,0.503286,2.0,Spinal Cord- Lumbar/Thecal Sac,recurrence
PT_ZZRBX5JT,7316-496,Ependymal tumor,Ependymoma,Ventricles,WGS;RNA-Seq,Gross/Near total resection,Ependymoma,2.0,ependymoma,Solid Tissue,BS_AGD2ATY1;BS_PJPGVD62,0.256150,2.0,Ventricles,primary
PT_ZZTQQVCT,7316-2723,Low-grade astrocytic tumor,"Low-grade glioma/astrocytoma, wildtype",Posterior fossa,WGS;RNA-Seq,Gross/Near total resection,Low-grade glioma astrocytoma,1.0,low-grade glioma,Solid Tissue,BS_78TQS4Y3;BS_Y7F9E2E9,1.000000,2.0,Cerebellum/Posterior Fossa,primary


In [8]:
def get_cbtn_cell_lines():
    df = import_pedcbioportal_metadata()
    df = df[df.SAMPLE_TYPE == "Derived Cell Line"]
    return df.SPECIMEN_ID.str.cat(sep=';').split(';')

#duplicated_cbtn_samples = ["BS_DRVEFVQ5","BS_169P1QCA","BS_QG6V29H7",
#                           "BS_6GV08HTE","BS_B4PPG3X5","BS_2X60Q1ET","BS_S791VC80","BS_STNH7YSX","BS_3Z40EZHD","BS_ZR75EKKX",
#                           "BS_KQPCYZ2K","BS_Z64NEPNE","BS_KQRAHH6Y","BS_KH3859M5","BS_HJ7HYZ7N","BS_AH3RVK53",
#                           "BS_5S8VXASX","BS_JEZBA2EW","BS_XNYQS1WG","BS_P99S85CY",
#                           "BS_M5FM63EB","BS_M0B42FPR","BS_TX2WGF8K","BS_RENPFFNK","BS_R6CKWZW6",
#                           "BS_ZSH09N84","BS_6TMPZKSZ","BS_B91XGSA5","BS_XQF18WZP","BS_0TCRV9AC",
#                           "BS_2J4FG4HV",
#                           "BS_EE73VE7V","BS_5968GBGT","BS_BQ81D2BP","BS_3VKW5988", # duplicate samples from PT_KTRJ8TFY autopsy
#                           "BS_AK9BV52G","BS_X5VN0FW0","BS_D6STCMQS","BS_22VCR7DF","BS_1Q524P3B" # duplicate samples from PT_KZ56XHJT autopsy
#                          ]
#nontumor_samples = ["BS_MCM78YPC","BS_886M7JMG","BS_TPX7YY57"] # Epilepsy, Arteriovenous malformation, and Reactive connective tissue respectively
get_cbtn_cell_lines()

['BS_CZRA594T',
 'BS_59ZJWJTF',
 'BS_ERAWW3H7',
 'BS_QYPHA40N',
 'BS_PKZ1HWNB',
 'BS_M8EA6R2A',
 'BS_FJEZ3ASV',
 'BS_8ZD6J47V',
 'BS_68TZMZH1',
 'BS_0RQ4P069',
 'BS_AFBPM6CN',
 'BS_PNYN0AYD',
 'BS_TX8C5VAJ',
 'BS_GXTFW99H',
 'BS_QWM9BPDY',
 'BS_BWBDH9GM',
 'BS_HM5GFJN8',
 'BS_2A162JH9',
 'BS_RXP2ZRQT',
 'BS_40MP5BWR',
 'BS_QZRP3NSG',
 'BS_DVDT4VXQ',
 'BS_6JBE0947',
 'BS_YZD4SSMA',
 'BS_M659G06J',
 'BS_5GNQC2FF',
 'BS_VXDGXQKZ',
 'BS_ERFMPQN3',
 'BS_PGK832G2',
 'BS_4DQAQFQH',
 'BS_XMP9XNR9',
 'BS_P9JP6JFA',
 'BS_JGKRN7NA',
 'BS_MX23ZY0Y',
 'BS_853PNV7P',
 'BS_DRY58DTF',
 'BS_TF5TTEXH',
 'BS_E60JZ9Z3']

In [9]:
def propagate(df,dest,source,rename=False):
    '''
    Replace NA values in dest with those in source, then drop source and rename dest
    '''
    df[dest].fillna(df[source], inplace=True)
    df.drop(source, inplace=True, axis=1)
    if rename:
        df = df.rename(columns={dest:rename})
    return df
def consensus(df,dest,source,rename=False):
    '''
    Check that values in dest and source agree, if not then set to NA. Drop source and rename dest.
    '''
    df.loc[df[dest] != df[source], dest] = None
    df.drop(source, inplace=True, axis=1)
    if rename:
        df = df.rename(columns={dest:rename})
    return df    

In [10]:

## Integrate all CBTN data available
def generate_cbtn_biosample_table(verbose=0):
    '''
    Generate a metadata table of cbtn biosamples.
    verbose:
        0: most useful metadata only. These are included in the Supplmentary Table.
        1: includes some extra columns. Useful for generating the patient metadata table.
        2: includes lots of extra columns
    '''
    df = pd.DataFrame(index=get_pedpancan_biosamples_from_AC())
    cavatica_data = import_cbtn_biosample_metadata()
    df = pd.merge(left=df,how='inner',right=cavatica_data,left_index=True,right_index=True)
    opentarget_data = import_opentarget_histologies_files()
    df = pd.merge(left=df,how='left',right=opentarget_data,left_index=True,right_index=True,suffixes=(None,"_y"))

    # For CAVATICA annotations which are missing, propagate those from opentarget.
    df = propagate(df,"primary_site","primary_site_y")
    df = propagate(df,"age_at_diagnosis","age_at_diagnosis_days")
    df = propagate(df,"Tumor Descriptor","tumor_descriptor")
    df = consensus(df,"gender","reported_gender","sex")
    
    # Rename columns
    df.index.name = "biosample_id"
    df = df.rename(columns={
        'Kids First Participant ID':'patient_id',
        'Tumor Descriptor':'tumor_history',
        'case_id':'external_patient_id',
        'sample_id':'external_sample_id',
    })

    # drop columns
    if verbose < 2:
        df = df.drop(["race","ethnicity","external_patient_id","WGS_UUID","Kids First Biospecimen ID Normal",
                      "sample_id_y","composition","Kids_First_Participant_ID","experimental_strategy","sample_type",
                      "germline_sex_estimate","race_y","ethnicity_y","molecular_subtype_methyl","cohort_participant_id","Notes"
                     ],axis=1)  
    if verbose < 1:
        df = df.drop(["primary_site","pathology_diagnosis","OS_days","OS_status","EFS_days","age_last_update_days","aliquot_id",
                      "cancer_predispositions","CNS_region","age_at_chemo_start","age_at_radiation_start","cancer_group",
                      "age_at_event_days","clinical_status_at_event"
                     ],axis=1)
    # Drop nontumor samples
    #df = df.drop(nontumor_samples)
    
    # Drop cell lines
    #df = df[~df.index.isin(get_cbtn_cell_lines())]
    
    # Mark duplicates
    #df['in_deduplicated_sample_cohort'] = True
    #df.loc[duplicated_cbtn_samples,'in_deduplicated_sample_cohort'] = False
    return df

## SJ data
# duplicated_sj_samples are biosamples where we suspect that the same tumor was sequenced twice at the same timepoint. 
# These are flagged and removed arbitrarily.
#duplicated_sj_samples = ['SJST030043_D1','SJST030131_D1','SJMEL001003_D2','SJOS001115_D1','SJMB009_E',
#                         'SJDSRCT030041_D3','SJBT030081_D2','SJWLM030180_D2','SJHGG017_D','SJEWS030228_D2',
#                         'SJST030383_D1','SJLGG017_D','SJLGG030611_D2','SJOS030876_D2','SJOS001101_M1','SJHM030702_D2']
# what to do with SJOS001101_M1?
CBTN_df = generate_cbtn_biosample_table()
CBTN_df.head()

  df = pd.read_csv(path,sep='\t',index_col=0)


53 KF biospecimens missing from the opentarget histologies table...
found 41 matching external sample IDs, adding to table...


Unnamed: 0_level_0,sex,patient_id,disease_type,external_sample_id,tumor_history,age_at_diagnosis,cohort,extent_of_tumor_resection,dkfz_v11_methylation_subclass,dkfz_v11_methylation_subclass_score,dkfz_v12_methylation_subclass,dkfz_v12_methylation_subclass_score,molecular_subtype,harmonized_diagnosis,broad_histology,short_histology
biosample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
BS_85Y70BHE,Female,PT_124TTG01,Adenoma,7316-1960,Diagnosis,5916,PBTA-X00,Gross/Near total resection,"PITAD, STH DNS A",0.857,PITAD_STH_DENSE1,0.967892,,Adenoma,Benign tumor,Adenoma
BS_A9S5HT6P,Female,PT_9XAA8PF6,Adenoma,7316-1069,Diagnosis,4964,PBTA-X00,Gross/Near total resection,,,,,,Adenoma,Benign tumor,Adenoma
BS_1EQGGHHQ,Female,PT_XP8AM1CH,Adenoma,7316-1096,Diagnosis,5713,PBTA-X00,Gross/Near total resection,"PITAD, ACTH",1.0,PITAD_ACTH,0.999998,,Adenoma,Benign tumor,Adenoma
BS_BR1X3R68,Female,PT_ZN1F7RMP,Adenoma,7316-1080,Diagnosis,7032,PBTA-X00,Partial resection,"PITAD, TSH",0.981,PITAD_TSH,0.996686,,Adenoma,Benign tumor,Adenoma
BS_JDMM2XG4,Male,PT_32J909WM,Atypical Teratoid Rhabdoid Tumor (ATRT),7316-2688,Diagnosis,860,PBTA-X00,Gross/Near total resection,"ATRT, TYR",0.594,ATRT_SHH,0.105467,"ATRT, To be classified",Atypical Teratoid Rhabdoid Tumor (ATRT),Embryonal tumor,ATRT


In [11]:
def import_sj_sample_info(path="SAMPLE_INFO_2022-03-02.tsv"):
    path = pathlib.Path(path)
    df = pd.read_csv(path,sep='\t',index_col="sample_name")
    return df
def clean_sj_biosample_metadata(df):
    '''
    Clean known errors in the sj metadata, and unify ontologies, units etc.
    '''
    df = df.replace({
        'attr_age_at_diagnosis':{
            "Not Available": None
        },
        'attr_sex':{
            "Not Available":None
        }
    })
    
    # Convert age from years to days
    df['attr_age_at_diagnosis'] = (pd.to_numeric(df['attr_age_at_diagnosis'],errors='coerce')*365.25).round()
    
    return df
def generate_sj_biosample_table(verbose=0):
    '''
    Notes:
    sj_diseases != attr_oncotree_disease_code = sj_associated_diagnoses_disease_code
    attr_diagnosis != sj_long_disease_name != sj_associated_diagnoses
    '''
    df = pd.DataFrame(index=get_pedpancan_biosamples_from_AC())
    columns = ['subject_name','sample_type','attr_age_at_diagnosis','attr_sex','sj_long_disease_name','sj_diseases','attr_oncotree_disease_code','sj_dataset_accessions']
    add = import_sj_sample_info()
    add = clean_sj_biosample_metadata(add)
    add = add[(add.sequencing_type == 'WGS') & (add.file_type == 'BAM') & add.file_path.str.endswith('.bam')]
#    add = add.sort_values(columns)
#    add = add.loc[~add.index.duplicated()]
    df = pd.merge(left=df,how='inner',right=add, left_index=True, right_index=True)
    
    # Rename columns
    df.index.name = "biosample_id"
    df = df.rename(columns={
        'subject_name':'patient_id',
        'sample_type':'tumor_history',
        'attr_sex':'sex',
        'sj_dataset_accessions':'cohort',
        'attr_age_at_diagnosis':'age_at_diagnosis',
    })
    # drop columns
    if verbose < 2:
        df = df.drop(["file_path","file_id","sequencing_type","file_type","description","sj_embargo_date","attr_ethnicity","attr_race",
                      "sj_genome_build","sj_pipeline_name","sj_pipeline_version","attr_library_selection_protocol","attr_read_length",
                      "attr_sequencing_platform","attr_read_type","attr_tissue_preservative","attr_inferred_strandedness",
                      "attr_lab_strandedness","attr_germline_sample"
        ],axis=1)
    if verbose < 1:
        df = df.drop(["sj_pmid_accessions","sj_publication_titles","sj_pub_accessions","sj_datasets","sj_ega_accessions","attr_diagnosis",
                      "attr_diagnosis_group","attr_oncotree_disease_code","attr_subtype_biomarkers","sj_associated_diagnoses",
                      "sj_associated_diagnoses_disease_code"
        ],axis=1)
    
    # Mark duplicates
    #df['in_deduplicated_sample_cohort'] = True
    #df.loc[duplicated_sj_samples,'in_deduplicated_sample_cohort'] = False
    return df

SJ_df = import_sj_sample_info()
SJ_df = clean_sj_biosample_metadata(SJ_df)
SJ_df = generate_sj_biosample_table()
SJ_df.head()

Unnamed: 0_level_0,patient_id,tumor_history,sj_long_disease_name,cohort,sj_diseases,age_at_diagnosis,sex
biosample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SJACT001_D,SJ006401,Diagnosis,Adrenocortical Carcinoma,SJC-DS-1001,ACC,1088.0,Female
SJACT002_D,SJ006402,Diagnosis,Adrenocortical Carcinoma,SJC-DS-1001,ACC,961.0,Female
SJACT003_D,SJ006403,Diagnosis,Adrenocortical Carcinoma,SJC-DS-1001,ACC,475.0,Female
SJACT004_D,SJ006404,Diagnosis,Adrenocortical Carcinoma,SJC-DS-1001,ACC,1819.0,Female
SJACT005_D,SJ006405,Diagnosis,Adrenocortical Carcinoma,SJC-DS-1001,ACC,5621.0,Female


In [47]:
# Function to create the cancer_subtype column based on priority

def get_subtype(row):
    priority_columns = ['molecular_subtype','dkfz_v12_methylation_subclass',
                    'dkfz_v11_methylation_subclass', 'harmonized_diagnosis', 'disease_type', "sj_diseases"]  # Add other columns as needed
    for col in priority_columns:
        if col in ['dkfz_v12_methylation_subclass', 'dkfz_v11_methylation_subclass'] and pd.notnull(row[col]) and row[f"{col}_score"] > 0.9:
            return row[col]
        elif col not in ['dkfz_v12_methylation_subclass', 'dkfz_v11_methylation_subclass'] and pd.notnull(row[col]):
            return row[col]
    return None
def unify_tumor_diagnoses(df):
    # Apply the function to create the cancer_subtype column
    mapping = pd.read_excel("pedpancan_mapping.xlsx", sheet_name='filtered_mapping')
    
    # Create a dictionary from the mapping
    mapping_dict = dict(zip(mapping['Classification'], mapping['Owen']))
    
    # Create 'cancer_type_pre' column using the get_subtype function
    df['cancer_subtype'] = df.apply(get_subtype, axis=1)
    
    # Map 'cancer_type_pre' to 'owen' using mapping_dict
    df['cancer_type'] = df['cancer_subtype'].map(mapping_dict)
    
    # # Create 'cancer_type' column as a copy of 'cancer_type_pre'
    # df['cancer_type'] = df['cancer_type_pre'].copy()
    
    # # if owen column is na for a specific sample it gets assigned to get_subtype() column//
    # df['cancer_type'] = df['cancer_type_post'].combine_first(df['cancer_type'])

    df = df.drop(["disease_type","dkfz_v11_methylation_subclass","dkfz_v11_methylation_subclass_score",
                  "dkfz_v12_methylation_subclass","dkfz_v12_methylation_subclass_score","molecular_subtype","harmonized_diagnosis",
                  "broad_histology","short_histology", "sj_long_disease_name", "sj_diseases"
                 ],axis=1)
    return df
df = pd.concat([CBTN_df, SJ_df])
df = unify_tumor_diagnoses(df)
df.head()



Unnamed: 0_level_0,sex,patient_id,external_sample_id,tumor_history,age_at_diagnosis,cohort,extent_of_tumor_resection,cancer_subtype,cancer_type
biosample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BS_85Y70BHE,Female,PT_124TTG01,7316-1960,Diagnosis,5916.0,PBTA-X00,Gross/Near total resection,PITAD_STH_DENSE1,BENG
BS_A9S5HT6P,Female,PT_9XAA8PF6,7316-1069,Diagnosis,4964.0,PBTA-X00,Gross/Near total resection,Adenoma,BENG
BS_1EQGGHHQ,Female,PT_XP8AM1CH,7316-1096,Diagnosis,5713.0,PBTA-X00,Gross/Near total resection,PITAD_ACTH,BENG
BS_BR1X3R68,Female,PT_ZN1F7RMP,7316-1080,Diagnosis,7032.0,PBTA-X00,Partial resection,PITAD_TSH,BENG
BS_JDMM2XG4,Male,PT_32J909WM,7316-2688,Diagnosis,860.0,PBTA-X00,Gross/Near total resection,"ATRT, To be classified",ATRT


In [48]:
## Annotate with ecDNA status
def annotate_with_ecDNA(df,path="Supplementary_Tables.xlsx"):
    '''
    Annotate biosamples with ecDNA status.
    Inputs:
        df: pd.DataFrame. Must be indexed by biosample.
        path: path to AmpliconClassifier results.
    '''
    # load AC results
    if path.endswith("Supplementary_Tables.xlsx"):
        ac = pd.read_excel(path,sheet_name="3. Amplicons")
    else:
        ac = pd.read_excel(path,index_col=0)
    
    # Aggregate by biosample
    ac_agg = ac.groupby("sample_name").sum().ecDNA_amplicons
    df = df.join(ac_agg)
    df = df.rename(columns={"ecDNA_amplicons":"ecDNA_sequences_detected"})
    df["ecDNA_sequences_detected"].fillna(0,inplace=True)
    return df

def generate_biosample_table():
    df = pd.concat([SJ_df, CBTN_df])
    df = unify_tumor_diagnoses(df)
    df = annotate_with_ecDNA(df)
    return df


In [49]:
df = generate_biosample_table()
df.head()
# df.to_csv("test_biosample_table.csv")

Unnamed: 0_level_0,patient_id,tumor_history,cohort,age_at_diagnosis,sex,external_sample_id,extent_of_tumor_resection,cancer_subtype,cancer_type,ecDNA_sequences_detected
biosample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BS_007JTNB8,PT_1MW98VR1,Diagnosis,PBTA-X00,1872.0,Male,7316-2558,Gross/Near total resection,"EPN, PF A",EP,0.0
BS_00TRPEQX,PT_MDWPRDBT,Recurrence,PBTA-X00,2146.0,Female,7316-2660,Gross/Near total resection,"HGG, H3 wildtype",HGG,1.0
BS_01DQH017,PT_CYVVA9AB,Diagnosis,PBTA-X00,4738.0,Female,7316-3299,Gross/Near total resection,"EPN, ST ZFTA",EP,0.0
BS_01Y5F4PN,PT_QK4XB83F,Diagnosis,PBTA-X00,1546.0,Female,7316-663,Gross/Near total resection,"SEGA, To be classified",SEGA,0.0
BS_02W5H7K5,PT_J06X17XH,Diagnosis,PBTA-X00,5146.0,Male,7316-1649,Gross/Near total resection,Rosai-dorfman,BENG,0.0


In [51]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print(len(df))
print(len(df[df.ecDNA_sequences_detected > 0.0]))

def manually_fill_nas(df):
    '''
    Function to fill samples with NAs in cancer_type (those without Classification) manually based on Owen
    '''
    samples_to_assign = ["BS_AH3RVK53", "BS_JEZBA2EW", "BS_KQPCYZ2K"]

    # Assign values manually based on biosample_id
    df.loc[df.index.isin(samples_to_assign), 'cancer_type'] = ['BTNOS', 'HGG', 'BTNOS']
    return df
df = manually_fill_nas(df)
# df.to_csv("biosample_table.csv")
nas = df[df.cancer_type.isna()]
print(len(nas))
nas

1717
178
0


Unnamed: 0_level_0,patient_id,tumor_history,cohort,age_at_diagnosis,sex,external_sample_id,extent_of_tumor_resection,cancer_subtype,cancer_type,ecDNA_sequences_detected
biosample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [None]:
## Imports for Sunita's data
def import_sunita_classifications(path='../data/combinedamplicons.xlsx'):
    df = pd.read_excel(path)
    df = df[["sample_ID","cancer_type"]]
    df = df.drop_duplicates()
    df = df.set_index("sample_ID")
    return df