In [505]:
import pandas as pd
import pathlib
import os

pd.set_option('display.max_columns', None)

In [513]:
def import_openpbta_biosample_metadata(path="../2023-11-27_cavatica-api/out/openpbta-biosample-metadata.tsv"):
    '''
    NB use HISTOLOGY for diagnosis
    Index(['patientId', 'CANCER_TYPE', 'CANCER_TYPE_DETAILED', 'CNS_REGION',
       'EXPERIMENTAL_STRATEGY', 'EXTENT_OF_TUMOR_RESECTION', 'HISTOLOGY',
       'MUTATION_COUNT', 'PATHOLOGY_FREE_TEXT_DIAGNOSIS', 'SAMPLE_TYPE',
       'SPECIMEN_ID', 'TUMOR_FRACTION', 'TUMOR_PLOIDY', 'TUMOR_TISSUE_SITE',
       'TUMOR_TYPE']
    '''
    path = pathlib.Path(path)
    df = pd.read_csv(path, sep='\t',index_col=1)
    return df

def clean_opentarget_histologies_biosamples(df):
    '''
    Add missing BS_ biospecimen IDs based on matching sample IDs
    '''
    x00 = pd.concat([import_x00_biosample_metadata(),import_pnoc_biosample_metadata()])
    missing_bs = (x00[~x00.index.isin(df.index)]["sample_id"]).sort_values()
    print(f"{len(missing_bs)} KF biospecimens missing from the opentarget histologies table...")
    missing_bs = missing_bs[missing_bs.isin(df.sample_id)]
    print(f"found {len(missing_bs)} matching external sample IDs, adding to table...")
    newdf = []
    for biospecimen, sample in missing_bs.items():
        newentry=df[df.sample_id==sample].iloc[0]
        newentry.name = biospecimen
        newdf.append(newentry)
    newdf = pd.DataFrame(newdf)
    df = pd.concat([df,newdf])
    return df
    
def import_opentarget_histologies_biosamples(path='../data/local/opentarget/histologies-wgs-cohort.xlsx'):
    path = pathlib.Path(path)
    df = pd.read_excel(path,index_col='sample_id')
    df["Kids_First_Biospecimen_ID"]=df["Kids_First_Biospecimen_ID"].map(lambda x:x.split(';'))
    df = df.explode("Kids_First_Biospecimen_ID")
    df['sample_id'] = df.index
    df = df.set_index("Kids_First_Biospecimen_ID")
    df = clean_opentarget_histologies_biosamples(df)
    return df
def import_opentarget_histologies_files(path='/Users/ochapman/Library/CloudStorage/OneDrive-SanfordBurnhamPrebysMedicalDiscoveryInstitute/projects/2023-pedpancan/data/metadata/histologies.tsv'):
    path = pathlib.Path(path)
    df = pd.read_csv(path,sep='\t')
    return df

def import_sunita_cbtn_master_table(sheet='1a Patients',path='/Users/ochapman/Library/CloudStorage/OneDrive-SanfordBurnhamPrebysMedicalDiscoveryInstitute/projects/2023-pedpancan/data/latest/PedPanCancer_CBTN_MasterAnalysis_Copy.xlsx'):
    """
    Patient_ID	Source	Database	ecDNA	amplicon_type	CancerAbbr	Cancer_Type_Abbrev	Age	Cancer_Type	Cancer_Type_Detailed	CNS_region	Ethnicity	Fraction Genome Altered	Mutation Count	Oncotree_Code	OS_Months	OS_Months_5yrCensor	OS_Status	OS_Status_5yrCensor	PFS_Months	Race	Number of Samples Per Patient	Sample_Type	Sex	Tumor_Tissue_Site	Tumor_Type	Notes	EGFR_SV	ec_EGFR	EGFR_Status	EGFR_SV_Status	EGFR_RNA_Expression	ATRX_RNA_Expression_Zscore	BRAF_RNA_Expression_Zscore	GACAT3_RNA_Expression_Zscore	EGFR_RNA_Expression_Zscore	H3F3A_RNA_Expression_Zscore	KIAA1549_RNA_Expression_Zscore	MYC_RNA_Expression_Zscore	MYCN_RNA_Expression_Zscore	PLAG1_RNA_Expression_Zscore	PVT1_RNA_Expression_Zscore	TARSL2_RNA_Expression_Zscore	TP53_RNA_Expression_Zscore	WFS1_RNA_Expression_Zscore
    """
    path = pathlib.Path(path)
    index = None
    if sheet == '1a Patients':
        index = "Patient_ID"
    elif sheet == '1b Samples':
        index = "Specimen_ID"
    df = pd.read_excel(path,sheet_name=sheet,index_col=index)
    return df
def import_sunita_sj_master_table(path="/Users/ochapman/Library/CloudStorage/OneDrive-SanfordBurnhamPrebysMedicalDiscoveryInstitute/projects/2023-pedpancan/data/latest/SJ_SurvivalMaster.xlsx"):
    path=pathlib.Path(path)
    df = pd.read_excel(path,index_col=0)
    return df
def clean_sunita_survival_table(df):
    df['Overall Survival in Months'] = pd.to_numeric(df['Overall Survival in Months'].astype(str).str.replace('m', '', regex=False), errors='coerce')
    return df
def import_sunita_survival_table(path="../data/combinedsurv.xlsx"):
    path = pathlib.Path(path)
    df = pd.read_excel(path,index_col=0)
    df = clean_sunita_survival_table(df)
    return df
def import_sunita_amplicons_table(path="../data/combinedamplicons.xlsx"):
    path = pathlib.Path(path)
    df = pd.read_excel(path)
    return df
def import_sj_sample_info(path="/Users/ochapman/projects/pedpancan_ecdna/2022-02-23_sj_samples/SAMPLE_INFO_2022-03-02.tsv"):
    path = pathlib.Path(path)
    df = pd.read_csv(path,sep='\t',index_col="sample_name")
    return df
def get_pedpancan_biosamples_from_AC(include_x01=False,path='../data/local/AmpliconClassifier/pedpancan_summary_map.txt'):
    path = pathlib.Path(path)
    df = pd.read_csv(path, sep='\t', header=None, index_col=0, names = ["biosample","file"])
    if not include_x01:
        df['firstletter']=df.index.map(lambda x: x[0])
        df = df[df.firstletter != 'P']
    return df.index

In [514]:
#df = import_x01_biosample_metadata()
#df = import_openpbta_biosample_metadata()
#df = get_pedpancan_biosamples_from_AC()
#df = import_sunita_patient_table()
#df = import_sunita_survival_table()
#df = import_sunita_amplicons_table()
#df = import_sunita_sj_master_table()
#df = import_sj_sample_info("/Users/ochapman/projects/pedpancan_ecdna/2022-02-23_sj_samples/SAMPLE_INFO_batch_2022-03-03.txt")
#(df.loc["SJAML001417_D1","Date of Primary Dx"]-df.loc["SJAML001417_D1","Date of Death"]).days
#df = get_pedpancan_biosamples_from_AC()
#CBTN_samples = set(df[df.map(lambda x: x.startswith('B'))])
df = import_pnoc_biosample_metadata()
df

Unnamed: 0_level_0,gender,race,ethnicity,Kids First Participant ID,disease_type,sample_id,Tumor Descriptor,primary_site,age_at_diagnosis,case_id,WGS_UUID
Kids First Biospecimen ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BS_8SYN7GXG,Male,White,Not Hispanic or Latino,PT_0MXPTTM3,Anaplastic Astrocytoma,7316-3220,Diagnosis,Brain Stem,9425,C3080535,3526c1ab-a793-4c5f-9576-d922835dbd78
BS_DRVEFVQ5,Male,Reported Unknown,Reported Unknown,PT_1AAYYGGY,Anaplastic Astrocytoma,7316-4996,Diagnosis,Brain Stem,4608,C3079920,d182b330-17aa-47de-9c3a-5fee37a4ee33
BS_Y96RP1HJ,Male,Reported Unknown,Reported Unknown,PT_1AAYYGGY,Anaplastic Astrocytoma,7316-4996,Diagnosis,Brain Stem,4608,C3079920,e3629477-53cc-40c6-a4f6-72ecc70a4451
BS_7GKF6M85,Female,Asian,Not Hispanic or Latino,PT_1E3E6GMF,Diffuse Astrocytoma,7316-3224,Diagnosis,Brain Stem,2211,C3081150,2865b67b-1458-492e-a39e-57900f34da9f
BS_169P1QCA,Male,White,Not Hispanic or Latino,PT_1YQH5NSH,Diffuse midline glioma; H3K27M mutant; WHO gra...,7316-5922,Diagnosis,Brain Stem,2520,C3093819,64b37304-0567-4f2c-9790-b3cce218955c
...,...,...,...,...,...,...,...,...,...,...,...
BS_VXDGXQKZ,Female,Reported Unknown,Reported Unknown,PT_VPEMAQBN,Anaplastic Astrocytoma,7316-3235,Diagnosis,Brain Stem,1709,C3078444,7ad281cf-82c0-46b9-9a23-8348d623ba77
BS_38CD519Z,Male,Asian,Hispanic or Latino,PT_W5GP3F6B,Diffuse midline glioma; H3K27M mutant; WHO gra...,7316-5003,Diagnosis,Brain Stem,2459,C3092712,80fd23ab-de59-45da-80f6-49af162fe982
BS_4DQAQFQH,Female,Reported Unknown,Hispanic or Latino,PT_WGVEF96B,Diffuse Astrocytoma,7316-4446,Diagnosis,Brain Stem,2853,C3080043,3d9ec140-cf1f-48e0-b4bd-d9ae775b29b0
BS_TQ0J7WJQ,Female,Reported Unknown,Hispanic or Latino,PT_WGVEF96B,Diffuse Astrocytoma,7316-3219,Diagnosis,Brain Stem,2853,C3080043,c2e040bd-64b7-4928-8428-1da909cb2873


In [452]:
# Which data are useful?
# 993 total CBTN biosamples
biosamples = set(get_pedpancan_biosamples_from_AC())
print(len(biosamples))
# x00: 921 biosamples in dataset
x00 = set(import_x00_biosample_metadata().index)
print(len(biosamples & x00))
# pnoc: 72 biosamples in dataset
x00 = set(import_pnoc_biosample_metadata().index)
print(len(biosamples & pnoc))
# opentarget: 980 biosamples in dataset
ot = set(import_opentarget_histologies_biosamples().index)
print(len(biosamples & ot))

# sj_master: 717 biosamples in dataset
# x00, pnoc mutually exclusive
# SAMPLE_INFO_batch_2022-03-03

sjinfo="/Users/ochapman/projects/pedpancan_ecdna/2022-02-23_sj_samples/SAMPLE_INFO_batch_2022-03-03.txt" # 0 overlap
sjinfo="/Users/ochapman/projects/pedpancan_ecdna/2022-02-23_sj_samples/SAMPLE_INFO_2022-03-02.tsv" # 717 overlap
sjinfo="/Users/ochapman/projects/pedpancan_ecdna/2022-02-23_sj_samples/SAMPLE_INFO PedPanCancer ecDNA.txt" # 717 overlap but the metadata is less complete
sjinfo="/Users/ochapman/projects/pedpancan_ecdna/2022-02-23_sj_samples/SAMPLE_INFO PedPanCan_clinical.txt" # 256 overlap
sj = set(import_sj_sample_info(sjinfo).index)
print(len(ot & sj))

print((x00 - ot))

1710
921
72
37 KF biospecimens missing from the opentarget histologies table...
found 24 matching external sample IDs, adding to table...
980
0
{'BS_4DQAQFQH', 'BS_M0B42FPR', 'BS_QZRP3NSG', 'BS_M5FM63EB', 'BS_JEZBA2EW', 'BS_6JBE0947', 'BS_KQPCYZ2K', 'BS_VXDGXQKZ', 'BS_AH3RVK53'}


In [515]:
def _unify_diagnoses(df):
    pass

def generate_cbtn_biosample_table():
    '''
    '''
    df = pd.DataFrame(index=get_pedpancan_biosamples_from_AC())
    cavatica_data = pd.concat([import_x00_biosample_metadata(),import_pnoc_biosample_metadata()])
    cavatica_data = cavatica_data[['gender','Kids First Participant ID','disease_type','sample_id','Tumor Descriptor','primary_site','age_at_diagnosis']]
    df = pd.merge(left=df,how='inner',right=cavatica_data,left_index=True,right_index=True)
    opentarget_data = import_opentarget_histologies_biosamples()
    #
    opentarget_data = opentarget_data.drop(["sample_id","composition"],axis=1)
    #opentarget_data = opentarget_data[[]]
    # don't include race, ethnicity
    df = pd.merge(left=df,how='left',right=opentarget_data,left_index=True,right_index=True)
    
    df
    return df

df = generate_cbtn_biosample_table()
df[df.disease_type != df.pathology_diagnosis]

37 KF biospecimens missing from the opentarget histologies table...
found 24 matching external sample IDs, adding to table...


Unnamed: 0,gender,Kids First Participant ID,disease_type,sample_id,Tumor Descriptor,primary_site_x,age_at_diagnosis,aliquot_id,Kids_First_Participant_ID,sample_type,tumor_descriptor,primary_site_y,reported_gender,race,ethnicity,age_at_diagnosis_days,pathology_diagnosis,RNA_library,OS_days,OS_status,EFS_days,cohort,age_last_update_days,seq_center,cancer_predispositions,pathology_free_text_diagnosis,cohort_participant_id,extent_of_tumor_resection,CNS_region,gtex_group,gtex_subgroup,germline_sex_estimate,normal_fraction,tumor_fraction,tumor_ploidy,age_at_event_days,clinical_status_at_event,age_at_chemo_start,age_at_radiation_start,cell_line_composition,cell_line_passage,tumor_fraction_RFpurify_ABSOLUTE,tumor_fraction_RFpurify_ESTIMATE,tumor_fraction_LUMP,dkfz_v11_methylation_subclass,dkfz_v11_methylation_subclass_score,dkfz_v12_methylation_subclass,dkfz_v12_methylation_subclass_score,dkfz_v12_methylation_mgmt_status,dkfz_v12_methylation_mgmt_estimated,molecular_subtype,integrated_diagnosis,Notes,harmonized_diagnosis,molecular_subtype_methyl,broad_histology,short_histology,cancer_group
BS_EBCJ4Y49,Male,PT_W6CW60K9,Craniopharyngioma,7316-3766,Recurrence,Suprasellar/Hypothalamic/Pituitary,3934,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BS_C3KDA5ZC,Male,PT_YQWG98Z5,Dysplasia/Gliosis;Glial-neuronal tumor NOS,7316-2856,Initial CNS Tumor,Frontal Lobe,6251,1115259,PT_YQWG98Z5,Tumor,Initial CNS Tumor,Frontal Lobe,Male,Black or African American,Not Hispanic or Latino,6251.0,Dysplasia/Gliosis,,411.0,LIVING,411.0,PBTA,6662.0,,None documented,neuroglial tissue with gliosis,C789045,Gross/Near total resection,Hemispheric,,,Male,,,,6251.0,Alive,,,,,0.556153,0.889237,0.836663,"CONTR, WM",0.483,CTRL_CORPCAL,0.999737,unmethylated,0.153483,,,,Dysplasia/Gliosis,,Pre-cancerous lesion,Dysplasia,
BS_HNE6BPMH,Male,PT_V5ASTHRH,Germinoma;Teratoma,7316-3892,Progressive,Pineal Gland,6196,1115244,PT_V5ASTHRH,Tumor,Progressive,Pineal Gland,Male,Reported Unknown,Not Hispanic or Latino,6116.0,Mixed germ cell tumor,,1660.0,LIVING,80.0,PBTA,7776.0,,None documented,Germinoma;Teratoma,C1028157,Gross/Near total resection,Midline,,,Male,,,,6196.0,Alive,6215.0,,,,0.291384,0.522747,0.118345,"CONTR, INFLAM",0.187,GCT_GERM_A,0.830271,methylated,0.449330,,,,Mixed germ cell tumor,,Germ cell tumor,Germ cell tumor,Mixed germ cell tumor
BS_F8K4VQMF,Male,PT_J5NSRW1Y,Gliomatosis Cerebri,7316-3817,Initial CNS Tumor,Frontal Lobe,4599,1114570,PT_J5NSRW1Y,Tumor,Initial CNS Tumor,Frontal Lobe,Male,Black or African American,Not Hispanic or Latino,4599.0,High-grade glioma/astrocytoma (WHO grade III/IV),,492.0,DECEASED,373.0,PBTA,5091.0,,None documented,"glioblastoma, idh-1 negative, who grade iv",C992733,Partial resection,Hemispheric,,,Male,,,,4599.0,Alive,5021.0,4625.0,,,0.664495,0.920429,0.806252,"GBM, RTK III",1.000,pedHGG_RTK2A,0.999860,unmethylated,0.000808,"HGG, H3 wildtype","High-grade glioma, IDH-wildtype and H3-wildtype",Updated via OpenPedCan subtyping,"High-grade glioma, IDH-wildtype and H3-wildtype","HGG, H3 wildtype",Diffuse astrocytic and oligodendroglial tumor,HGAT,High-grade glioma
BS_1YTHM07J,Female,PT_279D9QZ8,High-grade glioma/astrocytoma (WHO grade III/IV),7316-325,Recurrence,Cerebellum/Posterior Fossa,7464,577729,PT_279D9QZ8,Tumor,Recurrence,Cerebellum/Posterior Fossa,Female,White,Not Hispanic or Latino,5859.0,Ganglioglioma,stranded,3011.0,LIVING,202.0,PBTA,8870.0,NantOmics,None documented,ganglioglioma,C17712,Gross/Near total resection,Posterior fossa,,,Female,,,,7464.0,Alive,5862.0,,,,,,,,,,,,,"GNG, wildtype","Ganglioglioma, wildtype",Updated via OpenPedCan subtyping,"Ganglioglioma, wildtype",,Low-grade astrocytic tumor,Ganglioglioma,Ganglioglioma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BS_AH3RVK53,Female,PT_EN2RN5Y1,,SF11385,Diagnosis,,3159,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BS_JEZBA2EW,Female,PT_G16VK7FR,,CNMC-1277,Diagnosis,,2021,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
BS_XNYQS1WG,Female,PT_G16VK7FR,,7316-5814,Diagnosis,,2021,A19681,PT_G16VK7FR,Tumor,Initial CNS Tumor,Pons/Brainstem,Female,Reported Unknown,Not Hispanic or Latino,2021.0,Brainstem glioma- Diffuse intrinsic pontine gl...,,150.0,DECEASED,,PBTA,2171.0,BGI@CHOP Genome Center,None documented,"Diffuse midline glioma, H3K27M mutant, WHO gra...",C3093696,,Midline,,,Female,0.189144,0.810856,2.0,2021.0,,,,,,,,,,,,,,,"DMG, H3 K28, TP53","Diffuse midline glioma, H3 K28-mutant",Updated via OpenPedCan subtyping,"Diffuse midline glioma, H3 K28-mutant",,Diffuse astrocytic and oligodendroglial tumor,HGAT,Diffuse midline glioma
BS_CGXTFM67,Female,PT_M23Q0DC3,,7316-3212,Diagnosis,,1740,A04015,PT_M23Q0DC3,Tumor,Initial CNS Tumor,Pons/Brainstem,Female,Reported Unknown,Not Hispanic or Latino,1740.0,Brainstem glioma- Diffuse intrinsic pontine gl...,,882.0,DECEASED,,PBTA,2622.0,NantOmics,None documented,Diffuse Astrocytoma,C956940,,Midline,,,Female,0.000000,1.000000,2.0,1740.0,,,,,,,,,,,,,,,"DMG, H3 K28","Diffuse midline glioma, H3 K28-mutant",Updated via OpenPedCan subtyping,"Diffuse midline glioma, H3 K28-mutant",,Diffuse astrocytic and oligodendroglial tumor,HGAT,Diffuse midline glioma


In [512]:
df.gender.unique()

array(['Female', 'Male', 'Not Reported'], dtype=object)

In [516]:
def compare(df,left,right):
    print(f"NA values in {left}: {df[left].isna().sum()}")
    print(f"NA values in {right}: {df[right].isna().sum()}")
    print(f"NA values in both: {(df[left].isna() & df[right].isna()).sum()}")
    print(f"Different values: {((df[left] != df[right]) & (~df[left].isna()) & (~df[right].isna())).sum()}")
    return df[((df[left] != df[right]) & (~df[left].isna()) & (~df[right].isna()))]
compare(df,"age_at_diagnosis","age_at_diagnosis_days")
compare(df,"disease_type","pathology_diagnosis")
compare(df,"gender","reported_gender")

NA values in age_at_diagnosis: 0
NA values in age_at_diagnosis_days: 13
NA values in both: 0
Different values: 251
NA values in disease_type: 20
NA values in pathology_diagnosis: 13
NA values in both: 3
Different values: 83
NA values in gender: 52
NA values in reported_gender: 13
NA values in both: 0
Different values: 2


Unnamed: 0,gender,Kids First Participant ID,disease_type,sample_id,Tumor Descriptor,primary_site_x,age_at_diagnosis,aliquot_id,Kids_First_Participant_ID,sample_type,tumor_descriptor,primary_site_y,reported_gender,race,ethnicity,age_at_diagnosis_days,pathology_diagnosis,RNA_library,OS_days,OS_status,EFS_days,cohort,age_last_update_days,seq_center,cancer_predispositions,pathology_free_text_diagnosis,cohort_participant_id,extent_of_tumor_resection,CNS_region,gtex_group,gtex_subgroup,germline_sex_estimate,normal_fraction,tumor_fraction,tumor_ploidy,age_at_event_days,clinical_status_at_event,age_at_chemo_start,age_at_radiation_start,cell_line_composition,cell_line_passage,tumor_fraction_RFpurify_ABSOLUTE,tumor_fraction_RFpurify_ESTIMATE,tumor_fraction_LUMP,dkfz_v11_methylation_subclass,dkfz_v11_methylation_subclass_score,dkfz_v12_methylation_subclass,dkfz_v12_methylation_subclass_score,dkfz_v12_methylation_mgmt_status,dkfz_v12_methylation_mgmt_estimated,molecular_subtype,integrated_diagnosis,Notes,harmonized_diagnosis,molecular_subtype_methyl,broad_histology,short_histology,cancer_group
BS_969K7ZM1,Male,PT_S2ASFBP7,Craniopharyngioma,7316-2157,Initial CNS Tumor,Suprasellar/Hypothalamic/Pituitary,3355,1112042,PT_S2ASFBP7,Tumor,Initial CNS Tumor,Suprasellar/Hypothalamic/Pituitary,Female,Black or African American,Not Hispanic or Latino,3355.0,Craniopharyngioma,,1638.0,LIVING,1638.0,PBTA,4993.0,,None documented,craniopharyngioma who i,C579945,Gross/Near total resection,Suprasellar,,,Female,,,,3355.0,Alive,,,,,0.505627,0.829555,0.761779,"CPH, ADM",0.605,CPH_ADM,0.355549,unmethylated,0.008309,"CRANIO, ADAM",Adamantinomatous craniopharyngioma,Updated via OpenPedCan subtyping,Adamantinomatous craniopharyngioma,,Tumors of sellar region,Craniopharyngioma,Adamantinomatous Craniopharyngioma
BS_XCFNEZ8E,Male,PT_5D3AVPCW,Dysplasia/Gliosis,7316-187,Initial CNS Tumor,Frontal Lobe;Occipital Lobe;Parietal Lobe,184,1119219,PT_5D3AVPCW,Tumor,Initial CNS Tumor,Frontal Lobe;Occipital Lobe;Parietal Lobe,Female,White,Not Hispanic or Latino,184.0,Dysplasia/Gliosis,,,LIVING,,PBTA,184.0,,None documented,dysplasia/gliosis,C42927,Gross/Near total resection,Hemispheric,,,Male,,,,184.0,Alive,,,,,0.496469,0.863404,0.758558,"LGG, GG",0.22,GG,0.391281,unmethylated,0.01175,,,,Dysplasia/Gliosis,,Pre-cancerous lesion,Dysplasia,


In [506]:
def generate_sj_biosample_table():
    '''
    Notes:
    sj_diseases != attr_oncotree_disease_code = sj_associated_diagnoses_disease_code
    attr_diagnosis != sj_long_disease_name != sj_associated_diagnoses
    TODO:
    Add coarse tumor types as sunita's data.
    '''
    df = pd.DataFrame(index=get_pedpancan_biosamples_from_AC())
    columns = ['subject_name','sample_type','attr_age_at_diagnosis','attr_sex','sj_long_disease_name','sj_diseases','sj_dataset_accessions']
    add = import_sj_sample_info()[columns]
    add = add.sort_values(columns)
    add = add.loc[~add.index.duplicated()]
    df = pd.merge(left=df,how='inner',right=add, left_index=True, right_index=True)
    return df
b=generate_sj_biosample_table()
# Setup the output directory
OUT_DIR = pathlib.Path(pathlib.Path.cwd(),"out")
def makedirs(path):
    if not os.path.exists(path):
        # Create the directory
        os.makedirs(path)
makedirs(OUT_DIR)
b.to_csv(pathlib.Path(OUT_DIR,"sj_biosamples.tsv"), sep='\t')

In [501]:
c = import_sunita_sj_master_table()
c

Unnamed: 0_level_0,Survival Status,Date of Primary Dx,Date of Death,Overall Survival in Months,Most Recent Dx,Survival in Months since most recent Dx,Date of data collection
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SJACT001_D,,NaT,NaT,,NaT,,NaT
SJACT001_G,,NaT,NaT,,NaT,,NaT
SJACT002_D,,NaT,NaT,,NaT,,NaT
SJACT002_G,,NaT,NaT,,NaT,,NaT
SJACT003_D,,NaT,NaT,,NaT,,NaT
...,...,...,...,...,...,...,...
SJWLM031171_G1,,NaT,NaT,,NaT,,NaT
SJWLM031295_D1,Alive,2018-09-07,NaT,50.2,2018-09-07,50.2,2022-10-22
SJWLM031295_G1,,NaT,NaT,,NaT,,NaT
SJWLM031308_D1,Alive,2019-04-04,NaT,43.233333,2019-04-04,43.233333,2022-10-22


In [177]:
# Ependymomas with RNA-seq and survival data
df = import_opentarget_histologies_files()
df[(df.pathology_diagnosis=="Ependymoma") & (~df.OS_days.isna()) & (df.experimental_strategy == 'RNA-Seq')]

  df = pd.read_csv(path,sep='\t')


Unnamed: 0,Kids_First_Biospecimen_ID,sample_id,aliquot_id,Kids_First_Participant_ID,experimental_strategy,sample_type,composition,tumor_descriptor,primary_site,reported_gender,race,ethnicity,age_at_diagnosis_days,pathology_diagnosis,RNA_library,OS_days,OS_status,EFS_days,cohort,age_last_update_days,seq_center,cancer_predispositions,pathology_free_text_diagnosis,cohort_participant_id,extent_of_tumor_resection,CNS_region,gtex_group,gtex_subgroup,germline_sex_estimate,normal_fraction,tumor_fraction,tumor_ploidy,age_at_event_days,clinical_status_at_event,age_at_chemo_start,age_at_radiation_start,cell_line_composition,cell_line_passage,tumor_fraction_RFpurify_ABSOLUTE,tumor_fraction_RFpurify_ESTIMATE,tumor_fraction_LUMP,dkfz_v11_methylation_subclass,dkfz_v11_methylation_subclass_score,dkfz_v12_methylation_subclass,dkfz_v12_methylation_subclass_score,dkfz_v12_methylation_mgmt_status,dkfz_v12_methylation_mgmt_estimated,molecular_subtype,integrated_diagnosis,Notes,harmonized_diagnosis,molecular_subtype_methyl,broad_histology,short_histology,cancer_group
9,BS_00W5QB9S,7316-6972,1202560,PT_K84J9401,RNA-Seq,Tumor,Solid Tissue,Progressive,Cerebellum/Posterior Fossa,Male,Asian,Not Hispanic or Latino,3192.0,Ependymoma,poly-A stranded,1054.0,LIVING,192.0,PBTA,4246.0,,None documented,"anaplastic ependymoma, who grade iii",C2803539,Gross/Near total resection,Posterior fossa,,,Male,,,,3384.0,Alive,3482.0,3419.0,,,,,,,,,,,,"EPN, To be classified",,,Ependymoma,,Ependymal tumor,Ependymoma,Ependymoma
76,BS_07ANYSYQ,7316-2134,571441,PT_S4H6KA09,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Frontal Lobe,Female,White,Not Hispanic or Latino,1412.0,Ependymoma,stranded,2225.0,LIVING,1090.0,PBTA,3637.0,NantOmics,None documented,anaplastic ependymoma who grade iii,C522750,Gross/Near total resection,Hemispheric,,,Female,,,,1412.0,Alive,,1460.0,,,,,,,,,,,,"EPN, ST ZFTA","Supratentorial ependymoma, ZFTA fusion-positive",Updated via OpenPedCan subtyping,"Supratentorial ependymoma, ZFTA fusion-positive","EPN, ST ZFTA",Ependymal tumor,Ependymoma,Ependymoma
115,BS_0BXY0F9N,7316-1078,588352,PT_164RNWTT,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Cerebellum/Posterior Fossa,Male,Asian,Not Hispanic or Latino,2549.0,Ependymoma,stranded,560.0,LIVING,560.0,PBTA,3109.0,NantOmics,None documented,ependymoma grade iii anaplastic,C291633,Partial resection,Posterior fossa,,,Male,,,,2549.0,Alive,,,,,,,,,,,,,,"EPN, PF A",Posterior fossa group A (PFA) ependymoma,Updated via OpenPedCan subtyping,Posterior fossa group A (PFA) ependymoma,"EPN, PF A",Ependymal tumor,Ependymoma,Ependymoma
228,BS_0QYS36NR,7316-455,588317,PT_V3Q78E6F,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Ventricles,Female,White,Hispanic or Latino,337.0,Ependymoma,stranded,416.0,LIVING,416.0,PBTA,753.0,NantOmics,None documented,ependymoma,C96309,Partial resection,Ventricles,,,Female,,,,337.0,Alive,,377.0,,,,,,,,,,,,"EPN, To be classified",,,Ependymoma,,Ependymal tumor,Ependymoma,Ependymoma
285,BS_0WQJP6ZG,7316-425,470441,PT_Y6Y9JJ9P,RNA-Seq,Tumor,Solid Tissue,Progressive,Frontal Lobe,Female,Asian,Unavailable,2271.0,Ependymoma,stranded,882.0,DECEASED,385.0,PBTA,3153.0,BGI@CHOP Genome Center,None documented,anaplastic ependymoma who iii,C657435,Gross/Near total resection,Hemispheric,,,,,,,3013.0,Alive,3028.0,2696.0,,,,,,,,,,,,"EPN, ST ZFTA","Supratentorial ependymoma, ZFTA fusion-positive",Updated via OpenPedCan subtyping,"Supratentorial ependymoma, ZFTA fusion-positive","EPN, ST ZFTA",Ependymal tumor,Ependymoma,Ependymoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10137,BS_YE1MAQYJ,7316-3319,711675,PT_ZZRBX5JT,RNA-Seq,Tumor,Solid Tissue,Recurrence,Spinal Cord- Lumbar/Thecal Sac,Female,White,Not Hispanic or Latino,1878.0,Ependymoma,stranded,1963.0,LIVING,789.0,PBTA,3841.0,NantOmics,None documented,ependymoma,C121647,Gross/Near total resection,Spine,,,Female,,,,2967.0,Alive,3332.0,3295.0,,,,,,,,,,,,"EPN, PF A",Posterior fossa group A (PFA) ependymoma,Updated via OpenPedCan subtyping,Posterior fossa group A (PFA) ependymoma,"EPN, PF A",Ependymal tumor,Ependymoma,Ependymoma
10413,BS_Z7890YNR,7316-451,571397,PT_99S5BPE3,RNA-Seq,Tumor,Solid Tissue,Recurrence,Frontal Lobe;Parietal Lobe;Temporal Lobe,Male,White,Not Hispanic or Latino,6695.0,Ependymoma,stranded,2066.0,DECEASED,629.0,PBTA,8761.0,NantOmics,None documented,ependymoma,C20172,Partial resection,Hemispheric,,,Male,,,,8339.0,Alive,7362.0,6724.0,,,,,,,,,,,,"EPN, ST ZFTA","Supratentorial ependymoma, ZFTA fusion-positive",Updated via OpenPedCan subtyping,"Supratentorial ependymoma, ZFTA fusion-positive","EPN, ST ZFTA",Ependymal tumor,Ependymoma,Ependymoma
10463,BS_ZC1194Z8,7316-7959,1202538,PT_X7GTQR9N,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Cerebellum/Posterior Fossa;Ventricles,Female,White,Not Hispanic or Latino,1014.0,Ependymoma,poly-A stranded,734.0,LIVING,734.0,PBTA,1748.0,,None documented,posterior fossa ependymoma group pfa (by ihc),C3512757,Gross/Near total resection,Mixed,,,Female,,,,1014.0,Alive,1119.0,1044.0,,,,,,,,,,,,"EPN, PF A",Posterior fossa group A (PFA) ependymoma,Updated via OpenPedCan subtyping,Posterior fossa group A (PFA) ependymoma,"EPN, PF A",Ependymal tumor,Ependymoma,Ependymoma
10521,BS_ZJC7VQNA,7316-6143,1202555,PT_6N4TJ9G3,RNA-Seq,Tumor,Solid Tissue,Initial CNS Tumor,Ventricles,Male,White,Not Hispanic or Latino,6673.0,Ependymoma,poly-A stranded,917.0,LIVING,917.0,PBTA,7590.0,,Other inherited conditions NOS,"ependymoma, who grade ii",C2694930,Gross/Near total resection,Ventricles,,,Male,,,,6673.0,Alive,,,,,,,,,,,,,,"EPN, PF B",Posterior fossa group B (PFB) ependymoma,Updated via OpenPedCan subtyping,Posterior fossa group B (PFB) ependymoma,"EPN, PF B",Ependymal tumor,Ependymoma,Ependymoma
