## Notebook to pull file metadata from CAVATICA to match AmpliconArchitect runs to patient and biosample.
### Todo:
- Rename and reorganize the AmpliconArchitect (AA) output files, and/or
- Script a name_map file that maps AA results to patient and biosample.

### Requires:
`mamba install sevenbridges-python pandas ipykernel` \
`python -m ipykernel install --user --name cavatica --display-name cavatica` \
AmpliconArchitect results are copied to `chapmano/pancancer-ecdna/X01-amplicon-architect/amplicon-architect`; they are also available in the CBTN-PBTA dataset.

API docs at https://sevenbridges-python.readthedocs.io/en/latest/quickstart/ \
Examples at https://github.com/sbg/okAPI/tree/master/Recipes/SBPLAT

In [1]:
import sevenbridges as sbg
import pandas as pd
import pathlib
import os
import shutil

# Remove the column limit so displayed DataFrames show every column.
pd.options.display.max_columns = None

## X01 dataset

In [43]:
## Global setup: project identifier and API client

# CAVATICA project that holds the AmpliconArchitect results.
PROJECT_ID = "chapmano/pancancer-ecdna"

# Credentials come from the 'cavatica' profile; the default config
# location is ~/.sevenbridges/credentials
api = sbg.Api(config=sbg.Config(profile="cavatica"))

def get_aa_results_directory(cohort="X01"):
    '''
    Get the directory object containing AA results for a cohort.

    cohort: may be X01, X00 or PNOC. Each cohort maps to a folder path
        inside PROJECT_ID:
            X01  -> X01-amplicon-architect/amplicon-architect
            X00  -> wgs
            PNOC -> pnoc

    Returns the first match of the query for the last folder in the path.

    Raises ValueError if cohort is not one of the names above, and
    IndexError if one of the folders on the path cannot be found.
    '''
    folder_paths = {
        "X01": ("X01-amplicon-architect", "amplicon-architect"),
        "X00": ("wgs",),
        "PNOC": ("pnoc",),
    }
    try:
        folder_names = folder_paths[cohort]
    except KeyError:
        # Previously an unknown cohort fell through every branch and failed
        # with an UnboundLocalError on the return statement.
        raise ValueError(
            f"Unknown cohort {cohort!r}; expected one of {sorted(folder_paths)}"
        ) from None

    # Folders are resolved one level at a time: the top-level folder is
    # queried by project, each nested folder by its parent directory.
    directory = None
    for name in folder_names:
        if directory is None:
            matches = api.files.query(project=PROJECT_ID, names=[name])
        else:
            matches = api.files.query(parent=directory, names=[name])
        try:
            directory = matches[0]
        except IndexError:
            raise IndexError(
                f"Folder {name!r} not found in project {PROJECT_ID!r} "
                f"for cohort {cohort!r}"
            ) from None
    return directory

# Resolve the results directory for the X00 cohort once, at module level.
AA_RESULTS = get_aa_results_directory(cohort="X00")


In [44]:
# Echo the resolved directory object as the cell output.
AA_RESULTS

<File: id=6168b8d891d8b939db563344>

In [58]:
def get_metadata_table(cohort="X01"):
    '''
    Generate the metadata table of WGS corresponding to a sub-cohort.

    cohort: may be X01, X00 or PNOC

    Returns a DataFrame indexed by "Kids First Biospecimen ID", sorted by
    participant ID and tumor descriptor, with fully duplicated rows removed.
    It holds the selected metadata columns plus "WGS_UUID", the file-name
    text before the first '.'.

    Raises KeyError if any of the selected metadata columns is missing
    (this includes the case where the directory contains no files).
    '''
    columns = ["Kids First Biospecimen ID", "gender", "race", "ethnicity",
               "Kids First Participant ID", "disease_type", "sample_id",
               "Tumor Descriptor", "primary_site", "age_at_diagnosis",
               "case_id"]
    results_dir = get_aa_results_directory(cohort)

    # One single-row frame per file, concatenated once at the end instead of
    # re-copying the growing table on every iteration.
    frames = [pd.DataFrame(data=file.metadata, index=[file.name])
              for file in api.files.query(parent=results_dir).all()]
    metadata = pd.concat(frames) if frames else pd.DataFrame()

    # .copy() so the column assignment below acts on an independent frame.
    metadata = metadata[columns].copy()

    # 1:1 mapping WGS run to biosample.
    # NOTE(review): an earlier assignment that split the file name on '_' was
    # immediately overwritten by this one and has been removed as dead code.
    # rename_AA_outputs derives its UUID by splitting on '_' -- confirm that
    # '.' is the right separator for every cohort's file names.
    metadata['WGS_UUID'] = metadata.index.map(lambda x: x.split('.')[0])

    metadata = metadata.set_index('Kids First Biospecimen ID')
    metadata = metadata.sort_values(["Kids First Participant ID", "Tumor Descriptor"])
    metadata = metadata.drop_duplicates()
    return metadata


In [59]:
metadata = get_metadata_table("PNOC")
#print(f"{len(metadata)} files") 3131 files
# Outer double quotes keep these f-strings valid on Python versions before
# 3.12, the first release that allows reusing the enclosing quote inside {}.
print(f"{len(metadata.drop_duplicates('Kids First Participant ID'))} unique patients")
#print(f"{len(metadata.drop_duplicates('Kids First Biospecimen ID'))} unique biosamples")
print(f"Unique sample types: {metadata['Tumor Descriptor'].unique()}")
metadata

34 unique patients
Unique sample types: ['Diagnosis' 'Autopsy' 'Progressive Disease Post-Mortem' 'Progressive'
 'Relapse']


Unnamed: 0_level_0,gender,race,ethnicity,Kids First Participant ID,disease_type,sample_id,Tumor Descriptor,primary_site,age_at_diagnosis,case_id,WGS_UUID
Kids First Biospecimen ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BS_8SYN7GXG,Male,White,Not Hispanic or Latino,PT_0MXPTTM3,Anaplastic Astrocytoma,7316-3220-T-A09410.WGS,Diagnosis,Brain Stem,9425,C3080535,3526c1ab-a793-4c5f-9576-d922835dbd78
BS_DRVEFVQ5,Male,Reported Unknown,Reported Unknown,PT_1AAYYGGY,Anaplastic Astrocytoma,7316-4996-T-A14710.WGS,Diagnosis,Brain Stem,4608,C3079920,d182b330-17aa-47de-9c3a-5fee37a4ee33
BS_Y96RP1HJ,Male,Reported Unknown,Reported Unknown,PT_1AAYYGGY,Anaplastic Astrocytoma,7316-4996-T-A14709.WGS,Diagnosis,Brain Stem,4608,C3079920,e3629477-53cc-40c6-a4f6-72ecc70a4451
BS_7GKF6M85,Female,Asian,Not Hispanic or Latino,PT_1E3E6GMF,Diffuse Astrocytoma,7316-3224-T-A09985.WGS,Diagnosis,Brain Stem,2211,C3081150,2865b67b-1458-492e-a39e-57900f34da9f
BS_169P1QCA,Male,White,Not Hispanic or Latino,PT_1YQH5NSH,Diffuse midline glioma; H3K27M mutant; WHO gra...,7316-5922-T-SF11653.WGS,Diagnosis,Brain Stem,2520,C3093819,64b37304-0567-4f2c-9790-b3cce218955c
...,...,...,...,...,...,...,...,...,...,...,...
BS_VXDGXQKZ,Female,Reported Unknown,Reported Unknown,PT_VPEMAQBN,Anaplastic Astrocytoma,7316-3235-T-SF10693.WGS,Diagnosis,Brain Stem,1709,C3078444,7ad281cf-82c0-46b9-9a23-8348d623ba77
BS_38CD519Z,Male,Asian,Hispanic or Latino,PT_W5GP3F6B,Diffuse midline glioma; H3K27M mutant; WHO gra...,7316-5003-T-A16915.WGS,Diagnosis,Brain Stem,2459,C3092712,80fd23ab-de59-45da-80f6-49af162fe982
BS_4DQAQFQH,Female,Reported Unknown,Hispanic or Latino,PT_WGVEF96B,Diffuse Astrocytoma,7316-4446-T-SF10438.WGS,Diagnosis,Brain Stem,2853,C3080043,3d9ec140-cf1f-48e0-b4bd-d9ae775b29b0
BS_TQ0J7WJQ,Female,Reported Unknown,Hispanic or Latino,PT_WGVEF96B,Diffuse Astrocytoma,7316-3219-T-A08958.WGS,Diagnosis,Brain Stem,2853,C3080043,c2e040bd-64b7-4928-8428-1da909cb2873


In [60]:
# Output directory: "out" under the current working directory
OUT_DIR = pathlib.Path.cwd() / "out"
def makedirs(path):
    '''
    Create the directory `path`, including missing parents.

    Does nothing if the directory already exists. Creation is a single call
    rather than an exists-check followed by a create, so another process
    creating the same directory in between cannot cause a failure.

    Raises FileExistsError if `path` exists but is not a directory.
    '''
    os.makedirs(path, exist_ok=True)
    
# Write metadata table
def write_metadata_table(metadata,filename="X01-biosample-metadata.tsv"):
    '''
    Save `metadata` as a tab-separated file called `filename` inside OUT_DIR.
    OUT_DIR is created first if it does not exist yet.
    '''
    makedirs(OUT_DIR)
    destination = OUT_DIR / filename
    metadata.to_csv(destination, sep='\t')

# Write name_map file for AmpliconClassifier
def write_name_map(metadata,filename="X01-name-map.txt"):
    '''
    Build and save the name_map for AmpliconClassifier.

    Each entry maps a WGS_UUID to "<participant ID>-<index value>", where the
    index value is the index of `metadata`. The map is written to
    OUT_DIR/`filename` as two tab-separated columns without a header, and is
    also returned as a Series.
    '''
    makedirs(OUT_DIR)
    labels = metadata["Kids First Participant ID"] + "-" + metadata.index
    name_map = pd.Series(data=labels.values, index=metadata["WGS_UUID"])
    name_map.to_csv(OUT_DIR / filename, sep='\t', header=False)
    return name_map

In [61]:
# Build the PNOC metadata table and save it to the output directory.
metadata = get_metadata_table(cohort="PNOC")
write_metadata_table(metadata, filename="PNOC-biosample-metadata.tsv")
#write_name_map(metadata)

In [21]:
# Rename all the files anyway
def rename_AA_outputs(indir, outdir, metadata):
    '''
    Copy AA output files into outdir/<patient>/<biosample>/, replacing the
    leading WGS UUID of each file name with "<patient>-<biosample>".

    indir: directory of AA output files named "<WGS_UUID>_...". Entries that
        are not regular files are ignored.
    outdir: root of the reorganized tree; directories are created as needed.
    metadata: table with "WGS_UUID" and "Kids First Participant ID" columns,
        and "Kids First Biospecimen ID" either as a column or as the index
        (the form returned by get_metadata_table).

    Source files are copied, not moved.

    Raises KeyError if a required field is missing from metadata or a file's
    UUID has no metadata row, and ValueError if one UUID maps to more than
    one patient/biosample pair.
    '''
    patient_col = "Kids First Participant ID"
    biosample_col = "Kids First Biospecimen ID"

    # get_metadata_table() moves the biospecimen ID into the index; bring it
    # back as a column so it can be selected below.
    if biosample_col not in metadata.columns:
        metadata = metadata.reset_index()

    # map UUIDs to patient and biosample names.
    lookup = (metadata[["WGS_UUID", patient_col, biosample_col]]
              .drop_duplicates()
              .set_index("WGS_UUID"))
    if not lookup.index.is_unique:
        ambiguous = sorted(set(lookup.index[lookup.index.duplicated()]))
        raise ValueError(
            f"WGS_UUID maps to more than one patient/biosample: {ambiguous}"
        )

    # Sorted so files are processed in a deterministic order.
    for file_path in sorted(pathlib.Path(indir).iterdir()):
        if not file_path.is_file():
            continue
        filename = file_path.name
        uuid = filename.split('_')[0]
        if uuid not in lookup.index:
            raise KeyError(
                f"No metadata row for WGS_UUID {uuid!r} (file {filename!r})"
            )
        patient_id = lookup.at[uuid, patient_col]
        biosample_id = lookup.at[uuid, biosample_col]
        dest = pathlib.Path(outdir, patient_id, biosample_id)
        dest.mkdir(parents=True, exist_ok=True)
        # Swap only the leading UUID; the rest of the name is kept verbatim.
        new_name = f"{patient_id}-{biosample_id}" + filename[len(uuid):]
        shutil.copy2(file_path, dest / new_name)


# Local input and output locations for the AmpliconArchitect files,
# both relative to the current working directory.
_cwd = pathlib.Path.cwd()
indir = _cwd / "data" / "amplicon-architect"
outdir = _cwd / "out" / "amplicon-architect"
#rename_AA_outputs(indir, outdir, metadata)