## Notebook to pull file metadata from CAVATICA to match AmpliconArchitect runs to patient and biosample.
### Todo:
- Rename and reorganize AA file outputs and/or...
- Script a name_map file to map AA results to patient and biosample.

### Requires:
`mamba install sevenbridges-python pandas ipykernel` \
`python -m ipykernel install --user --name sevenbridges --display-name sevenbridges` \
AmpliconArchitect results are copied to `chapmano/pancancer-ecdna/X01-amplicon-architect/amplicon-architect`, but they are also available in the CBTN-PBTA dataset.

API docs at https://sevenbridges-python.readthedocs.io/en/latest/quickstart.html \
Examples at https://github.com/sbg/okAPI/tree/master/Recipes/SBPLAT

In [None]:
import sevenbridges as sbg
import pandas as pd
import pathlib
import os
import shutil

# Remove the column limit so displayed DataFrames show every column.
pd.set_option('display.max_columns', None)

In [None]:
## Setup API, set global variables

# default config location is ~/.sevenbridges/credentials
# The client is built from the 'cavatica' profile of that config.
api = sbg.Api(config=sbg.Config(profile='cavatica'))

# Project that the project-scoped file queries in this notebook run against.
PROJECT_ID='chapmano/pancancer-ecdna'

# Output directory: "out" inside the current working directory
OUT_DIR = pathlib.Path.cwd() / "out"
def makedirs(path):
    '''
    Create the directory `path`, including any missing parent directories.

    Calling it on a directory that already exists is a no-op.
    Raises FileExistsError if `path` exists but is not a directory, instead
    of silently returning as if the directory were available.
    '''
    # exist_ok=True replaces the separate exists() check, so there is no
    # window between checking and creating in which the call can fail.
    os.makedirs(path, exist_ok=True)


## Download X01 dataset AA results

In [None]:
def get_aa_results_directory(cohort="X01"):
    '''
    Get the directory containing AA results for a cohort.

    cohort: may be X01, X00 or PNOC
    Returns the first result of the file query for that directory (the
    object itself, not just its ID).
    Raises ValueError for an unsupported cohort, and IndexError when a
    directory on the path is not found.
    '''
    # Directory path below the project root for each supported cohort:
    #   X01  -> chapmano/pancancer-ecdna/X01-amplicon-architect/amplicon-architect
    #   X00  -> chapmano/pancancer-ecdna/wgs
    #   PNOC -> chapmano/pancancer-ecdna/pnoc
    cohort_paths = {
        "X01": ("X01-amplicon-architect", "amplicon-architect"),
        "X00": ("wgs",),
        "PNOC": ("pnoc",),
    }
    try:
        path_parts = cohort_paths[cohort]
    except KeyError:
        # Validate before any API call so a typo fails fast and clearly.
        raise ValueError(
            f"Unknown cohort {cohort!r}; expected one of {sorted(cohort_paths)}"
        ) from None

    directory = None
    for part in path_parts:
        # The first level is looked up by project, deeper levels by parent.
        if directory is None:
            matches = api.files.query(project=PROJECT_ID, names=[part])
        else:
            matches = api.files.query(parent=directory, names=[part])
        try:
            directory = matches[0]
        except IndexError:
            # Same exception type as the bare index error, with context added.
            raise IndexError(
                f"Directory {part!r} not found for cohort {cohort!r} in {PROJECT_ID}"
            ) from None
    return directory

# Resolve the X00 results directory (chapmano/pancancer-ecdna/wgs) into a module-level name.
AA_RESULTS=get_aa_results_directory("X00")

In [None]:
# Bare expression: shows the resolved directory object as the cell output.
AA_RESULTS

In [None]:
def get_metadata_table(cohort="X01", uuid_sep="."):
    '''
    Generate the metadata table of WGS corresponding to a sub-cohort.

    cohort: may be X01, X00 or PNOC
    uuid_sep: separator that ends the WGS run identifier at the start of each
        file name. WGS_UUID is the part of the file name before its first
        occurrence. Defaults to "." (the split that was actually in effect);
        pass "_" for file names of the form <uuid>_<suffix>.
    Returns a DataFrame indexed by "Kids First Biospecimen ID", sorted by
    participant and tumor descriptor, with fully duplicated rows removed.
    Raises KeyError if an expected metadata column is absent from every file
    (which includes the case of an empty results directory).
    '''
    columns = [
        "Kids First Biospecimen ID", "gender", "race", "ethnicity",
        "Kids First Participant ID", "disease_type", "sample_id",
        "Tumor Descriptor", "primary_site", "age_at_diagnosis", "case_id",
    ]
    results_dir = get_aa_results_directory(cohort)

    # Assemble table of metadata, one row per file, indexed by file name.
    # The rows are collected first and concatenated once; concatenating
    # inside the loop re-copies the growing table on every iteration.
    rows = [
        pd.DataFrame(data=file.metadata, index=[file.name])
        for file in api.files.query(parent=results_dir).all()
    ]
    metadata = pd.concat(rows) if rows else pd.DataFrame()

    # .copy() so the column added below is written to an independent frame
    # rather than to a selection of the concatenated table.
    metadata = metadata[columns].copy()
    # WGS run identifier parsed from the file name (intended as a 1:1 mapping
    # of WGS run to biosample). Previously a split on "_" was computed here
    # and then immediately overwritten by the split on "."; only one split is
    # performed now, selected by uuid_sep.
    metadata['WGS_UUID'] = metadata.index.map(lambda name: name.split(uuid_sep)[0])

    return (
        metadata
        .set_index('Kids First Biospecimen ID')
        .sort_values(["Kids First Participant ID", "Tumor Descriptor"])
        .drop_duplicates()
    )


In [None]:
# Build the PNOC metadata table and print a short summary of it.
metadata = get_metadata_table("PNOC")
#print(f'{len(metadata)} files') 3131 files
# The f-strings are double-quoted on the outside: reusing the enclosing quote
# character inside {...} is a SyntaxError on Python versions before 3.12.
print(f"{len(metadata.drop_duplicates('Kids First Participant ID'))} unique patients")
# 'Kids First Biospecimen ID' is the index of this table, not a column, so
# unique biosamples have to be counted from the index:
#print(f"{metadata.index.nunique()} unique biosamples")
print(f"Unique sample types: {metadata['Tumor Descriptor'].unique()}")
metadata

In [None]:
# Write metadata table
def write_metadata_table(metadata,filename="X01-biosample-metadata.tsv"):
    '''
    Save `metadata` as a tab-separated file called `filename` inside OUT_DIR.
    OUT_DIR is created first if it does not exist yet.
    '''
    makedirs(OUT_DIR)
    destination = pathlib.Path(OUT_DIR).joinpath(filename)
    metadata.to_csv(path_or_buf=destination, sep='\t')

# Write name_map file for AmpliconClassifier
def write_name_map(metadata,filename="X01-name-map.txt"):
    '''
    Write the name_map file into OUT_DIR and return the mapping.

    The mapping is a Series indexed by WGS_UUID whose values are
    "<Kids First Participant ID>-<index label of metadata>". It is saved
    tab-separated, without a header row.
    '''
    makedirs(OUT_DIR)
    # Positional pairing: the new names are taken as raw values so they are
    # not re-aligned against the WGS_UUID index.
    new_names = metadata["Kids First Participant ID"] + "-" + metadata.index
    name_map = pd.Series(data=new_names.values, index=metadata["WGS_UUID"])
    destination = pathlib.Path(OUT_DIR).joinpath(filename)
    name_map.to_csv(destination, sep='\t', header=False)
    return name_map

In [None]:
# Build the PNOC table and save it under OUT_DIR as PNOC-biosample-metadata.tsv.
metadata = get_metadata_table("PNOC")
write_metadata_table(metadata,"PNOC-biosample-metadata.tsv")
# The name_map export is disabled here; uncomment to write it as well.
#write_name_map(metadata)

In [None]:
# Rename all the files anyway
def rename_AA_outputs(indir, outdir, metadata):
    '''
    Copy AA output files into <outdir>/<patient>/<biosample>/, replacing the
    WGS UUID in each file name with "<patient>-<biosample>".

    indir: directory of AA output files; sub-directories are ignored. The
        UUID of a file is the part of its name before the first "_", so the
        WGS_UUID values in `metadata` must follow the same convention.
    outdir: root of the renamed copy; created as needed.
    metadata: table with "WGS_UUID" and "Kids First Participant ID" columns,
        and "Kids First Biospecimen ID" either as a column or as the index
        (the form returned by get_metadata_table).
    Raises KeyError if any file's UUID has no metadata row, and ValueError if
    one UUID maps to more than one patient/biosample. Both checks run before
    anything is copied, so a failure never leaves a partial output tree.
    '''
    uuid_col = "WGS_UUID"
    patient_col = "Kids First Participant ID"
    biosample_col = "Kids First Biospecimen ID"

    # get_metadata_table() moves the biospecimen ID into the index; selecting
    # it as a column would raise KeyError, so restore it as a column first.
    if biosample_col not in metadata.columns and metadata.index.name == biosample_col:
        metadata = metadata.reset_index()

    # map UUIDs to patient and biosample names.
    lookup = metadata[[uuid_col, patient_col, biosample_col]].drop_duplicates()
    if lookup[uuid_col].duplicated().any():
        conflicting = sorted(lookup.loc[lookup[uuid_col].duplicated(), uuid_col].unique())
        raise ValueError(f"WGS_UUID maps to more than one biosample: {conflicting}")
    lookup = lookup.set_index(uuid_col)

    # First pass: decide what to copy and collect files without metadata.
    planned = []
    unknown = []
    for entry in sorted(pathlib.Path(indir).iterdir()):
        if not entry.is_file():
            continue
        uuid = entry.name.split('_')[0]
        if uuid in lookup.index:
            planned.append((entry, uuid))
        else:
            unknown.append(entry.name)
    if unknown:
        raise KeyError(f"No metadata row matches the WGS UUID of: {unknown}")

    # Second pass: copy each file under its patient/biosample directory.
    for entry, uuid in planned:
        patient_id = lookup.at[uuid, patient_col]
        biosample_id = lookup.at[uuid, biosample_col]
        dest = pathlib.Path(outdir, patient_id, biosample_id)
        dest.mkdir(parents=True, exist_ok=True)
        new_name = entry.name.replace(uuid, patient_id + "-" + biosample_id)
        shutil.copy2(entry, dest / new_name)


# Local source of the AA results and destination of the renamed copies,
# both resolved against the current working directory.
indir = pathlib.Path.cwd().joinpath("data", "amplicon-architect")
outdir = pathlib.Path.cwd().joinpath("out", "amplicon-architect")
#rename_AA_outputs(indir, outdir, metadata)

## Download AmpliconSuite-Grouped results

In [None]:
# ID passed as `parent` when listing the AmpliconSuite-Grouped result files.
ASG_dir_id='66ba7831a4373d49ee84e4e1'

def download_asg(max_files=None):
    '''
    Download the AmpliconSuite-Grouped result archives to OUT_DIR/ASG and
    unpack each one next to its archive.

    max_files: optional cap on how many files are processed; None (the
        default) processes all of them. Use max_files=1 for a quick trial run.
    An archive that already exists locally is not downloaded again, but it is
    still unpacked.
    '''
    from itertools import islice

    # create directory at ./out/ASG
    out_dir = pathlib.Path(OUT_DIR,"ASG")
    makedirs(out_dir)
    # download all results to out/ASG
    files = api.files.query(parent=ASG_dir_id).all()
    # The loop used to end in an unconditional `break`, so only the first
    # file was ever fetched; the cap is now explicit and off by default.
    for file in islice(files, max_files):
        archive = pathlib.Path(out_dir, file.name)
        # Extraction directory is the archive name minus its last suffix.
        extract_dir = pathlib.Path(out_dir, archive.stem)
        try:
            file.download(path=str(archive))
        except sbg.LocalFileAlreadyExists:
            print(f'zip file {file.name} already exists')
        shutil.unpack_archive(filename=archive, extract_dir=extract_dir)
    
# Run the AmpliconSuite-Grouped download when this cell executes.
download_asg()