In [None]:
# Rerun AmpliconArchitect (AA) on the merged, sorted seed files of all samples with shared ancestry (i.e. samples from the same patient).

In [None]:
# Identify all patients with more than one biosample
# Get all of their seed BED files
# Merge and sort the seed BED files
# Rerun AA on every sample using the shared BED file
# CAVATICA python API client docs at https://sevenbridges-python.readthedocs.io/en/latest/sevenbridges.html

In [None]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
from collections import OrderedDict

# Make ../src importable. This must run before `import data_imports` below,
# which is presumably resolved from that directory (confirm).
sys.path.append('../src')
# Ensure a local "out" directory exists; no error if it is already there.
Path("out").mkdir(parents=True, exist_ok=True)

import data_imports

import sevenbridges as sbg
import pathlib
import os
import shutil

# Do not truncate columns when displaying DataFrames.
pd.set_option('display.max_columns', None)

In [None]:
# GLOBALS

# Authenticated API client using the 'cavatica' profile.
# The default config location is ~/.sevenbridges/credentials
api = sbg.Api(config=sbg.Config(profile='cavatica'))

# Project and app identifiers, plus the biosample table from the local helper module.
PROJECT_ID = 'chapmano/pancancer-ecdna'
APP = 'chapmano/pancancer-ecdna/ampliconsuite-grouped-cram'
BIOSAMPLES = data_imports.import_biosamples()

# Fixed task inputs, looked up by file ID.
IN_REFERENCE = api.files.get(id='615cd166ec5e113414f33c0d')
MOSEK_DIR = api.files.get(id='616e08dfaa99d11c4bb0dd59')
DATA_REPO = api.files.get(id='6181b500ba246329d9313cf4')
REFERENCE_FILE_LIST = api.files.get(id='615cd166ec5e113414f33c0b')

# Everything under the project's 'wgs' folder (first query hit is used).
tumor_wgs_dir = api.files.query(project=PROJECT_ID, names=['wgs'])[0]
tumor_wgs_files = list(api.files.query(parent=tumor_wgs_dir).all())

# Everything under the project's 'normal-wgs' folder (first query hit is used).
normal_wgs_dir = api.files.query(project=PROJECT_ID, names=['normal-wgs'])[0]
normal_wgs_files = list(api.files.query(parent=normal_wgs_dir).all())

In [None]:
def submit_grouped_job(pt, verbose=False, dry_run=False):
    """Create one grouped task on CAVATICA covering every CRAM of a participant.

    Every file in ``tumor_wgs_files`` and ``normal_wgs_files`` whose name ends
    in ``.cram`` and whose 'Kids First Participant ID' metadata equals ``pt``
    becomes one entry of the task's ``input_record`` list (tumor files first,
    then normal files).

    Parameters
    ----------
    pt : str
        Participant ID. Also used as the task's ``output_dir`` and as the
        prefix of the task name (``"<pt>-grouped"``).
    verbose : bool, default False
        If True, print the participant ID and the full inputs dict.
    dry_run : bool, default False
        If True, create the task as a draft (``run=False``) and delete it
        immediately. If False, create the task and run it.

    Returns
    -------
    The task object returned by ``api.tasks.create``. In a dry run this task
    has already been deleted on the platform by the time it is returned.
    """
    # One record per matching CRAM; tumor files are listed before normal files.
    samples = [
        {
            'sample_name': wgs_file.metadata['Kids First Biospecimen ID'],
            'cram': wgs_file,
            'tumor_normal': label,
        }
        for label, wgs_files in (('tumor', tumor_wgs_files), ('normal', normal_wgs_files))
        for wgs_file in wgs_files
        if wgs_file.name.endswith('.cram')
        and wgs_file.metadata['Kids First Participant ID'] == pt
    ]
    # NOTE(review): `samples` may be empty if no CRAM matches `pt`; the task is
    # still created in that case, as before.

    # Set inputs
    inputs = {
        'in_reference' : IN_REFERENCE,
        'mosek_dir' : MOSEK_DIR,
        'data_repo' : DATA_REPO,
        'reference_file_list' : REFERENCE_FILE_LIST,
        'ref' : 'hg38',
        'cngain' : 4.5,
        'output_dir' : pt,
        'input_record' : samples,
    }
    execution_settings = {
        'use_memoization' : True,
    }

    # Report before contacting the API so the inputs are visible even if
    # task creation raises.
    if verbose:
        print(f'creating task for {pt}:\n{inputs}')

    if dry_run:
        # Create the task as a draft without running it, then remove it.
        task = api.tasks.create(name=pt+"-grouped", project=PROJECT_ID, app=APP, inputs=inputs, execution_settings=execution_settings, interruptible=False, run=False)
        api.tasks.delete(task)
    else:
        task = api.tasks.create(name=pt+"-grouped", project=PROJECT_ID, app=APP, inputs=inputs, execution_settings=execution_settings, run=True)

    # Return the task itself; the previous code returned the leaked loop
    # variable `file`, which was unrelated to `pt`.
    return task

In [None]:
# Dry-run check for a single participant: the task is created as a draft and
# deleted again inside submit_grouped_job, so nothing is executed.
asdf = submit_grouped_job(
    'PT_CXT81GRM',
    verbose=True,
    dry_run=True,
)

In [None]:
# Keep displayed DataFrames short.
pd.set_option('display.max_rows', 5)

# tumor_history values accepted for longitudinal cases.
# Excluded: second malignancies, no sample, unavailable.
SECONDARIES = [
    'Diagnosis',
    'Progressive',
    'Autopsy',
    'Recurrence',
    'Relapse',
    'Metastasis',
]
def get_cbtn_pairs():
    '''
    Return the BIOSAMPLES rows of longitudinal CBTN cases.

    A longitudinal case is a patient from CBTN whose samples have different
    dates of diagnosis. Rows are kept when all of the following hold:
      - cohort is "PBTA-X00" or "PBTA-X01";
      - tumor_history is one of SECONDARIES;
      - the patient_id occurs more than once in the whole of BIOSAMPLES
        (checked before the other two conditions are applied).
    Among those rows, a patient is retained only if the spread between the
    largest and smallest age_at_diagnosis is at least 30
    (presumably days -- TODO confirm the unit).

    The result is sorted by patient_id, then age_at_diagnosis.
    '''
    in_cohort = BIOSAMPLES.cohort.isin(["PBTA-X00","PBTA-X01"])
    history_ok = BIOSAMPLES.tumor_history.isin(SECONDARIES)
    repeated_patient = BIOSAMPLES.duplicated('patient_id',keep=False)
    candidates = BIOSAMPLES[in_cohort & history_ok & repeated_patient]

    def _diagnoses_far_apart(patient_rows):
        # True when this patient's diagnosis ages span 30 or more.
        ages = patient_rows['age_at_diagnosis']
        return ages.max()-ages.min()>=30

    longitudinal = candidates.groupby('patient_id').filter(_diagnoses_far_apart)
    return longitudinal.sort_values(["patient_id","age_at_diagnosis"])
def get_longitudinal_cases(verbose=True):
    '''
    Return the longitudinal cases produced by get_cbtn_pairs().

    When verbose is True, also print how many distinct patients there are and
    how many of them have at least one row with amplicon_class == 'ecDNA'.
    '''
    cases = get_cbtn_pairs()
    if not verbose:
        return cases

    n_patients = cases.patient_id.nunique()
    ecdna_rows = cases[cases.amplicon_class == 'ecDNA']
    n_ecdna_patients = ecdna_rows.patient_id.nunique()
    print(f"{n_ecdna_patients} of {n_patients} longitudinal cases have ecDNA")
    return cases
def submit_longitudinal_samples(skip=(), dry_run=False):
    '''
    Submit one grouped job per longitudinal patient that has ecDNA.

    Patients come from get_longitudinal_cases(). A patient is submitted only
    if at least one of its rows has amplicon_class == 'ecDNA' and its ID is
    not listed in `skip`.

    Parameters
    ----------
    skip : collection of str, optional
        Patient IDs to leave out. Defaults to an empty tuple; None is treated
        the same way.
    dry_run : bool, default False
        Forwarded to submit_grouped_job. The default (False) really submits
        and runs the tasks, as before; pass True to rehearse the batch.

    Returns
    -------
    int
        Number of patients for which submit_grouped_job was called.
    '''
    # Immutable default instead of a shared mutable list.
    if skip is None:
        skip = ()

    cases = get_longitudinal_cases()
    n_submitted = 0
    for pt, patient_rows in cases.groupby('patient_id'):
        has_ecdna = (patient_rows.amplicon_class == 'ecDNA').any()
        if (not has_ecdna) or (pt in skip):
            continue
        submit_grouped_job(pt, verbose=True, dry_run=dry_run)
        n_submitted += 1
    return n_submitted
# Submit the longitudinal ecDNA cases, leaving out the two listed patients,
# then report how many jobs were submitted.
ct = submit_longitudinal_samples(
    skip=[
        'PT_CXT81GRM',
        'PT_00G007DM',
    ],
)
print(ct)

In [None]:
# This would be at least 700 AA reruns - probably excessive
def submit_all_multitumor_samples():
    '''
    Dry-run a grouped job for every patient with two or more BIOSAMPLES rows.

    Each qualifying patient is passed to submit_grouped_job with dry_run=True,
    so no task is actually run. Returns the number of qualifying patients.
    '''
    n_patients = 0
    for patient_id, patient_rows in BIOSAMPLES.groupby('patient_id'):
        # Only patients with at least two rows are of interest.
        if len(patient_rows) >= 2:
            submit_grouped_job(patient_id, verbose=False, dry_run=True)
            n_patients += 1
    return n_patients
# Dry-run every patient with two or more biosample rows and print how many there were.
# Note: this rebinds the notebook-level name `ct`.
ct = submit_all_multitumor_samples()
print(ct)