## Firecloud: Uploading Metadata to Firecloud
Mimoun Cadosch 7/17

In [1]:
from firecloud import api as firecloud_api
import pandas as pd
import os
from IPython.display import display

In [2]:
## Resources
# https://github.com/broadinstitute/fiss/blob/master/firecloud/api.py
# https://github.com/broadinstitute/firecloud-tools

In [3]:
# res = firecloud_api.delete_workspace_config(namespace, workspace, "tsca", "Mutect2TumorOnly")
# res = firecloud_api.delete_workspace_config(namespace, workspace, 'tsca', 'CreatePanelOfNormalsGATK')
# delete_sample(namespace, workspace, "AA66-Tumor-SM-F29RQ")
# res = delete_sample_set(namespace, workspace, sample_set_id='CumPoN_1920')
# firecloud_api.delete_repository_method('tsca', 'CallSomaticCNV', 10)

In [4]:
def delete_sample(namespace, workspace, sample_id):
    """Remove a single sample entity from a workspace.

    Args:
        namespace: forwarded unchanged to firecloud_api.delete_entities
        workspace: forwarded unchanged to firecloud_api.delete_entities
        sample_id: name of the sample entity to delete
    Returns:
        The value returned by firecloud_api.delete_entities (HTTP Response)
    """
    # The API takes a list of entity descriptors; we send exactly one
    sample_entity = {"entityType": "sample", "entityName": sample_id}
    return firecloud_api.delete_entities(namespace, workspace, [sample_entity])

In [5]:
def delete_sample_set(namespace, workspace, sample_set_id):
    """Remove a single sample_set entity from a workspace.

    Args:
        namespace: forwarded unchanged to firecloud_api.delete_entities
        workspace: forwarded unchanged to firecloud_api.delete_entities
        sample_set_id: name of the sample_set entity to delete
    Returns:
        The value returned by firecloud_api.delete_entities (HTTP Response)
    """
    # The API takes a list of entity descriptors; we send exactly one
    sample_set_entity = {"entityType": "sample_set", "entityName": sample_set_id}
    return firecloud_api.delete_entities(namespace, workspace, [sample_set_entity])

In [6]:
def delete_workspace_config(namespace, workspace, cnamespace, config):
    """Delete one configuration from a workspace.

    Thin pass-through to firecloud_api.delete_workspace_config.

    Args:
        namespace: forwarded unchanged to the Firecloud API
        workspace: forwarded unchanged to the Firecloud API
        cnamespace: forwarded unchanged (presumably the configuration's
            namespace -- confirm against the fiss API)
        config: forwarded unchanged (name of the configuration to delete)
    Returns:
        The value returned by the Firecloud API call (HTTP Response)
    """
    return firecloud_api.delete_workspace_config(namespace, workspace, cnamespace, config)

In [7]:
def upload_entities_from_tsv(namespace, workspace, entities_tsv_file):
    """Upload the entities described in a tsv file to a workspace.

    Thin pass-through to firecloud_api.upload_entities_tsv.

    Args:
        namespace: forwarded unchanged to the Firecloud API
        workspace: forwarded unchanged to the Firecloud API
        entities_tsv_file: path to tsv file
    Returns:
        The value returned by the Firecloud API call (HTTP Response)
    """
    return firecloud_api.upload_entities_tsv(
        namespace,
        workspace,
        entities_tsv=entities_tsv_file,
    )

In [8]:
def patients_for_metadata_import(path, tsca_id):
    """Write the participant upload file for one TSCA batch.

    Patients need to exist in Firecloud before their samples can be uploaded.

    Args:
        path: path to file ending in {}.import_samples.txt; it must contain
            an 'individual_id' column
        tsca_id: tsca id; also used as the name of the output directory
    Pending:
        Are we allowed to add age, gender, race?
    Saves:
        ./{tsca_id}/fc_upload_patients_tsca_{tsca_id}.txt:
            tab-separated file with a single 'entity:participant_id' column
            holding the de-duplicated patient ids of the batch
    """
    raw = pd.read_table(path)

    # One row per distinct participant
    participant_ids = raw['individual_id'].drop_duplicates()
    print( "%d Participants in this batch" % participant_ids.shape[0] )

    # Data to upload
    data = participant_ids.to_frame(name='entity:participant_id')

    # Create the directory without going through a shell, so ids containing
    # spaces or shell metacharacters work and failures raise instead of
    # being silently ignored.
    os.makedirs(tsca_id, exist_ok=True)
    filename = './%s/fc_upload_patients_tsca_%s.txt' % (tsca_id, tsca_id)
    # `sep` is passed by keyword: positional use is deprecated in pandas
    data.to_csv(filename, sep='\t', index=False)

In [9]:
def batch_sample_set_for_metadata_import(path, tsca_id):
    """Write the sample_set membership file for one TSCA batch.

    Every sample listed in the batch file becomes a member of a sample set
    named after the batch.

    Args:
        path: path to file ending in {}.import_samples.txt; it must contain
            a 'sample_id' column
        tsca_id: batch tsca id; used as the sample set id and as the name
            of the output directory
    Saves:
        ./{tsca_id}/fc_upload_sample_set_tsca_{tsca_id}.txt:
            tab-separated file with columns 'membership:sample_set_id'
            and 'sample_id'
    """
    raw = pd.read_table(path)
    print( "%d Samples in this batch" % raw.shape[0] )

    # Data to upload: the set id repeated next to every sample id
    data = raw[['sample_id']].copy()
    data.insert(0, 'membership:sample_set_id', tsca_id)

    # Create the directory without going through a shell, so ids containing
    # spaces or shell metacharacters work and failures raise instead of
    # being silently ignored.
    os.makedirs(tsca_id, exist_ok=True)
    filename = './%s/fc_upload_sample_set_tsca_%s.txt' % (tsca_id, tsca_id)
    # `sep` is passed by keyword: positional use is deprecated in pandas
    data.to_csv(filename, sep='\t', index=False)

In [10]:
def batch_samples_for_metadata_import(path, tsca_id, google_bucket_id):
    """Build the Firecloud sample metadata table for one batch.

    This function only builds and returns the table; it does not write
    any file.

    Args:
        path: path to file ending in {}.import_samples.txt; it must contain
            the columns 'sample_id', 'individual_id',
            'clean_bam_file_capture' and 'external_id_validation'
        tsca_id: TSCAXX
        google_bucket_id: id of google bucket ('gs://google_bucket_id')
    Returns:
        pd.DF of data ready for import. Compared with the input file:
            - 'sample_id' / 'individual_id' are renamed to
              'entity:sample_id' / 'participant_id'
            - 'clean_bam_file_capture' is rewritten to point at
              gs://{google_bucket_id}/seq_data/{tsca_id}/{external_id_validation}/{bam file}
            - 'bam_filename', 'bai_filename', 'clean_bai_file_capture' and
              'tsca_id' are added
    """
    # Import raw data
    data = pd.read_table(path)

    # Rename columns to match firecloud requirements
    data = data.rename(columns={'sample_id':'entity:sample_id', 'individual_id':'participant_id'})

    # Locations of BAM files in google bucket
    path_in_bucket_full = "gs://%s/seq_data/%s" % (google_bucket_id, tsca_id)

    # Plain column-wise lists are used instead of DataFrame.apply(axis=1):
    # on a header-only file apply() hands back a DataFrame rather than a
    # Series, which makes the column assignments below fail.

    # Extract bam filename (last component of the original path)
    bam_filenames = [bam_path.split('/')[-1] for bam_path in data['clean_bam_file_capture']]

    # Create bai filename (swap the last three characters, i.e. 'bam', for 'bai')
    bai_filenames = ["%s%s" % (bam_name[:-3], 'bai') for bam_name in bam_filenames]

    # Per-sample directory inside the bucket
    sample_dirs = ["%s/%s" % (path_in_bucket_full, external_id)
                   for external_id in data['external_id_validation']]

    data['bam_filename'] = bam_filenames
    data['bai_filename'] = bai_filenames

    # Change BAM path from xchip to Google cloud
    data['clean_bam_file_capture'] = \
        ["%s/%s" % (sample_dir, bam_name) for sample_dir, bam_name in zip(sample_dirs, bam_filenames)]

    # Add location of .bai file
    data['clean_bai_file_capture'] = \
        ["%s/%s" % (sample_dir, bai_name) for sample_dir, bai_name in zip(sample_dirs, bai_filenames)]

    # Add TSCA ID
    data['tsca_id'] = tsca_id

    return data

In [11]:
def panel_of_normals_for_metadata_import(paths, N, name):
    """Create panel of normals sample set for Firecloud from multiple TSCA batches

    All samples whose 'sample_type' is "Normal" are pooled across the given
    batch files, shuffled, and then truncated to N. Shuffling happens BEFORE
    truncation so the panel can draw from every batch rather than only from
    the first files in `paths`.

    Args:
        paths: (list) paths to file ending in {}.import_samples.txt; each
            file must contain 'sample_id' and 'sample_type' columns
        N: (int) number of samples in panel of normals; -1 (or any negative
            value, or None) means "use every normal found"
        name: (string) name of Panel of Normals
    Saves:
        ./PoNs/fc_upload_PoN_sample_set_tsca_{name}.txt:
            tab-separated file with columns 'membership:sample_set_id'
            and 'sample_id'
    Raises:
        ValueError: if `paths` is empty (raised by pandas.concat)
    """
    df = pd.concat([pd.read_table(path) for path in paths], axis=0, ignore_index=True)
    normals = df.loc[df['sample_type'] == "Normal", 'sample_id']

    # Shuffle samples to pick from all batches
    normals = normals.sample(frac=1).reset_index(drop=True)

    # Truncate only for a real (non-negative) N. Slicing with [:-1] would
    # silently drop the last normal instead of keeping them all.
    if N is not None and N >= 0:
        normals = normals.iloc[:N]

    # Report the size actually written, which may be below N when fewer
    # normals are available
    print ("Creating panel of %d normals" % normals.shape[0])

    data = normals.to_frame()
    data.insert(0, 'membership:sample_set_id', name)

    # Create the directory without going through a shell
    os.makedirs('PoNs', exist_ok=True)
    filename = './PoNs/fc_upload_PoN_sample_set_tsca_%s.txt' % (name)
    # `sep` is passed by keyword: positional use is deprecated in pandas
    data.to_csv(filename, sep='\t', index=False)

In [12]:
def write_df_to_csv(data, tsca_id):
    """Save the samples table of a batch as a tab-separated file.

    Args:
        data: DataFrame to write (the index is not written)
        tsca_id: batch id; the file goes to
            {tsca_id}/fc_upload_samples_tsca_{tsca_id}.txt
    """
    destination = '%s/fc_upload_samples_tsca_%s.txt' % (tsca_id, tsca_id)
    data.to_csv(destination, index=False, sep='\t')

In [13]:
def compile_samples(paths_to_samples_info, google_bucket_id):
    """Stack the Firecloud-ready sample tables of every batch.

    Args:
        - paths_to_samples_info: .xlsx file containing paths to files containing sample_info.
          Its first column is used as the index (read as the batch tsca id)
          and it must have a 'path_to_samples_info' column.
        - google_bucket_id: passed on to batch_samples_for_metadata_import
    Returns:
        - df with samples from all batches
    """
    batch_index = pd.read_excel(paths_to_samples_info, index_col=0)

    # Make each batch Firecloud-compatible, one table per spreadsheet row
    per_batch_tables = [
        batch_samples_for_metadata_import(batch_row.path_to_samples_info, batch_id, google_bucket_id)
        for batch_id, batch_row in batch_index.iterrows()
    ]

    return pd.concat(per_batch_tables, axis=0)

In [14]:
def _pick_one_match(candidates, random_state=None):
    """Return (sample_id, bam_file) of one randomly chosen candidate, or ("NA", "NA") if there is none."""
    if candidates.empty:
        return "NA", "NA"
    chosen = candidates.sample(n=1, random_state=random_state).iloc[0]
    return chosen['entity:sample_id'], chosen['clean_bam_file_capture']


def add_matching_samples(all_samples, batch_samples, random_state=None):
    """Add sample_id and bam filepath of matching normals and primary tumor tissue for every sample

    For each row of batch_samples, the other samples of the same participant
    are looked up in all_samples (same 'participant_id', different
    'entity:sample_id'). Among those:
        - a primary tumor match is a sample whose 'external_id_validation'
          contains one of 'primary', 'prim', 'tissue', 'tiss'
        - a normal match is a sample whose 'sample_type' is "Normal"

    NOTE: If more than one match tumor tissue or match normal found, select one at random.
    The match normal is used to compute allelic fractions in Mutect2, so for now we ignore
    the conditions it was grown in.

    Args:
        - all_samples: df with target samples we want to find matches in.
          Needs the columns 'participant_id', 'entity:sample_id',
          'external_id_validation', 'sample_type', 'clean_bam_file_capture'.
        - batch_samples: df with source samples we want to find matches for.
          Needs 'participant_id' and 'entity:sample_id'. Modified in place.
        - random_state: optional seed forwarded to DataFrame.sample so the
          random choice can be made reproducible (default: not seeded)
    Returns:
        - batch_samples (augmented) with the columns
          'match_primary_tumor_sample_id', 'match_primary_tumor_bam_file',
          'match_normal_sample_id', 'match_normal_bam_file'; the value is
          "NA" where no match exists
    """
    # Start every row at "NA"; rows with a match are overwritten below.
    # This also gives an empty batch the same output columns.
    for column in ('match_primary_tumor_sample_id', 'match_primary_tumor_bam_file',
                   'match_normal_sample_id', 'match_normal_bam_file'):
        batch_samples[column] = "NA"

    for index, row in batch_samples.iterrows():
        # Find all samples from same individual (same individual_id, different sample_id)
        same_patient = all_samples['participant_id'] == row['participant_id']
        other_sample = all_samples['entity:sample_id'] != row['entity:sample_id']
        patient_samples = all_samples[same_patient & other_sample]

        # Tumor tissue: Add primary tumor tissue.
        # astype(str) + na=False keep the mask strictly boolean: a missing
        # external id would otherwise put NaN in the mask and make the
        # boolean indexing raise.
        is_primary_tumor = patient_samples['external_id_validation'].astype(str) \
                                          .str.contains('primary|prim|tissue|tiss', na=False)
        tumor_id, tumor_bam = _pick_one_match(patient_samples[is_primary_tumor], random_state)
        batch_samples.loc[index, 'match_primary_tumor_sample_id'] = tumor_id
        batch_samples.loc[index, 'match_primary_tumor_bam_file'] = tumor_bam

        # Add match normal
        is_normal = patient_samples['sample_type'] == "Normal"
        normal_id, normal_bam = _pick_one_match(patient_samples[is_normal], random_state)
        batch_samples.loc[index, 'match_normal_sample_id'] = normal_id
        batch_samples.loc[index, 'match_normal_bam_file'] = normal_bam

    return batch_samples

In [15]:
def prepare_all_metadata(tsca_id, path_to_samples_info,
                         paths_file='paths_to_samples_info.xlsx', bucket_id=None):
    """Write every Firecloud upload file needed for one batch.

    Runs, in order: the participant file, the sample_set file, and the
    samples file. The samples file is augmented with matching normal and
    primary tumor samples looked up across all batches listed in
    `paths_file`.

    Args:
        tsca_id: batch tsca id (also the output directory name)
        path_to_samples_info: path to the batch file ending in
            {}.import_samples.txt
        paths_file: .xlsx file listing the sample info files of all batches;
            defaults to 'paths_to_samples_info.xlsx' in the working directory
        bucket_id: id of the google bucket; when None (default) the
            notebook-level `google_bucket_id` variable is used
    """
    if bucket_id is None:
        # Fall back on the notebook-wide setting, as the original code did
        bucket_id = google_bucket_id

    patients_for_metadata_import(path_to_samples_info, tsca_id)
    batch_sample_set_for_metadata_import(path_to_samples_info, tsca_id)
    batch_samples = batch_samples_for_metadata_import(path_to_samples_info, tsca_id, bucket_id)
    all_samples = compile_samples(paths_file, bucket_id)
    batch_samples_with_normal = add_matching_samples(all_samples, batch_samples)
    write_df_to_csv(batch_samples_with_normal, tsca_id)

In [16]:
def export_metadata(tsca_id):
    """Upload the prepared metadata files of a batch to Firecloud.

    Reads the notebook-level `namespace` and `workspace` variables. The
    responses of the individual uploads are not returned.

    Args:
        - tsca_id: batch whose files under {tsca_id}/ are uploaded
    """
    batch = (tsca_id, tsca_id)

    # Upload order: participants, then samples, then the sample set
    upload_queue = [
        "%s/fc_upload_patients_tsca_%s.txt" % batch,
        "%s/fc_upload_samples_tsca_%s.txt" % batch,
        "%s/fc_upload_sample_set_tsca_%s.txt" % batch,
    ]
    # Batch PoN file; only the disabled upload below refers to it
    pon_metadata = "PoNs/fc_upload_PoN_sample_set_tsca_%s_PoN.txt" %(tsca_id)

    # Upload metadata
    for metadata_file in upload_queue:
        upload_entities_from_tsv(namespace, workspace, metadata_file)
    # upload_entities_from_tsv(namespace, workspace, pon_metadata)

In [17]:
### Create Cumulative PoN
def create_cumulative_pon(paths_to_samples_info, num_normals, pon_name):
    """Build a panel of normals across batches and upload it as a sample set.

    Reads the notebook-level `namespace` and `workspace` variables.

    Args:
        - paths_to_samples_info: table with a 'path_to_samples_info' column
          listing the sample info file of every batch to draw from
        - num_normals: number of normals, passed to
          panel_of_normals_for_metadata_import
        - pon_name: name of the panel; also determines the upload file name
    Returns:
        - the value returned by upload_entities_from_tsv
    """
    sample_info_files = paths_to_samples_info['path_to_samples_info'].tolist()

    # Writes the PoN membership file that is uploaded just below
    panel_of_normals_for_metadata_import(sample_info_files, num_normals, pon_name)

    pon_file = 'PoNs/fc_upload_PoN_sample_set_tsca_%s.txt'%pon_name
    return upload_entities_from_tsv(namespace, workspace, pon_file)

In [20]:
# Notebook-wide settings. Functions defined in this notebook read these as
# globals, so this cell has to run before they are called.
# Firecloud namespace / workspace used by export_metadata and create_cumulative_pon
namespace = "nci-mimoun-bi-org"
workspace = "CCLF_TSCA"
# Bucket id used by prepare_all_metadata to build gs:// paths
google_bucket_id = "fc-35446f22-ea37-483a-bd6c-5e9fc56851ff"
# Table of batches; create_cumulative_pon reads its 'path_to_samples_info' column
paths_to_samples_info = pd.read_excel('paths_to_samples_info.xlsx')

In [19]:
# Look up the sample info path stored at row label 7.
# NOTE: needs the cell that defines paths_to_samples_info to have run first;
# the NameError recorded below comes from running this cell before it.
paths_to_samples_info.loc[7, 'path_to_samples_info']

NameError: name 'paths_to_samples_info' is not defined

### Prepare metadata

In [57]:
# Generate the upload files for batch TSCA20 from its import_samples.txt file
prepare_all_metadata('TSCA20', '/xchip/clf/seq_data/processed_for_fh/tsca20_201707_SN0125362/tsca20_201707_SN0125362.import_samples.txt')

59 Participants in this batch
95 Samples in this batch


In [64]:
# ### Create Batch PoN
# for index, value in paths_to_samples_info.iterrows():
#     paths = [value.path_to_samples_info]
#     pon_id = "%s_PoN" % value.tsca_id
#     panel_of_normals_for_metadata_import(paths, -1, pon_id)

In [47]:
# export_metadata('TSCA1213')
# for index, value in paths_to_samples_info.iterrows():
#     export_metadata(value.tsca_id)

In [76]:
# for tsca_id in paths_to_samples_info['tsca_id'].tolist():
#     delete_sample_set(namespace, workspace, "%s_PoN" %tsca_id)

In [83]:
# create_cumulative_pon(paths_to_samples_info, 40, 'CumPon40')

Creating panel of 40 normals


<Response [200]>

In [21]:
# for i, v in paths_to_samples_info.iterrows():
#     cmd = \
#     """
#     #!/bin/bash -l
#     #$ -N %s
#     #$ -j y

#     . /broad/software/scripts/useuse
#     reuse Google-Cloud-SDK
#     gsutil -m cp -r \
#     %s/* \
#     gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/seq_data/%s/
#     """ % (v.tsca_id, v.path_to_samples_info, v.tsca_id)
#     print (cmd)

In [22]:
# Load the raw import_samples.txt table of the tsca19 batch for inspection
tsca19 = pd.read_table('/xchip/clf/seq_data/processed_for_fh/tsca19_201706_SN0122601/tsca19_201706_SN0122601.import_samples.txt')