## Firecloud: Uploading Metadata to Firecloud
Mimoun Cadosch 7/17

In [1]:
from firecloud import api as firecloud_api
import pandas as pd
import os, sys
import datetime
from IPython.display import display

In [4]:
## Resources
# https://github.com/broadinstitute/fiss/blob/master/firecloud/api.py
# https://github.com/broadinstitute/firecloud-tools

In [5]:
# res = firecloud_api.delete_workspace_config(namespace, workspace, "tsca", "Mutect2TumorOnly")
# res = firecloud_api.delete_workspace_config(namespace, workspace, 'tsca', 'CreatePanelOfNormalsGATK')
# delete_sample(namespace, workspace, "AA66-Tumor-SM-F29RQ")
# res = delete_sample_set(namespace, workspace, sample_set_id='CumPoN_1920')
# firecloud_api.delete_repository_method('tsca', 'CallSomaticCNV', 10)

In [32]:
# FireCloud namespace and workspace passed to the firecloud_api calls in this notebook
namespace = "nci-mimoun-bi-org"
workspace = "CCLF_TSCA"
# Bucket id; used to build the gs://<bucket>/seq_data/<tsca_id> locations of BAM/BAI files
google_bucket_id = "fc-35446f22-ea37-483a-bd6c-5e9fc56851ff"
# Spreadsheet read by compile_samples(); it lists the path to each batch's sample-info file
path_to_all_samples_info = "paths_to_samples_info.xlsx"
# batches_info = pd.read_excel('paths_to_samples_info.xlsx')
# Must be True: records whether /xchip/clf/ is currently a mount point
filesystem_mounted = os.path.ismount('/xchip/clf/')

In [15]:
def delete_sample(namespace, workspace, sample_id):
    """Remove a single sample entity from a workspace.

    Args:
        namespace: namespace the workspace belongs to
        workspace: workspace name
        sample_id: name of the sample entity to delete
    Returns:
        HTTP Response returned by firecloud_api.delete_entities
    """
    sample_entity = {"entityType": "sample", "entityName": sample_id}
    return firecloud_api.delete_entities(namespace, workspace, [sample_entity])

In [16]:
def delete_sample_set(namespace, workspace, sample_set_id):
    """Remove a single sample_set entity from a workspace.

    Args:
        namespace: namespace the workspace belongs to
        workspace: workspace name
        sample_set_id: name of the sample_set entity to delete
    Returns:
        HTTP Response returned by firecloud_api.delete_entities
    """
    sample_set_entity = {"entityType": "sample_set", "entityName": sample_set_id}
    return firecloud_api.delete_entities(namespace, workspace, [sample_set_entity])

In [17]:
def delete_workspace_config(namespace, workspace, cnamespace, config):
    """Remove a method configuration from a workspace.

    Thin pass-through to firecloud_api.delete_workspace_config.

    Args:
        namespace: namespace the workspace belongs to
        workspace: workspace name
        cnamespace: namespace of the configuration
        config: name of the configuration
    Returns:
        HTTP Response
    """
    return firecloud_api.delete_workspace_config(namespace, workspace, cnamespace, config)

In [18]:
def delete_entity_attributes(namespace, workspace, entity_type, entity_name, attrs):
    """Remove the named attributes from one entity.

    Args:
        namespace: namespace the workspace belongs to
        workspace: workspace name
        entity_type: type of the entity (e.g. "sample_set")
        entity_name: name of the entity
        attrs: list of attribute names to delete
    Returns:
        HTTP Response returned by firecloud_api.update_entity
    """
    # One "RemoveAttribute" operation per attribute name
    removal_ops = [
        {"op": "RemoveAttribute", "attributeName": attribute_name}
        for attribute_name in attrs
    ]
    return firecloud_api.update_entity(namespace, workspace, entity_type, entity_name, removal_ops)

In [19]:
def upload_entities_from_tsv(namespace, workspace, entities_tsv_file):
    """Upload the entities described in a TSV file to a workspace.

    Args:
        namespace: namespace the workspace belongs to
        workspace: workspace name
        entities_tsv_file: path to tsv file
    Returns:
        HTTP Response returned by firecloud_api.upload_entities_tsv
    """
    return firecloud_api.upload_entities_tsv(
        namespace,
        workspace,
        entities_tsv=entities_tsv_file,
    )

In [20]:
def patients_for_metadata_import(path, tsca_id):
    """Write the participant upload file for one batch.

    Patients need to exist in Firecloud before their samples can be uploaded.

    Args:
        path: path to the tab-separated file ending in {}.import_samples.txt;
            it must contain an `individual_id` column
        tsca_id: tsca id of the batch; used as output directory and in the file name
    Pending:
        Are we allowed to add age, gender, race?
    Saves:
        ./{tsca_id}/fc_upload_patients_tsca_{tsca_id}.txt:
            tab-separated file with a single `entity:participant_id` column
            holding the unique patient ids in the batch
    """
    raw = pd.read_table(path)
    participants = raw['individual_id'].drop_duplicates()
    print("%d Participants in this batch" % participants.shape[0])

    # Data to upload
    data = participants.to_frame(name='entity:participant_id')

    # os.makedirs instead of shelling out to `mkdir -p`: portable, and safe for
    # ids containing spaces or shell metacharacters
    os.makedirs(str(tsca_id), exist_ok=True)
    filename = './%s/fc_upload_patients_tsca_%s.txt' % (tsca_id, tsca_id)
    # `sep` is passed by keyword; positional use is deprecated in recent pandas
    data.to_csv(filename, sep='\t', index=False)

In [21]:
def batch_sample_set_for_metadata_import(path, tsca_id):
    """Write the sample_set membership upload file for one batch.

    Every sample of the batch becomes a member of a sample set named after the batch.

    Args:
        path: path to the tab-separated file ending in {}.import_samples.txt;
            it must contain a `sample_id` column
        tsca_id: batch tsca id; used as sample set id, output directory and in the file name
    Saves:
        ./{tsca_id}/fc_upload_sample_set_tsca_{tsca_id}.txt:
            tab-separated file with columns `membership:sample_set_id` and `sample_id`
    """
    raw = pd.read_table(path)
    print("%d Samples in this batch" % raw.shape[0])

    # Data to upload: one membership row per sample, set id first
    data = raw[['sample_id']].copy()
    data.insert(0, 'membership:sample_set_id', tsca_id)

    # os.makedirs instead of shelling out to `mkdir -p`: portable, and safe for
    # ids containing spaces or shell metacharacters
    os.makedirs(str(tsca_id), exist_ok=True)
    filename = './%s/fc_upload_sample_set_tsca_%s.txt' % (tsca_id, tsca_id)
    # `sep` is passed by keyword; positional use is deprecated in recent pandas
    data.to_csv(filename, sep='\t', index=False)

In [22]:
def batch_samples_for_metadata_import(path, tsca_id, google_bucket_id):
    """Prepare the samples metadata of one batch for import into Firecloud.

    Nothing is written to disk here; the returned frame is saved separately.

    Args:
        path: path to the tab-separated file ending in {}.import_samples.txt; it must
            contain `clean_bam_file_capture` and `external_id_validation` columns
        tsca_id: TSCAXX
        google_bucket_id: id of google bucket ('gs://google_bucket_id')
    Returns:
        pd.DF of data ready for import: `sample_id`/`individual_id` renamed to
        `entity:sample_id`/`participant_id`, `clean_bam_file_capture` rewritten to the
        bucket location, plus new columns `bam_filename`, `bai_filename`,
        `clean_bai_file_capture` and `tsca_id`
    """
    # Import raw data
    data = pd.read_table(path)

    # Rename columns to match firecloud requirements
    data = data.rename(columns={'sample_id': 'entity:sample_id', 'individual_id': 'participant_id'})

    # Locations of BAM files in google bucket
    path_in_bucket_full = "gs://%s/seq_data/%s" % (google_bucket_id, tsca_id)

    # Plain lists instead of DataFrame.apply(axis=1): apply returns a DataFrame
    # (not a Series) for an empty batch, which cannot be assigned to one column.

    # Extract bam filename (last path component)
    bam_filenames = [bam_path.split('/')[-1] for bam_path in data['clean_bam_file_capture']]

    # Create bai filename (swap the three-letter extension of the .bam file)
    bai_filenames = ["%s%s" % (bam_name[:-3], 'bai') for bam_name in bam_filenames]

    # Each sample lives in a bucket sub-directory named after its external id
    sample_dirs = ["%s/%s" % (path_in_bucket_full, external_id)
                   for external_id in data['external_id_validation']]

    data['bam_filename'] = bam_filenames
    data['bai_filename'] = bai_filenames

    # Change BAM path from xchip to Google cloud
    data['clean_bam_file_capture'] = [
        "%s/%s" % (sample_dir, bam_name)
        for sample_dir, bam_name in zip(sample_dirs, bam_filenames)
    ]

    # Add location of .bai file
    data['clean_bai_file_capture'] = [
        "%s/%s" % (sample_dir, bai_name)
        for sample_dir, bai_name in zip(sample_dirs, bai_filenames)
    ]

    # Add TSCA ID
    data['tsca_id'] = tsca_id

    return data

In [23]:
def panel_of_normals_for_metadata_import(paths, N, name):
    """Create panel of normals sample set for Firecloud from multiple TSCA batches.

    Args:
        paths: (list) paths to tab-separated files ending in {}.import_samples.txt;
            each must contain `sample_type` and `sample_id` columns
        N: (int) number of samples in panel of normals; -1 selects every normal
        name: (string) name of Panel of Normals
    Saves:
        ./PoNs/fc_upload_PoN_sample_set_tsca_{name}.txt:
            tab-separated file with columns `membership:sample_set_id` and `sample_id`
    """
    df = pd.concat([pd.read_table(path) for path in paths], axis=0)

    # Shuffle samples to pick from all batches
    df = df.sample(frac=1).reset_index(drop=True)
    normals = df.loc[df['sample_type'] == "Normal", 'sample_id']

    # -1 means "use all normals"; slicing with [:-1] would silently drop the last one
    if N != -1:
        normals = normals.iloc[:N]

    # Report the number actually selected (may be fewer than N if not enough normals)
    print("Creating panel of %d normals" % normals.shape[0])

    data = normals.to_frame()
    data.insert(0, 'membership:sample_set_id', name)

    # os.makedirs instead of shelling out to `mkdir -p`
    os.makedirs('PoNs', exist_ok=True)
    filename = './PoNs/fc_upload_PoN_sample_set_tsca_%s.txt' % (name)
    # `sep` is passed by keyword; positional use is deprecated in recent pandas
    data.to_csv(filename, sep='\t', index=False)

In [24]:
def write_df_to_csv(data, tsca_id):
    """Save `data` as {tsca_id}/fc_upload_samples_tsca_{tsca_id}.txt.

    The file is tab-separated and written without the index column. The
    {tsca_id} directory is not created here, so it has to exist already.
    """
    destination = '%s/fc_upload_samples_tsca_%s.txt' % (tsca_id, tsca_id)
    data.to_csv(destination, index=False, sep='\t')

In [25]:
def compile_samples(paths_to_samples_info, google_bucket_id):
    """Gather the Firecloud-ready samples of every batch into one table.

    Args:
        - paths_to_samples_info: .xlsx file whose first column is the batch id (used as
          index) and which has a `path_to_samples_info` column pointing at each batch's
          sample-info file
        - google_bucket_id: id of the google bucket holding the BAM files
    Returns:
        - df with samples from all batches (row-wise concatenation, one block per batch)
    """
    batch_table = pd.read_excel(paths_to_samples_info, index_col=0)

    # One Firecloud-compatible frame per spreadsheet row; the row label is the batch id
    per_batch_frames = [
        batch_samples_for_metadata_import(batch_row.path_to_samples_info, batch_id, google_bucket_id)
        for batch_id, batch_row in batch_table.iterrows()
    ]

    return pd.concat(per_batch_frames, axis=0)

In [26]:
def _pick_random_match(candidates):
    """Return (sample_id, bam_path) of one randomly chosen row, or ("NA", "NA") if none."""
    if candidates.empty:
        return "NA", "NA"
    chosen = candidates.sample(n=1)
    return chosen['entity:sample_id'].item(), chosen['clean_bam_file_capture'].item()


def add_matching_samples(all_samples, batch_samples):
    """Add sample_id and bam filepath of matching normals and primary tumor tissue for every sample.

    Args:
        - all_samples: df with target samples we want to find matches in
        - batch_samples: df with source samples we want to find matches for
          (modified in place)
    Returns:
        - batch_samples (augmented) with four extra columns:
          match_primary_tumor_sample_id, match_primary_tumor_bam_file,
          match_normal_sample_id, match_normal_bam_file ("NA" when no match exists)
    """
    primary_ids, primary_bams = [], []
    normal_ids, normal_bams = [], []

    for _, row in batch_samples.iterrows():
        # Find all samples from same individual (same individual_id, different sample_id)
        same_participant = all_samples['participant_id'] == row['participant_id']
        other_sample = all_samples['entity:sample_id'] != row['entity:sample_id']
        patient_samples = all_samples[same_participant & other_sample]

        # NOTE: If more than one match tumor tissue or match normal found, select one at random.
        # The match normal is used to compute allelic fractions in Mutect2, so for now we ignore
        # the conditions it was grown in.

        # Tumor tissue: primary tumor tissue is recognised from the external id.
        # na=False keeps samples with a missing external id from breaking the boolean mask.
        is_primary = patient_samples['external_id_validation'] \
            .str.contains('primary|prim|tissue|tiss', na=False)
        primary_id, primary_bam = _pick_random_match(patient_samples[is_primary])
        primary_ids.append(primary_id)
        primary_bams.append(primary_bam)

        # Match normal
        is_normal = patient_samples['sample_type'] == "Normal"
        normal_id, normal_bam = _pick_random_match(patient_samples[is_normal])
        normal_ids.append(normal_id)
        normal_bams.append(normal_bam)

    # Assign whole columns at once so the result has the same schema even for an empty batch
    batch_samples['match_primary_tumor_sample_id'] = primary_ids
    batch_samples['match_primary_tumor_bam_file'] = primary_bams
    batch_samples['match_normal_sample_id'] = normal_ids
    batch_samples['match_normal_bam_file'] = normal_bams

    return batch_samples

In [27]:
def prepare_all_metadata(tsca_id, path_to_batch_samples_info, path_to_all_samples_info):
    """Write every upload file Firecloud needs for one batch.

    Produces the participant, sample-set and samples files under ./{tsca_id}/.
    The bucket id is taken from the module-level `google_bucket_id`.

    Args:
        - tsca_id: id of the batch
        - path_to_batch_samples_info: path to info on batch samples
        - path_to_all_samples_info: path to info on all samples
    """
    # Participant and sample-set files are written directly by their helpers
    for write_upload_file in (patients_for_metadata_import, batch_sample_set_for_metadata_import):
        write_upload_file(path_to_batch_samples_info, tsca_id)

    this_batch = batch_samples_for_metadata_import(path_to_batch_samples_info, tsca_id, google_bucket_id)
    # Samples of all batches are the pool in which match normals / primaries are searched
    every_batch = compile_samples(path_to_all_samples_info, google_bucket_id)
    write_df_to_csv(add_matching_samples(every_batch, this_batch), tsca_id)

In [28]:
def export_metadata(tsca_id):
    """Upload a batch's previously written metadata files to Firecloud.

    Uses the module-level `namespace` and `workspace`.

    Args:
        - tsca_id: id of the batch whose files under {tsca_id}/ are uploaded
    Returns:
        tuple of three upload responses, in upload order:
        (patients, samples, sample set)
    """
    id_pair = (tsca_id, tsca_id)
    patients_tsv = "%s/fc_upload_patients_tsca_%s.txt" % id_pair
    sample_set_tsv = "%s/fc_upload_sample_set_tsca_%s.txt" % id_pair
    samples_tsv = "%s/fc_upload_samples_tsca_%s.txt" % id_pair
    # PoN file path is built but its upload is currently disabled
    pon_tsv = "PoNs/fc_upload_PoN_sample_set_tsca_%s_PoN.txt" %(tsca_id)

    # Order matters: participants first, then samples, then the set that references them
    upload_order = (patients_tsv, samples_tsv, sample_set_tsv)
    return tuple([upload_entities_from_tsv(namespace, workspace, tsv) for tsv in upload_order])

In [29]:
def create_cumulative_pon(paths, num_normals, pon_name, export_to_fc=False):
    """Build a cumulative Panel of Normals sample set and optionally upload it.

    Args:
        - paths: list of paths to batch info files
        - num_normals: number of normals to put in the panel
        - pon_name: name of the panel; also used in the output file name
        - export_to_fc: export to firecloud
    Returns:
        the upload response when export_to_fc is true, otherwise an empty dict
    """
    panel_of_normals_for_metadata_import(paths, num_normals, pon_name)
    if not export_to_fc:
        return {}
    pon_tsv = 'PoNs/fc_upload_PoN_sample_set_tsca_%s.txt'%pon_name
    return upload_entities_from_tsv(namespace, workspace, pon_tsv)

In [30]:
def get_method_config_version(namespace, workspace, method_namespace, method_config_name):
    """Return the method version referenced by a workspace method configuration.

    Fetches the configuration with firecloud_api.get_workspace_config and reads
    `methodRepoMethod.methodVersion` from its JSON body.
    """
    config = firecloud_api.get_workspace_config(
        namespace, workspace, method_namespace, method_config_name
    ).json()
    return config['methodRepoMethod']['methodVersion']

In [31]:
def update_method_config_versions():
    """Refresh the snapshot number of every tracked method configuration.

    Reads method_configs/latest_method_configs.txt, looks up the current method
    version of each entry in its `method` column (configuration namespace 'tsca'),
    stores it in a `snapshot` column, and writes the table both to a timestamped
    file and back to the latest file.

    Returns:
        the updated table
    """
    def current_snapshot(config_name):
        return get_method_config_version(namespace, workspace, 'tsca', config_name)

    method_configs = pd.read_table('method_configs/latest_method_configs.txt')
    method_configs['snapshot'] = method_configs['method'].apply(current_snapshot)

    timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H:%M")
    destinations = (
        'method_configs/%s_method_configs.txt'%timestamp,
        'method_configs/latest_method_configs.txt',
    )
    for destination in destinations:
        method_configs.to_csv(destination, sep="\t", index=False)
    return method_configs

In [25]:
def update_wdls():
    """Update WDL scripts in wdls/production directory, to the ones currently being used in Firecloud.

    Refreshes the snapshot numbers first, then fetches each method listed in
    method_configs/latest_method_configs.txt (namespace 'tsca') and overwrites
    ../wdls/production/{method}.wdl with the returned payload. Methods whose
    request does not return HTTP 200 are reported and their file is left untouched.
    """
    update_method_config_versions()
    method_configs = pd.read_table('method_configs/latest_method_configs.txt')
    for _, method in method_configs.iterrows():
        method_name = method['method']
        snapshot = method['snapshot']
        res = firecloud_api.get_repository_method('tsca', method_name, snapshot)
        print("Updating WDL for %s:%s"%(method_name, snapshot))
        if res.status_code != 200:
            print("Could not fetch WDL for %s:%s (HTTP %s)" % (method_name, snapshot, res.status_code))
            continue
        # Decode the payload before opening the file, so a malformed response
        # cannot truncate the existing WDL
        payload = res.json()['payload']
        with open("../wdls/production/%s.wdl"%method_name, "w") as wdl_file:
            wdl_file.write(payload)

In [24]:
# attrs = ["cnv_calls_reduced_segment_mean_img", "cnv_calls_reduced_segment_mean_raw"]
# res = delete_entity_attributes(namespace, workspace, "sample_set", "TSCA19", attrs)

#### Common calls

In [None]:
def update_batch_metadata(tsca_id, path_to_batch_samples_info, path_to_all_samples_info):
    """Upload to Firecloud all the metadata necessary to run TSCA pipeline on new batch.

    Args:
        - tsca_id: id of the batch
        - path_to_batch_samples_info: path to the batch's sample info; ends in *.import_samples.txt
        - path_to_all_samples_info: path to info on all samples
    Returns:
        the responses returned by export_metadata for this batch
    """
    # Prepare all metadata for batch
    prepare_all_metadata(tsca_id, path_to_batch_samples_info, path_to_all_samples_info)
    # Previously passed the undefined name `batch_id`, which raised NameError
    return export_metadata(tsca_id)
    

In [40]:
def main():
    """Interactive entry point: confirm the batch spreadsheet is up to date, then load it.

    Only an answer starting with "y"/"Y" continues; anything else (including "n",
    "no" or an empty answer) stops before the spreadsheet is read.
    """
    print ("Please add batch_id and path_to_batch_info to paths_to_samples_info.xlsx file")
    s = input('Have you updated the paths_to_samples_info.xlsx file? (Y/N)')
    # Previously only the exact string "N" aborted, so "n" or a typo went ahead
    if not s.strip().lower().startswith("y"):
        print("Please do so before proceeding...")
        return
    path_to_all_samples_info = "paths_to_samples_info.xlsx"
    batches_info = pd.read_excel(path_to_all_samples_info)
    # NOTE: per-batch upload is currently disabled
    #     for idx, batch in batches_info.iterrows():
    #         update_batch_metadata(batch.tscaid, batch.path_to_samples_info, path_to_all_samples_info)

In [39]:
# main()

Please add batch_id and path_to_batch_info to paths_to_samples_info.xlsx file
Have you updated the paths_to_samples_info.xlsx file? (Y/N)N


In [22]:
### Prepare all metadata for batch
# prepare_all_metadata('TSCA20', '/xchip/clf/seq_data/processed_for_fh/tsca20_201707_SN0125362/tsca20_201707_SN0125362.import_samples.txt')

In [23]:
### Create PoN for all batches
# for index, value in paths_to_samples_info.iterrows():
#     paths = [value.path_to_samples_info]
#     pon_id = "%s_PoN" % value.tsca_id
#     panel_of_normals_for_metadata_import(paths, -1, pon_id)

In [40]:
### Export metadata for batch
# r1, r2, r3 = export_metadata('TSCA14')
# for index, value in paths_to_samples_info.iterrows():
#     export_metadata(value.tsca_id)

In [22]:
### Delete sample sets
# for tsca_id in paths_to_samples_info['tsca_id'].tolist():
#     delete_sample_set(namespace, workspace, "%s_PoN" %tsca_id)
# delete_sample_set(namespace, workspace, "CumPon40")

In [29]:
### Deleting a sample
# res = delete_sample_set(namespace, workspace, "TSCA14")
# res = delete_sample(namespace, workspace, 'DW039-Tumor-SM-DB2IF')
### NOTE: Delete sample (manually) from samples fc_upload file AND from sample_set membership fc_upload file

In [None]:
### Create cumulative PoN
# res = create_cumulative_pon(batches_info.path_to_samples_info.tolist(), 5, 'CumPon5', True)

In [31]:
# all_samples = compile_samples('paths_to_samples_info.xlsx', google_bucket_id)

In [37]:
# pon = pd.read_table('PoNs/fc_upload_PoN_sample_set_tsca_CumPon40.txt')

In [100]:
# pon.merge(all_samples, left_on='sample_id', right_on='entity:sample_id')[['sample_id', 'tsca_id']]

In [28]:
# update_method_config_versions()

In [55]:
def download_remote_samples(filename):
    """Download the workspace's "sample" entities TSV and save it to `filename`.

    Uses the module-level `namespace` and `workspace`. The response body is
    written in 1024-byte chunks; empty chunks are skipped.
    """
    response = firecloud_api.get_entities_tsv(namespace, workspace, "sample")
    with open(filename, 'wb') as destination:
        destination.writelines(
            piece for piece in response.iter_content(chunk_size=1024) if piece
        )

In [47]:
def update_samples_with_oncotated_mafs():
    """Attach the remote `oncotated_maf` column to the locally compiled samples.

    Downloads the workspace's sample entities to remote_samples.tsv, compiles the
    local samples of all batches, and inner-joins the two on `entity:sample_id`,
    so only samples present on both sides are kept.

    Returns:
        the local samples table with an added `oncotated_maf` column
    """
    remote_tsv = "remote_samples.tsv"
    # The original referenced download_remote_samples without calling it and
    # repeated its body inline; call the helper instead
    download_remote_samples(remote_tsv)
    remote_samples = pd.read_table(remote_tsv)
    local_samples = compile_samples(path_to_all_samples_info, google_bucket_id)
    updated_local_samples = pd.merge(
        local_samples,
        remote_samples[['entity:sample_id', 'oncotated_maf']],
        on='entity:sample_id',
        how='inner',
    )
    # Return the result; it was previously computed and then discarded
    return updated_local_samples

In [59]:
# Debug cell: print every row of local_samples. print() returns None, so the
# value of apply() itself is a Series of None (shown at the end of the output).
local_samples.apply(lambda row: print(row), axis=1)

entity:sample_id                                                    AA33-Normal-SM-D4L4D
participant_id                                                                      AA33
clean_bam_file_capture                 gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...
external_id_validation                                                       AA33N_3D_p5
aggregation_product_name_validation                 TSCA Rapid Cancer Detection Panel v2
bsp_sample_id_validation                                                        SM-D59KF
stock_sample_id_validation                                                      SM-D4L4D
sample_type                                                                       Normal
picard_aggregation_type_validation                                                   PCR
processed_subtype_validation                                         Cells:Pellet frozen
source_subtype_validation                                            Tissue:Fresh Tissue
squid_sample_id_valid

entity:sample_id                                                  RCRF003-Tumor-SM-DB2KI
participant_id                                                                   RCRF003
clean_bam_file_capture                 gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...
external_id_validation                                                  RCRF003T_primary
aggregation_product_name_validation                 TSCA Rapid Cancer Detection Panel v2
bsp_sample_id_validation                                                        SM-DB9JJ
stock_sample_id_validation                                                      SM-DB2KI
sample_type                                                                        Tumor
picard_aggregation_type_validation                                                   PCR
processed_subtype_validation                             Tissue:Tissue Lysate/Homogenate
source_subtype_validation                                            Tissue:Fresh Tissue
squid_sample_id_valid

entity:sample_id                                              CCLF_KL1022-Tumor-SM-E7RZK
participant_id                                                               CCLF_KL1022
clean_bam_file_capture                 gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...
external_id_validation                                              CCLF_KL1022T_primary
aggregation_product_name_validation                 TSCA Rapid Cancer Detection Panel v2
bsp_sample_id_validation                                                        SM-E7RZK
stock_sample_id_validation                                                      SM-E77AB
sample_type                                                                        Tumor
picard_aggregation_type_validation                                                   PCR
processed_subtype_validation                                             DNA:DNA Somatic
source_subtype_validation                                            Tissue:Fresh Tissue
squid_sample_id_valid

entity:sample_id                                            CCLF_PEDS1031-Tumor-SM-F67DV
participant_id                                                             CCLF_PEDS1031
clean_bam_file_capture                 gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...
external_id_validation                                              CCLF_PEDS1031T_CM_p8
aggregation_product_name_validation                 TSCA Rapid Cancer Detection Panel v2
bsp_sample_id_validation                                                        SM-F67DV
stock_sample_id_validation                                                      SM-F64NP
sample_type                                                                        Tumor
picard_aggregation_type_validation                                                   PCR
processed_subtype_validation                                             DNA:DNA Somatic
source_subtype_validation                                        Cells:Cell Line, Viable
squid_sample_id_valid

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
      ... 
65    None
66    None
67    None
68    None
69    None
70    None
71    None
72    None
73    None
74    None
75    None
76    None
77    None
78    None
79    None
80    None
81    None
82    None
83    None
84    None
85    None
86    None
87    None
88    None
89    None
90    None
91    None
92    None
93    None
94    None
Length: 950, dtype: object

In [60]:
# Inspect the column names of the compiled local samples table
local_samples.columns

Index(['entity:sample_id', 'participant_id', 'clean_bam_file_capture',
       'external_id_validation', 'aggregation_product_name_validation',
       'bsp_sample_id_validation', 'stock_sample_id_validation', 'sample_type',
       'picard_aggregation_type_validation', 'processed_subtype_validation',
       'source_subtype_validation', 'squid_sample_id_validation',
       'tumor_subtype', 'short_letter_code', 'bam_filename', 'bai_filename',
       'clean_bai_file_capture', 'tsca_id'],
      dtype='object')

In [14]:
# Show only the remote samples that already have an oncotated_maf value
remote_samples[ pd.notnull(remote_samples.oncotated_maf) ]

Unnamed: 0,entity:sample_id,aggregation_product_name_validation,bai_filename,bam_filename,bsp_sample_id_validation,clean_bai_file_capture,clean_bam_file_capture,external_id_validation,match_normal_bam_file,match_normal_sample_id,...,__gnomad_vcf_index,__merged_vcfs_index,__mutect2_vcf_index,__scattered_intervals,__partial_mutect2_vcf,__partial_mutect2_vcf_index,filtered_variants,output_directory,clear_snvs,__vcf2_table_unfiltered_variants
32,AA66-Tumor-SM-F29RQ,TSCA Rapid Cancer Detection Panel v2,2_AA66T_OPAC_p4_HKWLGBCXY.2.aligned.duplicates...,2_AA66T_OPAC_p4_HKWLGBCXY.2.aligned.duplicates...,SM-F29RQ,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...,AA66T_OPAC_p4,,,...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/9...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/9...,"[""gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff...","[""gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff...",,,,,,
38,AB053-Tumor-SM-F29RR,TSCA Rapid Cancer Detection Panel v2,2_AB053T_OPAC_p8_2D_HKWLGBCXY.2.aligned.duplic...,2_AB053T_OPAC_p8_2D_HKWLGBCXY.2.aligned.duplic...,SM-F29RR,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...,AB053T_OPAC_p8_2D,,,...,,,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/0...,"[""gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff...","[""gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff...","[""gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff...",,,,
250,CCLF_PEDS1023-Normal-SM-F29S6,TSCA Rapid Cancer Detection Panel v2,2_CCLF_PEDS1023N_CM_p7_HKWLGBCXY.2.aligned.dup...,2_CCLF_PEDS1023N_CM_p7_HKWLGBCXY.2.aligned.dup...,SM-F29S6,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...,CCLF_PEDS1023N_CM_p7,,,...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/8...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/8...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/8...,"[""gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff...","[""gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff...","[""gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff...",gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/d...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...,,
251,CCLF_PEDS1023-Tumor-SM-F29S7,TSCA Rapid Cancer Detection Panel v2,2_CCLF_PEDS1023T_RETM_p8_Hypoxia_HKWLGBCXY.2.a...,2_CCLF_PEDS1023T_RETM_p8_Hypoxia_HKWLGBCXY.2.a...,SM-F29S7,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...,CCLF_PEDS1023T_RETM_p8_Hypoxia,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/s...,CCLF_PEDS1023-Normal-SM-F29S6,...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/8...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/8...,gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff/8...,"[""gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff...","[""gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff...","[""gs://fc-35446f22-ea37-483a-bd6c-5e9fc56851ff...",,,,


In [34]:
# Build the local samples table from every batch listed in the samples-info spreadsheet
local_samples = compile_samples(path_to_all_samples_info, google_bucket_id)

In [38]:
# Inspect the column names of the samples table downloaded from the workspace
remote_samples.columns

Index(['entity:sample_id', 'aggregation_product_name_validation',
       'bai_filename', 'bam_filename', 'bsp_sample_id_validation',
       'clean_bai_file_capture', 'clean_bam_file_capture',
       'external_id_validation', 'match_normal_bam_file',
       'match_normal_sample_id', 'match_primary_tumor_bam_file',
       'match_primary_tumor_sample_id', 'participant',
       'picard_aggregation_type_validation', 'processed_subtype_validation',
       'renamed_bam_file', 'sample_type', 'short_letter_code',
       'source_subtype_validation', 'squid_sample_id_validation',
       'stock_sample_id_validation', 'target_coverage', 'tsca_id',
       'tumor_subtype', 'cnv_calls', 'tumor_ptn', 'tumor_seg', 'tumor_tn',
       'annotate_variants_vcf', 'annotate_variants_vcf_index', 'mutect2_vcf',
       'mutect2_vcf_index', 'sample_cum_cov', 'sample_cum_cov_prop',
       'sample_gene_summary', 'sample_interval_statistics',
       'sample_interval_summary', 'sample_statistics', 'sample_summary',
  

In [37]:
# Inspect the local column names again, for comparison with the remote ones
local_samples.columns

Index(['entity:sample_id', 'participant_id', 'clean_bam_file_capture',
       'external_id_validation', 'aggregation_product_name_validation',
       'bsp_sample_id_validation', 'stock_sample_id_validation', 'sample_type',
       'picard_aggregation_type_validation', 'processed_subtype_validation',
       'source_subtype_validation', 'squid_sample_id_validation',
       'tumor_subtype', 'short_letter_code', 'bam_filename', 'bai_filename',
       'clean_bai_file_capture', 'tsca_id'],
      dtype='object')

In [51]:
# Inner-join the local samples with the remote oncotated_maf column on the sample id;
# samples that are missing on either side are dropped.
# NOTE(review): no top-level assignment of remote_samples is visible in this notebook — confirm which cell defines it
udpated_local_samples = pd.merge(local_samples, remote_samples[['entity:sample_id', 'oncotated_maf']], on='entity:sample_id', how='inner')

In [53]:
# Save the merged table as a TSV (no index column); the match_normal_updates/ directory must already exist
udpated_local_samples.to_csv("match_normal_updates/updated_sample_data.tsv", sep="\t", index=None)

In [54]:
# Upload the merged samples TSV to the workspace (the recorded run returned HTTP 200)
res = upload_entities_from_tsv(namespace, workspace, "match_normal_updates/updated_sample_data.tsv")

<Response [200]>