In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from metadata_dict import (hca_keys, dcp_to_tier1_mapping)

Please set `file_name` to the name of the DCP spreadsheet you want to flatten; the file must be located in the `dcp_path` directory.

In [3]:
# Directory that contains the input DCP spreadsheets.
dcp_path = "./dcp_spreadsheets/"
# Directory the flattened metadata CSV is written to by the final cell.
flat_path = "./flat_dcp/"
# NOTE(review): not used in the visible cells; presumably the tier 1 output location - confirm.
tier1_path = "./tier1_output/"
# Name of the spreadsheet to flatten; it is joined onto `dcp_path` when the workbook is read.
file_name = "ImmuneLandscapeccRCC_metadata_30-01-2023.xlsx"

In [4]:
# Load every tab of the workbook into a dict of DataFrames keyed by sheet name
# (`sheet_name=None`). Rows 0, 1, 2 and 4 are skipped, so row 3 is used as the header.
spreadsheet = pd.read_excel(
    os.path.join(dcp_path, file_name),
    sheet_name=None,
    skiprows=[0, 1, 2, 4],
)

In [5]:
# Record, per tab, the biomaterial / protocol id columns that hold at least one value.
# `join_keys` maps tab name -> list of such columns (tabs without any are left out);
# `join_set` is the union of those columns across all tabs.

join_keys = {}
join_set = set()
for sheet_name, sheet_df in spreadsheet.items():
    id_columns = [
        column
        for column in sheet_df.keys()
        if column.endswith(("biomaterial_id", "protocol_id")) and not sheet_df[column].isna().all()
    ]
    if id_columns:
        join_keys[sheet_name] = id_columns
        join_set.update(id_columns)

In [6]:
# Save the join keys whose values contain a double pipe ("||"), i.e. pooled ids,
# so that the metadata of the pooled entities can be merged together for those keys.

keys_with_double_pipe = []
for sheet, sheet_keys in join_keys.items():
    for key in sheet_keys:
        # Cast to str before searching: `any()` over a raw `.str.contains` result treats
        # the NaN produced for missing ids as truthy, which would flag every partially
        # filled column as pooled. The cast also keeps non-text id columns from raising.
        has_double_pipe = spreadsheet[sheet][key].astype(str).str.contains("||", regex=False).any()
        # The same key can appear in several tabs; list it only once.
        if has_double_pipe and key not in keys_with_double_pipe:
            keys_with_double_pipe.append(key)
keys_with_double_pipe

['enrichment_protocol.protocol_core.protocol_id',
 'dissociation_protocol.protocol_core.protocol_id',
 'cell_suspension.biomaterial_core.biomaterial_id']

In [7]:
# Manually order the spreadsheet tabs in the `ordered_sheet` list, in order to trace back the experimental design.
# The most common experimental design is Analysis file -> Cell suspension -> Specimen -> Donor.
# Inspect `join_keys` (tab -> id columns) in a separate cell if the tabs of this dataset link differently.

# default ordering for simple experimental design & protocols
ordered_sheet = [
    # biomaterial chain, from the analysis file back to the donor
    'Analysis file',
    'Cell suspension',
    'Specimen from organism',
    'Donor organism',
    # protocols
    'Analysis protocol',
    'Sequencing protocol',
    'Library preparation protocol',
    'Dissociation protocol',
    'Enrichment protocol',
    'Collection protocol',
]

In [8]:
# Build the backbone (`bc`) of the flat metadata from the 'Analysis file' tab.
# The whole cell is skipped when the workbook has no 'Analysis file' tab; in that case
# `bc` is created from the 'Sequence file' tab by the cell that handles missing analysis files.
if 'Analysis file' in spreadsheet:
    cs_id = 'cell_suspension.biomaterial_core.biomaterial_id'

    # filter out analysis files that will create duplicates i.e. barcodes and features for same CS
    # and remove analysis_files that did not derive from CS
    analysis_files = spreadsheet['Analysis file']
    analysis_files = analysis_files[
        analysis_files['analysis_file.file_core.content_description.ontology'].isin(['data:3917', 'data:3112'])
    ].dropna(subset=[cs_id])

    # If we have pooled CS for analysis_files, create a new entry for each CS of each file
    if analysis_files[cs_id].str.contains("\\|\\|", na=False).any():
        print("analysis files are pooled")
        # `explode` turns each list produced by the split into one row per cell suspension;
        # `ignore_index` avoids the duplicated index labels explode would otherwise leave behind.
        analysis_files = (
            analysis_files
            .assign(**{cs_id: analysis_files[cs_id].str.split('\\|\\|')})
            .explode(cs_id, ignore_index=True)
        )
    spreadsheet['Analysis file'] = analysis_files

    # Initiate barcode df with analysis_files and unique cell_suspensions
    bc = analysis_files[[cs_id, 'analysis_file.file_core.file_name']].drop_duplicates(cs_id)

analysis files are pooled


In [9]:
# Several tabs carry generic `process.*` columns. Rename the ones we want to capture to a
# tab-specific name so they remain distinguishable; tabs missing from the workbook are skipped.

process_fields = {
    'Specimen from organism': {'process.process_core.location': 'sample_collection_site'},
    'Sequence file': {'process.process_core.process_id': 'library_sequencing_run'}
}

for sheet_name, column_renames in process_fields.items():
    if sheet_name in spreadsheet:
        spreadsheet[sheet_name] = spreadsheet[sheet_name].rename(columns=column_renames)

In [10]:
# If no analysis files are available, provide the flattened metadata at the sample level based on the CS from the sequence file tab.
# Create the backbone of the flattened metadata from the sequence file tab, removing all sequence-file-specific fields and duplicate entries.
if 'Analysis file' not in spreadsheet.keys():
    seq_specific_fields = ['sequence_file.file_core.file_name', 'sequence_file.file_core.format', 
                       'sequence_file.insdc_run_accessions', 'sequence_file.file_core.content_description.text', 
                       'sequence_file.file_core.content_description.ontology', 
                       'sequence_file.file_core.content_description.ontology_label', 'sequence_file.read_index',
                       'sequence_file.uuid', 'sequence_file.file_core.file_source', 'sequence_file.read_length', 
                       'process.uuid', 'process.insdc_experiment.insdc_experiment_accession', 'process.process_core.process_id', 
                       'sequencing_protocol.protocol_core.protocol_id', 'sequencing_protocol.uuid', 
                       'library_preparation_protocol.protocol_core.protocol_id', 'library_preparation_protocol.uuid', 
                       'sequence_file.lane_index']
    # errors="ignore": some of the listed columns are legitimately absent at this point.
    # Entirely empty columns were just removed by `dropna(axis=1, how="all")`, and
    # 'process.process_core.process_id' has been renamed by the process-field cell.
    # Without it `drop` raises a KeyError.
    bc = (
        spreadsheet['Sequence file']
        .dropna(axis=1, how="all")
        .drop(columns=seq_specific_fields, errors="ignore")
        .drop_duplicates()
    )


In [11]:
# if we have pooled keys, edit the spreadsheet and add the pooled values as extra rows

def process_pooled_keys(sheet, bc, join_key):
    """Append one summary row to ``sheet`` for every pooled ``join_key`` value found in ``bc``.

    A pooled value is a string that contains ``"||"`` (e.g. ``"id_a||id_b"``). For each
    pooled value the rows of ``sheet`` whose ``join_key`` equals one of the individual
    ids are collapsed into a single row: every column holds the distinct non-missing
    values of those rows (as strings) joined with ``"||"``, and ``join_key`` holds the
    pooled value exactly as it appears in ``bc`` so that a later merge on ``join_key``
    matches it.

    Parameters
    ----------
    sheet : pandas.DataFrame
        Spreadsheet tab that contains the ``join_key`` column.
    bc : pandas.DataFrame
        Flat metadata table whose ``join_key`` column may hold pooled values.
    join_key : str
        Name of the column shared by ``sheet`` and ``bc``.

    Returns
    -------
    pandas.DataFrame
        ``sheet`` followed by the summary rows (index reset), or ``sheet`` itself when
        there is nothing to add.
    """
    pooled_keys = [
        key for key in bc[join_key].dropna().unique().tolist()
        if isinstance(key, str) and "||" in key
    ]
    # Skip pooled values the sheet already has a row for, so that calling this function
    # again on its own output (e.g. when a cell is re-run) does not add duplicate rows.
    already_present = set(sheet[join_key].dropna())
    pooled_keys = [key for key in pooled_keys if key not in already_present]
    if not pooled_keys:
        return sheet
    print("We have pooled join keys", pooled_keys)

    summary_rows = []
    for key in pooled_keys:
        unpooled_bool = sheet[join_key].isin(key.split("||"))
        summary_row = {}
        for col, contents in sheet.loc[unpooled_bool].items():
            unique_values = list(contents.dropna().astype('str').unique())
            # A single distinct value is kept as is; a column without any value stays missing.
            summary_row[col] = '||'.join(unique_values) if unique_values else None
        # Write the pooled id to the join column itself, spelled as in `bc`; the joined
        # unique values may come out in a different order and would then not match.
        summary_row[join_key] = key
        summary_rows.append(summary_row)
    # Concatenate once, after all pooled keys have been summarised.
    return pd.concat([sheet, pd.DataFrame(summary_rows)], ignore_index=True)


In [12]:
# In each sheet (in `ordered_sheet` order) merge the flat df `bc` with the spreadsheet tab.
# The tab is joined on an id column it shares with `bc`; only columns `bc` does not have yet are added.

for sheet in ordered_sheet:
    if sheet not in spreadsheet.keys():
        continue
    keys_in_tab = spreadsheet[sheet].keys()
    candidate_keys = [key for key in keys_in_tab if key in bc.keys() and key in join_set]
    if not candidate_keys:
        # Nothing links this tab to the flat metadata yet; skip it instead of failing.
        print("No join key found for sheet " + sheet + ", skipping")
        continue
    if len(candidate_keys) > 1:
        print("Multiple join keys in " + sheet + ":\n\t" + "\n\t".join(candidate_keys))
        if 'analysis_file.file_core.file_name' in candidate_keys:
            join_key = 'analysis_file.file_core.file_name'
        else:
            # Prefer the id of the entity the tab describes (e.g. 'Cell suspension' ->
            # 'cell_suspension.*'); otherwise fall back to the first shared key.
            sheet_prefix = sheet.lower().replace(" ", "_") + "."
            own_keys = [key for key in candidate_keys if key.startswith(sheet_prefix)]
            join_key = own_keys[0] if own_keys else candidate_keys[0]
    else:
        join_key = candidate_keys[0]

    keys_in_tab = [key for key in keys_in_tab if key not in bc.keys() or key == join_key]
    # Cast to str before searching so missing ids (NaN) are not counted as pooled values.
    if join_key in keys_with_double_pipe and bc[join_key].astype(str).str.contains("||", regex=False).any():
        spreadsheet[sheet] = process_pooled_keys(spreadsheet[sheet], bc, join_key)

    print("Merging sheet " + sheet + " on key " + join_key)
    bc = bc.merge(spreadsheet[sheet][keys_in_tab], on = join_key, how = "left")
    print("Shape is " + str(bc.shape))
bc

Merging sheet Analysis file on key cell_suspension.biomaterial_core.biomaterial_id
Shape is (29, 19)
Merging sheet Cell suspension on key cell_suspension.biomaterial_core.biomaterial_id
Shape is (29, 38)
Merging sheet Specimen from organism on key specimen_from_organism.biomaterial_core.biomaterial_id
Shape is (29, 55)
Merging sheet Donor organism on key donor_organism.biomaterial_core.biomaterial_id
Shape is (29, 75)
Merging sheet Analysis protocol on key analysis_protocol.protocol_core.protocol_id
Shape is (29, 82)
Merging sheet Sequencing protocol on key sequencing_protocol.protocol_core.protocol_id
Shape is (29, 93)
Merging sheet Library preparation protocol on key library_preparation_protocol.protocol_core.protocol_id
Shape is (29, 114)
We have pooled join keys ['enzymatic_dissociation||mechanical_dissociation']
Merging sheet Dissociation protocol on key dissociation_protocol.protocol_core.protocol_id
Shape is (29, 119)
We have pooled join keys ['live_cell_selection||size_selectio

Unnamed: 0,cell_suspension.biomaterial_core.biomaterial_id,analysis_file.file_core.file_name,analysis_file.uuid,analysis_file.file_core.format,analysis_file.file_core.content_description.text,analysis_file.file_core.content_description.ontology,analysis_file.file_core.content_description.ontology_label,analysis_file.file_core.file_source,analysis_file.matrix_cell_count,analysis_file.genome_assembly_version,...,enrichment_protocol.method.text,enrichment_protocol.method.ontology,enrichment_protocol.method.ontology_label,enrichment_protocol.maximum_size,enrichment_protocol.markers,collection_protocol.protocol_core.protocol_name,collection_protocol.protocol_core.protocol_description,collection_protocol.method.text,collection_protocol.method.ontology,collection_protocol.method.ontology_label
0,IpiNivo_Complete_kidney_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,cell size selection||FACS,EFO:0009337||EFO:0009108,cell size selection||fluorescence-activated ce...,100.0,DAPI-,Kidney specimen collection,Matching blood was collected in CPT tubes befo...,surgical resection,EFO:0009744,surgical resection
1,IpiNivo_Complete_PBMC_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,density gradient centrifugation,EFO:0009112,density gradient centrifugation,,,Blood draw,Samples were directly obtained from the operat...,blood draw,EFO:0009121,blood draw
2,NivoExposed_tumor_Near_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,cell size selection||FACS,EFO:0009337||EFO:0009108,cell size selection||fluorescence-activated ce...,100.0,DAPI-,Kidney specimen collection,Matching blood was collected in CPT tubes befo...,surgical resection,EFO:0009744,surgical resection
3,UT1_tumor_Center_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,cell size selection||FACS,EFO:0009337||EFO:0009108,cell size selection||fluorescence-activated ce...,100.0,DAPI-,Kidney specimen collection,Matching blood was collected in CPT tubes befo...,surgical resection,EFO:0009744,surgical resection
4,NivoExposed_tumor_Far_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,cell size selection||FACS,EFO:0009337||EFO:0009108,cell size selection||fluorescence-activated ce...,100.0,DAPI-,Kidney specimen collection,Matching blood was collected in CPT tubes befo...,surgical resection,EFO:0009744,surgical resection
5,UT2_tumor_Near_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,cell size selection||FACS,EFO:0009337||EFO:0009108,cell size selection||fluorescence-activated ce...,100.0,DAPI-,Kidney specimen collection,Matching blood was collected in CPT tubes befo...,surgical resection,EFO:0009744,surgical resection
6,UT2_kidney_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,cell size selection||FACS,EFO:0009337||EFO:0009108,cell size selection||fluorescence-activated ce...,100.0,DAPI-,Kidney specimen collection,Matching blood was collected in CPT tubes befo...,surgical resection,EFO:0009744,surgical resection
7,IpiNivo_Mixed_kidney_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,cell size selection||FACS,EFO:0009337||EFO:0009108,cell size selection||fluorescence-activated ce...,100.0,DAPI-,Kidney specimen collection,Matching blood was collected in CPT tubes befo...,surgical resection,EFO:0009744,surgical resection
8,IpiNivo_Mixed_PBMC_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,density gradient centrifugation,EFO:0009112,density gradient centrifugation,,,Blood draw,Samples were directly obtained from the operat...,blood draw,EFO:0009121,blood draw
9,IpiNivo_Mixed_tumor_Near_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,cell size selection||FACS,EFO:0009337||EFO:0009108,cell size selection||fluorescence-activated ce...,100.0,DAPI-,Kidney specimen collection,Matching blood was collected in CPT tubes befo...,surgical resection,EFO:0009744,surgical resection


In [13]:
# Keep a single row per cell suspension; the first occurrence is the one retained.
bc = bc.drop_duplicates(
    subset=['cell_suspension.biomaterial_core.biomaterial_id'],
    keep='first',
)

In [14]:
# Write the flattened metadata next to the other flat files as "<spreadsheet name>_flat_biomaterial.csv".
# Create the output directory first, otherwise `to_csv` fails on a fresh checkout.
os.makedirs(flat_path, exist_ok=True)
# Strip the extension with `splitext` so the suffix is applied whatever the spreadsheet extension is.
flat_file_name = os.path.splitext(file_name)[0] + "_flat_biomaterial.csv"
bc.to_csv(os.path.join(flat_path, flat_file_name), index = False)