In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from metadata_dict import (hca_keys, dcp_to_tier1_mapping)

Please specify the `file_name` of the DCP spreadsheet; the file must exist in the `dcp_path` directory.

In [3]:
# Directory that holds the input DCP spreadsheets; joined with `file_name` when the workbook is read.
dcp_path = "./dcp_spreadsheets/"
# Directory the flattened CSV is written to at the end of the notebook.
flat_path = "./flat_dcp/"
# Not referenced in the cells shown here; presumably used by a later Tier 1 conversion step (verify).
tier1_path = "./tier1_output/"
# Name of the spreadsheet to flatten; also the stem of the output CSV name.
file_name = "ImmuneLandscapeccRCC_metadata_30-01-2023.xlsx"

In [4]:
# sheet_name=None loads every tab and returns a dict of DataFrames keyed by tab name.
# skiprows removes spreadsheet rows 0, 1, 2 and 4, so row 3 is used as the header row
# (presumably the programmatic field names, e.g. `cell_suspension.biomaterial_core.biomaterial_id`;
# verify against the DCP spreadsheet template).
spreadsheet = pd.read_excel(os.path.join(dcp_path, file_name), sheet_name=None, skiprows= [0,1,2,4])

In [6]:
# For every tab, collect the biomaterial / protocol ID columns that hold at least one value.
# These columns are the keys the tabs can be joined on.

join_keys = {}
for sheet_name, sheet_df in spreadsheet.items():
    id_columns = [
        column
        for column in sheet_df.columns
        if column.endswith(("biomaterial_id", "protocol_id"))
        and sheet_df[column].notna().any()
    ]
    # Tabs without any populated ID column get no entry at all.
    if id_columns:
        join_keys[sheet_name] = id_columns


# Flat set of every join column found in any tab.
join_set = {column for columns in join_keys.values() for column in columns}

In [7]:
# Record the join columns whose values contain a double pipe ("||"), i.e. pooled IDs,
# so that the metadata of the pooled entities can be merged for those keys later on.

keys_with_double_pipe = []
for sheet, sheet_keys in join_keys.items():
    for key in sheet_keys:
        # na=False: a missing cell must count as "no double pipe". Without it the mask holds
        # NaN, which is truthy for the builtin any(), so every column with a blank cell
        # would be reported as pooled.
        has_double_pipe = spreadsheet[sheet][key].str.contains("\\|\\|", na=False).any()
        # The same ID column can be pooled in several tabs; list it only once.
        if has_double_pipe and key not in keys_with_double_pipe:
            keys_with_double_pipe.append(key)
keys_with_double_pipe

['enrichment_protocol.protocol_core.protocol_id',
 'dissociation_protocol.protocol_core.protocol_id',
 'cell_suspension.biomaterial_core.biomaterial_id']

In [8]:
# Automatic ordering of spreadsheet tabs from analysis file to donor
# default ordering for simple experimental design & protocols
# ordered_sheet = ['Analysis file', 'Cell suspension', 'Specimen from organism', 'Donor organism', 
#                  'Analysis protocol', 'Sequencing protocol', 'Library preparation protocol', 'Dissociation protocol', 'Enrichment protocol', 'Collection protocol']

def field_id_to_tab(value):
    """Return the tab (entity) name that a programmatic field name belongs to.

    The text before the first '.' is taken as the entity; underscores become spaces
    and the result is capitalised, e.g.
    'cell_suspension.biomaterial_core.biomaterial_id' -> 'Cell suspension'.
    """
    entity, _, _ = value.partition('.')
    return ' '.join(entity.split('_')).capitalize()


# Walk the join keys from the file tab(s) towards 'Donor organism'. Protocol tabs are appended
# as they are met; every newly met biomaterial tab is appended and becomes the next tab to walk.
ordered_sheets = []
if 'Analysis file' in join_keys:
    tab = 'Analysis file'
    ordered_sheets.append('Analysis file')
    if 'Sequence file' in join_keys:
        ordered_sheets.append('Sequence file')
else:
    # Without analysis files the sequence files are the starting point. They have to be part of
    # the ordering as well, otherwise the 'Sequence file' tab is never merged into the flat table.
    tab = 'Sequence file'
    ordered_sheets.append('Sequence file')

while tab != 'Donor organism':
    walked_tab = tab
    # .get(): a tab without any populated ID column has no entry in join_keys.
    for key in join_keys.get(walked_tab, []):
        linked_tab = field_id_to_tab(key)
        if linked_tab in ordered_sheets:
            continue
        if key.endswith('protocol_id'):
            ordered_sheets.append(linked_tab)
        elif key.endswith('biomaterial_id'):
            tab = linked_tab
            ordered_sheets.append(tab)
    if tab == walked_tab:
        # No new biomaterial tab was reached from this tab, so another pass would repeat
        # forever. Stop with what has been ordered so far instead of hanging the kernel.
        print(f"Could not walk from '{walked_tab}' to 'Donor organism': "
              "no further biomaterial tab is linked from it; stopping the ordering here.")
        break

ordered_sheets

['Analysis file',
 'Sequence file',
 'Sequencing protocol',
 'Analysis protocol',
 'Library preparation protocol',
 'Cell suspension',
 'Enrichment protocol',
 'Dissociation protocol',
 'Specimen from organism',
 'Collection protocol',
 'Donor organism']

In [9]:
# Filter out analysis files that would create duplicates (i.e. barcodes and features for the
# same CS) and remove analysis files that did not derive from a CS.
# Everything is guarded by the presence of the 'Analysis file' tab, so that spreadsheets
# without it reach the sequence-file fallback instead of failing here with a KeyError.
if 'Analysis file' in spreadsheet:
    cs_id = 'cell_suspension.biomaterial_core.biomaterial_id'
    analysis_files = spreadsheet['Analysis file']
    analysis_files = analysis_files[
        analysis_files['analysis_file.file_core.content_description.ontology'].isin(['data:3917', 'data:3112'])
    ].dropna(subset=[cs_id])

    # If we have pooled CS for analysis files, create a new entry for each CS of each file.
    analysis_pooledCS = analysis_files[cs_id].str.contains("\\|\\|", na=False)
    if analysis_pooledCS.any():
        print("analysis files are pooled")
        split_ids = analysis_files[cs_id].str.split('\\|\\|')
        # Keep the original value wherever the split produced nothing (non-string cell).
        split_ids = split_ids.where(split_ids.notna(), analysis_files[cs_id])
        # assign() builds a new frame instead of writing into the filtered slice;
        # explode() turns every list element into its own row (one-element lists become scalars).
        analysis_files = analysis_files.assign(**{cs_id: split_ids}).explode(cs_id)
    spreadsheet['Analysis file'] = analysis_files

    # Initiate the flat dataframe with the analysis files and the unique cell suspensions.
    bc = analysis_files[[cs_id, 'analysis_file.file_core.file_name']].drop_duplicates(cs_id)

analysis files are pooled


In [10]:
# Every sheet carries several process fields; rename the specific ones listed below
# to the Tier 1 field name they correspond to.

process_fields = {
    'Specimen from organism': {'process.process_core.location': 'sample_collection_site'},
    'Sequence file': {'process.process_core.process_id': 'library_sequencing_run'}
}

for sheet_name, tier1_names in process_fields.items():
    # Tabs that are absent from this spreadsheet are simply left out.
    if sheet_name in spreadsheet:
        spreadsheet[sheet_name] = spreadsheet[sheet_name].rename(columns=tier1_names)


# Sequence files

In [11]:
# Drop the sequence files whose library preparation belongs to a modality that will not be
# part of the count matrix. Toggle ONE of the two `if` switches below to exclude or include.
if 'Sequence file' in spreadsheet:
    lib_prep_col = 'library_preparation_protocol.protocol_core.protocol_id'
    print(f"Select the library preparations that you want to include {spreadsheet['Sequence file'][lib_prep_col].unique()}\n")
    if True:
        # Exclude this library_preparation_protocol <SELECT HERE>
        exclude_lib_prep = ['TCR_library_prep']
        is_excluded = spreadsheet['Sequence file'][lib_prep_col].isin(exclude_lib_prep)
        spreadsheet['Sequence file'] = spreadsheet['Sequence file'][~is_excluded]
    if False:
        # Include this library_preparation_protocol <OR SELECT HERE>
        include_lib_prep = ['10x_library_prep']
        is_included = spreadsheet['Sequence file'][lib_prep_col].isin(include_lib_prep)
        spreadsheet['Sequence file'] = spreadsheet['Sequence file'][is_included]


Select the library preparations that you want to include ['10x_library_prep' 'TCR_library_prep']



In [12]:
# If we have INSDC run accessions and the custom process_id contains that accession,
# use the INSDC run accession instead of the process_id.
if 'Sequence file' in spreadsheet and \
        'sequence_file.insdc_run_accessions' in spreadsheet['Sequence file'] and \
        'library_sequencing_run' in spreadsheet['Sequence file'] and \
        spreadsheet['Sequence file']['sequence_file.insdc_run_accessions'].notna().any():
    # Work on a copy: the tab may be a filtered slice of the original sheet.
    sequence_files = spreadsheet['Sequence file'].copy()
    accessions = sequence_files['sequence_file.insdc_run_accessions']
    run_ids = sequence_files['library_sequencing_run']
    # Only compare real strings: a missing accession or run id (NaN is a float) would make
    # the substring test raise a TypeError.
    insdc_in_process = np.array(
        [isinstance(acc, str) and isinstance(run, str) and acc in run
         for acc, run in zip(accessions, run_ids)],
        dtype=bool,
    )
    sequence_files.loc[insdc_in_process, 'library_sequencing_run'] = accessions.to_numpy()[insdc_in_process]
    spreadsheet['Sequence file'] = sequence_files

In [13]:
def collapse_values(series):
    """Collapse a Series into one string holding its distinct values, joined by ', '.

    Missing values are ignored so that they do not end up as the literal text 'nan'
    inside the collapsed metadata. Values keep their order of first appearance.

    Returns:
        The joined string, or NaN when the Series holds no value at all.
    """
    distinct = series.dropna().unique()
    if len(distinct) == 0:
        return np.nan
    return ", ".join(distinct.astype(str))

# If no analysis files are available, provide flattened metadata at the sample level,
# based on the CS listed in the sequence file tab.

if 'Analysis file' not in spreadsheet:
    bc = spreadsheet['Sequence file'][['cell_suspension.biomaterial_core.biomaterial_id']]\
        .drop_duplicates()

if 'Sequence file' in spreadsheet:
    sequence_files = spreadsheet['Sequence file']
    # We don't want to record every read_index: keep only the 'read1' rows.
    if 'sequence_file.read_index' in sequence_files and \
            sequence_files['sequence_file.read_index'].notna().any():
        is_read1 = sequence_files['sequence_file.read_index'] == 'read1'
        if is_read1.any():
            # The filtered frame has to be kept: DataFrame.drop() / boolean indexing return
            # a new frame, so discarding the result leaves the sheet untouched.
            sequence_files = sequence_files[is_read1]
        else:
            # Do not empty the sheet when the reads are labelled differently.
            print("No sequence file has read_index 'read1'; keeping all sequence file rows")
    # From the sequence files we record everything at the CS level.
    spreadsheet['Sequence file'] = sequence_files\
        .groupby('cell_suspension.biomaterial_core.biomaterial_id')\
        .agg(collapse_values)\
        .reset_index()


## Merge

In [14]:
# if we have pooled keys, edit the spreadsheet and add the pooled values as extra rows

def process_pooled_keys(sheet, bc, join_key):
    """Add one summary row to `sheet` for every pooled ("||"-joined) value of `join_key` in `bc`.

    For each pooled key, the rows of `sheet` whose `join_key` is one of the pooled IDs are
    collapsed column by column: the distinct non-missing values (as strings) are joined with
    "||". The summary row gets the pooled key itself in `join_key`, so that a later merge on
    `join_key` finds it.

    Args:
        sheet: DataFrame of one spreadsheet tab.
        bc: flat DataFrame whose `join_key` column may hold pooled keys.
        join_key: name of the column present in both frames.

    Returns:
        A new DataFrame with the summary rows appended, or `sheet` itself when there is
        nothing to add.
    """
    pooled_keys = [key for key in bc[join_key].dropna().unique().tolist()
                   if isinstance(key, str) and "||" in key]
    print("We have pooled join keys", pooled_keys)
    summary_rows = []
    for key in pooled_keys:
        # Skip keys that already have a row (e.g. the cell was run twice); a second
        # identical row would duplicate rows in the merge.
        if sheet[join_key].isin([key]).any():
            continue
        pooled_rows = sheet.loc[sheet[join_key].isin(key.split("||"))]
        summary_row = {}
        for col, contents in pooled_rows.items():
            values = list(contents.dropna().astype('str').unique())
            # A single value stays as is ('||'.join of one element); no value stays missing.
            summary_row[col] = '||'.join(values) if values else None
        # The pooled key goes into the join column itself, not into a guessed "*_id" column.
        summary_row[join_key] = key
        summary_rows.append(summary_row)
    # Return only after ALL pooled keys were handled, and never return None.
    if not summary_rows:
        return sheet
    return pd.concat([sheet, pd.DataFrame(summary_rows)], ignore_index=True)



In [15]:
# In each sheet, merge the flat dataframe with the spreadsheet tab, following ordered_sheets.

for sheet in ordered_sheets:
    if sheet not in spreadsheet:
        continue
    keys_in_tab = spreadsheet[sheet].keys()
    bc_columns = set(bc.keys())
    candidate_keys = [key for key in keys_in_tab if key in bc_columns and key in join_set]
    if not candidate_keys:
        # Nothing links this tab to the flat dataframe yet; skip it instead of failing.
        print("No join key found for sheet " + sheet + ", skipping")
        continue
    if len(candidate_keys) > 1:
        print("Multiple join keys in " + sheet + ":\n\t" + "\n\t".join(candidate_keys))
        own_id_keys = [key for key in candidate_keys if field_id_to_tab(key) == sheet]
        if sheet in ['Analysis file', 'Sequence file']:
            join_key = 'cell_suspension.biomaterial_core.biomaterial_id'
        elif own_id_keys:
            # For an entity tab, join on the tab's own ID column.
            join_key = own_id_keys[0]
        else:
            join_key = candidate_keys[0]
    else:
        join_key = candidate_keys[0]

    keys_in_tab = [key for key in keys_in_tab if key not in bc_columns or key == join_key]
    # na=False: missing join values (left merges leave NaN) must not count as pooled keys.
    if join_key in keys_with_double_pipe and bc[join_key].str.contains("\\|\\|", na=False).any():
        spreadsheet[sheet] = process_pooled_keys(spreadsheet[sheet], bc, join_key)

    print("Merging sheet " + sheet + " on key " + join_key, sep = "\t")
    bc = bc.merge(spreadsheet[sheet][keys_in_tab], on = join_key, how = "left")
    print("Shape is " + str(bc.shape))
bc

Merging sheet Analysis file on key cell_suspension.biomaterial_core.biomaterial_id
Shape is (29, 19)
Multiple join keys in Sequence file:
	cell_suspension.biomaterial_core.biomaterial_id
	sequencing_protocol.protocol_core.protocol_id
	library_preparation_protocol.protocol_core.protocol_id
Merging sheet Sequence file on key cell_suspension.biomaterial_core.biomaterial_id
Shape is (29, 32)
Merging sheet Sequencing protocol on key sequencing_protocol.protocol_core.protocol_id
Shape is (29, 43)
Merging sheet Analysis protocol on key analysis_protocol.protocol_core.protocol_id
Shape is (29, 50)
Merging sheet Library preparation protocol on key library_preparation_protocol.protocol_core.protocol_id
Shape is (29, 71)
Merging sheet Cell suspension on key cell_suspension.biomaterial_core.biomaterial_id
Shape is (29, 90)
We have pooled join keys ['live_cell_selection||size_selection']
Merging sheet Enrichment protocol on key enrichment_protocol.protocol_core.protocol_id
Shape is (29, 97)
We have

Unnamed: 0,cell_suspension.biomaterial_core.biomaterial_id,analysis_file.file_core.file_name,analysis_file.uuid,analysis_file.file_core.format,analysis_file.file_core.content_description.text,analysis_file.file_core.content_description.ontology,analysis_file.file_core.content_description.ontology_label,analysis_file.file_core.file_source,analysis_file.matrix_cell_count,analysis_file.genome_assembly_version,...,donor_organism.organism_age_unit.ontology,donor_organism.organism_age_unit.ontology_label,donor_organism.development_stage.text,donor_organism.development_stage.ontology,donor_organism.development_stage.ontology_label,donor_organism.diseases.text,donor_organism.diseases.ontology,donor_organism.diseases.ontology_label,donor_organism.medical_history.treatment,donor_organism.medical_history.medication
0,IpiNivo_Complete_kidney_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,UO:0000036,year,adult human stage,HsapDv:0000087,human adult stage,Clear cell renal cell carcinoma,MONDO:0005005,clear cell renal carcinoma,Patient was placed on ipilimumab and nivolumab...,"Ipilumab, Nivolumab"
1,IpiNivo_Complete_PBMC_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,UO:0000036,year,adult human stage,HsapDv:0000087,human adult stage,Clear cell renal cell carcinoma,MONDO:0005005,clear cell renal carcinoma,Patient was placed on ipilimumab and nivolumab...,"Ipilumab, Nivolumab"
2,NivoExposed_tumor_Near_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,UO:0000036,year,adult human stage,HsapDv:0000087,human adult stage,Clear cell renal cell carcinoma,MONDO:0005005,clear cell renal carcinoma,Patient was enrolled in a clinical trial of ne...,Nivolumab
3,UT1_tumor_Center_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,UO:0000036,year,adult human stage,HsapDv:0000087,human adult stage,Clear cell renal cell carcinoma,MONDO:0005005,clear cell renal carcinoma,Patient underwent radical nephrectomy and was ...,
4,NivoExposed_tumor_Far_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,UO:0000036,year,adult human stage,HsapDv:0000087,human adult stage,Clear cell renal cell carcinoma,MONDO:0005005,clear cell renal carcinoma,Patient was enrolled in a clinical trial of ne...,Nivolumab
5,UT2_tumor_Near_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,UO:0000036,year,adult human stage,HsapDv:0000087,human adult stage,Clear cell renal cell carcinoma,MONDO:0005005,clear cell renal carcinoma,Patient underwent a cytoreductive nephrectomy ...,
6,UT2_kidney_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,UO:0000036,year,adult human stage,HsapDv:0000087,human adult stage,Clear cell renal cell carcinoma,MONDO:0005005,clear cell renal carcinoma,Patient underwent a cytoreductive nephrectomy ...,
7,IpiNivo_Mixed_kidney_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,UO:0000036,year,adult human stage,HsapDv:0000087,human adult stage,Clear cell renal cell carcinoma,MONDO:0005005,clear cell renal carcinoma,Patient had a near complete response in the lu...,"Ipilumab, Nivolumab"
8,IpiNivo_Mixed_PBMC_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,UO:0000036,year,adult human stage,HsapDv:0000087,human adult stage,Clear cell renal cell carcinoma,MONDO:0005005,clear cell renal carcinoma,Patient had a near complete response in the lu...,"Ipilumab, Nivolumab"
9,IpiNivo_Mixed_tumor_Near_CS,ccRCC_6pat_Seurat.rds,943cffb4-e61a-4858-9b9c-f9fcdcb7ac0f,rds,count matrix,data:3917,Count matrix,Publication,167283.0,GRCh37,...,UO:0000036,year,adult human stage,HsapDv:0000087,human adult stage,Clear cell renal cell carcinoma,MONDO:0005005,clear cell renal carcinoma,Patient had a near complete response in the lu...,"Ipilumab, Nivolumab"


In [17]:
# Write the flat table next to the other flattened spreadsheets.
# Create the output directory first so a fresh checkout does not fail on the write, and derive
# the name from the file stem so that any spreadsheet extension gets the "_flat_biomaterial.csv" suffix.
os.makedirs(flat_path, exist_ok=True)
flat_file_name = os.path.splitext(file_name)[0] + "_flat_biomaterial.csv"
bc.to_csv(os.path.join(flat_path, flat_file_name), index = False)