In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from metadata_dict import (hca_keys, dcp_to_tier1_mapping)

In [3]:
# Folder holding the input DCP spreadsheets (joined with file_name when the workbook is read)
dcp_path = "./dcp_spreadsheets/"
# Folder the flattened CSV is written to in the last cell of the notebook
flat_path = "./flat_dcp/"
# Not referenced in the visible cells; presumably the Tier 1 output folder -- TODO confirm
tier1_path = "./tier1_output/"
# Workbook to flatten; the name is also reused to derive the output CSV file name
file_name = "ImmuneLandscapeccRCC_metadata_30-01-2023.xlsx"

In [4]:
# Load every tab of the workbook into a dict of {tab name: DataFrame}.
# Rows 0, 1, 2 and 4 (0-based) are skipped, so row 3 is the first row read and
# becomes the header of each tab.
spreadsheet_file = os.path.join(dcp_path, file_name)
spreadsheet = pd.read_excel(spreadsheet_file, sheet_name=None, skiprows=[0, 1, 2, 4])

## Programmatic names as join keys

In [5]:
# For each tab, collect the biomaterial / protocol ID columns that contain at
# least one value; these are the columns used to link tabs to each other.

join_keys = {}
for sheet_name, sheet_df in spreadsheet.items():
    linking_columns = [
        column
        for column in sheet_df.columns
        if column.endswith(("biomaterial_id", "protocol_id"))
        and sheet_df[column].notna().any()
    ]
    # Tabs without a single populated ID column are left out of join_keys
    if linking_columns:
        join_keys[sheet_name] = linking_columns


# Flat set of every linking column found in any tab
join_set = {column for columns in join_keys.values() for column in columns}

In [6]:
# Save keys that have double pipe in order to merge metadata together for those keys

keys_with_double_pipe = []
for sheet in join_keys.keys():
    for keys in join_keys[sheet]:
        # Cast to str before matching: the .str accessor raises on numeric ID
        # columns, and on mixed columns it returns NaN for non-string cells,
        # which the builtin any() would treat as a (false) positive.
        # regex=False matches the literal "||" without needing escapes.
        has_pooled_value = (
            spreadsheet[sheet][keys]
            .dropna()
            .astype(str)
            .str.contains("||", regex=False)
            .any()
        )
        if has_pooled_value:
            print(keys, sheet)
            # The same key can be pooled in several tabs; record it only once
            if keys not in keys_with_double_pipe:
                keys_with_double_pipe.append(keys)
keys_with_double_pipe

cell_suspension.biomaterial_core.biomaterial_id Analysis file


['cell_suspension.biomaterial_core.biomaterial_id']

## Spreadsheet tab order

In [7]:
# default ordering for simple experimental design & protocols
# ordered_sheets = ['Analysis file', 'Cell suspension', 'Specimen from organism', 'Donor organism', 
#                  'Analysis protocol', 'Sequencing protocol', 'Library preparation protocol', 'Dissociation protocol', 'Enrichment protocol', 'Collection protocol']

def field_id_to_tab(value):
    """Return the tab (entity) name for a programmatic field name.

    The text before the first '.' is taken as the entity, underscores become
    spaces and the result is capitalised, e.g.
    'cell_suspension.biomaterial_core.biomaterial_id' -> 'Cell suspension'.
    """
    entity, _, _ = value.partition('.')
    words = entity.split('_')
    return ' '.join(words).capitalize()


# Start from the file tab(s): Analysis file when present (followed by
# Sequence file if it also has join keys), otherwise Sequence file alone.
ordered_sheets = []
if 'Analysis file' in join_keys.keys():
    tab = 'Analysis file'
    ordered_sheets.append('Analysis file')
    if 'Sequence file' in join_keys.keys():
        ordered_sheets.append('Sequence file')
else:
    tab = 'Sequence file'
    ordered_sheets.append('Sequence file')


# Walk from the current tab to the tabs its ID columns point at: protocol tabs
# are appended as they are met, and a biomaterial tab becomes the next tab to
# walk from. The walk ends once 'Donor organism' is reached.
while tab != 'Donor organism':
    previous_tab = tab
    for key in join_keys[previous_tab]:
        linked_tab = field_id_to_tab(key)
        if linked_tab in ordered_sheets:
            continue
        if key.endswith('protocol_id'):
            ordered_sheets.append(linked_tab)
        elif key.endswith('biomaterial_id'):
            tab = linked_tab
            ordered_sheets.append(tab)
    # No new biomaterial tab was found in this pass, so another pass would do
    # exactly the same work forever; stop instead of hanging the kernel.
    if tab == previous_tab:
        print("Could not walk from " + tab + " to Donor organism; stopping tab ordering here")
        break

ordered_sheets

['Analysis file',
 'Sequencing protocol',
 'Library preparation protocol',
 'Analysis protocol',
 'Cell suspension',
 'Dissociation protocol',
 'Enrichment protocol',
 'Specimen from organism',
 'Collection protocol',
 'Donor organism']

## Analysis file

In [8]:
if 'Analysis file' in spreadsheet:
    cs_id_column = 'cell_suspension.biomaterial_core.biomaterial_id'
    analysis_files = spreadsheet['Analysis file']

    # Keep only files whose content ontology is data:3917 or data:3112 (avoids
    # several files per cell suspension) and drop files not linked to a cell suspension
    wanted_content = analysis_files['analysis_file.file_core.content_description.ontology']\
        .isin(['data:3917', 'data:3112'])
    analysis_files = analysis_files[wanted_content].dropna(subset=cs_id_column)
    spreadsheet['Analysis file'] = analysis_files

    # A file derived from pooled cell suspensions lists them separated by "||";
    # give every cell suspension of such a file its own row
    analysis_pooledCS = analysis_files[cs_id_column].str.contains("\\|\\|")
    if analysis_pooledCS.any():
        print(f"analysis files are pooled")
        split_ids = analysis_files[cs_id_column].str.split('\\|\\|')
        # Un-pooled entries go back to a plain string, pooled ones stay as lists
        analysis_files[cs_id_column] = split_ids.apply(
            lambda ids: ids if len(ids) != 1 else ids[0])
        spreadsheet['Analysis file'] = analysis_files.explode(cs_id_column)

In [9]:
# Several tabs carry generic `process.*` columns, so the ones of interest are
# renamed per tab to the Tier 1 field they correspond to.

process_fields = {
    'Specimen from organism': {'process.process_core.location': 'sample_collection_site'},
    'Sequence file': {'process.process_core.process_id': 'library_sequencing_run',
                      'process.insdc_experiment.insdc_experiment_accession': 'library_id_repository'}
}

for sheet, renames in process_fields.items():
    # Tabs missing from this workbook are simply skipped
    if sheet in spreadsheet:
        spreadsheet[sheet] = spreadsheet[sheet].rename(columns=renames)


## Sequence file

In [10]:
def collapse_values(series):
    """Collapse a Series into one string of its unique values joined by ", ".

    Missing values are ignored so that they do not end up as the literal text
    "nan" in the result. When the Series holds no value at all, NaN is
    returned so the collapsed cell is still recognised as missing.
    """
    values = series.dropna().unique()
    if len(values) == 0:
        return np.nan
    return ", ".join(values.astype(str))
    
# If we have insdc run accessions, and custom process_id includes this accession, then use insdc run accession instead of process_id
if 'Sequence file' in spreadsheet and \
    'sequence_file.insdc_run_accessions' in spreadsheet['Sequence file'] and \
    'library_sequencing_run' in spreadsheet['Sequence file'] and \
        spreadsheet['Sequence file']['sequence_file.insdc_run_accessions'].notna().any():
    # Only rows where both cells are strings can be compared: a missing (NaN)
    # accession or process id would make the `in` test raise a TypeError.
    insdc_in_process = np.array(
        [
            isinstance(accession, str) and isinstance(run_id, str) and accession in run_id
            for accession, run_id in zip(
                spreadsheet['Sequence file']['sequence_file.insdc_run_accessions'],
                spreadsheet['Sequence file']['library_sequencing_run'],
            )
        ],
        dtype=bool,
    )
    spreadsheet['Sequence file'].loc[insdc_in_process, 'library_sequencing_run'] = \
        spreadsheet['Sequence file'].loc[insdc_in_process, 'sequence_file.insdc_run_accessions']

# Remove files from library_preparation_protocol that has to do with modality that will not be included in the count_matrix
if 'Sequence file' in spreadsheet:
    print(f"Select the library preparations that you want to include {spreadsheet['Sequence file']['library_preparation_protocol.protocol_core.protocol_id'].unique()}\n")
    # The two `if False:` branches are manual switches: flip one to True and
    # edit its list to exclude / include specific library preparations.
    if False:
        # Exclude this library_preparation_protocol <SELECT HERE>
        exclude_lib_prep = ['TCR_library_prep']
        spreadsheet['Sequence file'] = spreadsheet['Sequence file'][~spreadsheet['Sequence file']['library_preparation_protocol.protocol_core.protocol_id'].isin(exclude_lib_prep)]
    if False:
        # Include this library_preparation_protocol <OR SELECT HERE>
        include_lib_prep = ['10x_library_prep']
        spreadsheet['Sequence file'] = spreadsheet['Sequence file'][spreadsheet['Sequence file']['library_preparation_protocol.protocol_core.protocol_id'].isin(include_lib_prep)]

    # From sequence file we record everything at the CS level:
    # one row per cell suspension, every other column collapsed with collapse_values
    spreadsheet['Sequence file'] = spreadsheet['Sequence file']\
        .groupby('cell_suspension.biomaterial_core.biomaterial_id')\
        .agg(collapse_values)\
        .reset_index()


## Initialise output dataframe

In [11]:
# Seed the flat dataframe (bc, for barcode) with one row per unique cell suspension.
if 'Analysis file' in spreadsheet:
    # With analysis files available, keep the file name next to the cell suspension id
    analysis_files = spreadsheet['Analysis file']
    bc = analysis_files[
        ['cell_suspension.biomaterial_core.biomaterial_id', 'analysis_file.file_core.file_name']
    ].drop_duplicates(subset='cell_suspension.biomaterial_core.biomaterial_id')

# Without analysis files, flatten the metadata at the sample level using the
# cell suspensions listed in the sequence file tab
# TODO unpool CS for sequence files too
elif 'Sequence file' in spreadsheet:
    sequence_files = spreadsheet['Sequence file']
    bc = sequence_files[['cell_suspension.biomaterial_core.biomaterial_id']].drop_duplicates()

## Merge

In [12]:
# if we have pooled keys, edit the spreadsheet and add the pooled values as extra rows

def process_pooled_keys(sheet, bc, join_key):
    """Append one summary row to `sheet` for every pooled id found in `bc[join_key]`.

    A pooled id is a string containing "||" (e.g. "a||b"). For each one, the
    rows of `sheet` whose `join_key` is one of its members are collapsed into a
    single row: per column, the unique non-missing values (as strings) are
    joined with "||", a single value is kept as is, and a column without any
    value stays missing. The summary row gets the pooled id in `join_key`, so
    that a later merge on `join_key` finds it.

    Parameters
    ----------
    sheet : pandas.DataFrame
        Tab holding one row per un-pooled entity.
    bc : pandas.DataFrame
        Flat dataframe whose `join_key` column may hold pooled ids.
    join_key : str
        Column present in both `sheet` and `bc`.

    Returns
    -------
    pandas.DataFrame
        `sheet` itself when nothing has to be added, otherwise a new frame made
        of `sheet` followed by the summary rows (index reset).
    """
    pooled_keys = [
        key for key in bc[join_key].dropna().unique().tolist()
        if isinstance(key, str) and "||" in key
    ]
    print("We have pooled join keys", pooled_keys)

    already_present = set(sheet[join_key].dropna())
    summary_rows = []
    for key in pooled_keys:
        # A pooled id that already has its own row must not be added twice,
        # otherwise the later left merge would duplicate rows
        if key in already_present:
            continue
        members = sheet.loc[sheet[join_key].isin(key.split("||"))]
        summary_row = {}
        for col, contents in members.items():
            values = list(contents.dropna().astype('str').unique())
            if not values:
                summary_row[col] = np.nan
            elif len(values) == 1:
                summary_row[col] = values[0]
            else:
                summary_row[col] = '||'.join(values)
        id_key = [name for name in summary_row.keys() if name.endswith("_id")]
        if len(id_key) > 1:
            print("Multiple id keys")
        # The merge is done on join_key, so that is the column that has to
        # carry the pooled id (not merely the first column ending in "_id")
        summary_row[join_key] = key
        summary_rows.append(summary_row)

    # Return only after every pooled key has been handled, and never None
    if not summary_rows:
        return sheet
    return pd.concat([sheet, pd.DataFrame(summary_rows)], ignore_index=True)



In [13]:
# in each sheet merge the flat df with the spreadsheet

for sheet in ordered_sheets:
    if sheet not in spreadsheet.keys():
        continue
    keys_in_tab = spreadsheet[sheet].keys()
    # Linking columns of this tab that the flat dataframe already contains
    candidate_keys = [key for key in keys_in_tab if key in bc.keys() and key in join_set]
    if not candidate_keys:
        # Nothing to merge on; skip the tab instead of failing on an empty list
        print("No join key found for sheet " + sheet + ", skipping it")
        continue
    if len(candidate_keys) > 1:
        print("Multiple join keys in " + sheet + ":\n\t" + "\n\t".join(candidate_keys))
        if sheet in ['Analysis file', 'Sequence file']:
            join_key = 'cell_suspension.biomaterial_core.biomaterial_id'
        else:
            # For any other tab, merge on the id that belongs to the tab itself
            own_keys = [key for key in candidate_keys if field_id_to_tab(key) == sheet]
            if not own_keys:
                raise ValueError("Cannot choose a join key for sheet " + sheet + " among: " + ", ".join(candidate_keys))
            join_key = own_keys[0]
    else:
        join_key = candidate_keys[0]

    # Only bring in columns the flat dataframe does not have yet, plus the join key
    keys_in_tab = [key for key in keys_in_tab if key not in bc.keys() or key == join_key]
    # Cast to str so missing values cannot count as a match (NaN is truthy for
    # the builtin any()); regex=False matches the literal "||"
    if join_key in keys_with_double_pipe and bc[join_key].astype(str).str.contains("||", regex=False).any():
        spreadsheet[sheet] = process_pooled_keys(spreadsheet[sheet], bc, join_key)

    print("Merging sheet " + sheet + " on key " + join_key, sep = "\t")
    bc = bc.merge(spreadsheet[sheet][keys_in_tab], on = join_key, how = "left")
    print("Shape is " + str(bc.shape))
bc

Merging sheet Analysis file on key cell_suspension.biomaterial_core.biomaterial_id
Shape is (14, 19)
Merging sheet Sequencing protocol on key sequencing_protocol.protocol_core.protocol_id
Shape is (14, 30)
Merging sheet Library preparation protocol on key library_preparation_protocol.protocol_core.protocol_id
Shape is (14, 51)
Merging sheet Analysis protocol on key analysis_protocol.protocol_core.protocol_id
Shape is (14, 58)
Merging sheet Cell suspension on key cell_suspension.biomaterial_core.biomaterial_id
Shape is (14, 74)
Merging sheet Dissociation protocol on key dissociation_protocol.protocol_core.protocol_id
Shape is (14, 79)
Merging sheet Enrichment protocol on key enrichment_protocol.protocol_core.protocol_id
Shape is (14, 85)
Merging sheet Specimen from organism on key specimen_from_organism.biomaterial_core.biomaterial_id
Shape is (14, 108)
Merging sheet Collection protocol on key collection_protocol.protocol_core.protocol_id
Shape is (14, 113)
Merging sheet Donor organism 

Unnamed: 0,cell_suspension.biomaterial_core.biomaterial_id,analysis_file.file_core.file_name,analysis_file.uuid,analysis_file.file_core.format,analysis_file.file_core.content_description.text,analysis_file.file_core.content_description.ontology,analysis_file.file_core.content_description.ontology_label,analysis_file.file_core.file_source,analysis_file.matrix_cell_count,analysis_file.genome_assembly_version,...,donor_organism.organism_age,donor_organism.organism_age_unit.text,donor_organism.organism_age_unit.ontology,donor_organism.organism_age_unit.ontology_label,donor_organism.development_stage.text,donor_organism.development_stage.ontology,donor_organism.development_stage.ontology_label,donor_organism.diseases.text,donor_organism.diseases.ontology,donor_organism.diseases.ontology_label
0,SI_18854,GSM4819725_SI_18854_filtered_gene_bc_matrices_...,25d981eb-c5e9-48a7-bd66-b00d140459c8,h5,cell count matrix,data:3917,Count matrix,GEO,1704.0,GRCh38,...,71,year,UO:0000036,year,human adult,HsapDv:0000087,human adult stage,Clear Cell RCC,MONDO:0005005,clear cell renal carcinoma
1,SI_18856,GSM4819726_SI_18856_filtered_gene_bc_matrices_...,bc678d2c-d723-4e85-b3ae-f3ee697b851c,h5,cell count matrix,data:3917,Count matrix,GEO,1410.0,GRCh38,...,70,year,UO:0000036,year,human adult,HsapDv:0000087,human adult stage,Clear Cell RCC,MONDO:0005005,clear cell renal carcinoma
2,SI_18855,GSM4819727_SI_18855_filtered_gene_bc_matrices_...,4ce78e13-cc84-40c2-96fe-603b4cc627c7,h5,cell count matrix,data:3917,Count matrix,GEO,1660.0,GRCh38,...,70,year,UO:0000036,year,human adult,HsapDv:0000087,human adult stage,Clear Cell RCC,MONDO:0005005,clear cell renal carcinoma
3,SI_19704,GSM4819728_SI_19704_filtered_gene_bc_matrices_...,7cc14036-f934-4b8a-80e3-9fb632d68a02,h5,cell count matrix,data:3917,Count matrix,GEO,923.0,GRCh38,...,69,year,UO:0000036,year,human adult,HsapDv:0000087,human adult stage,Clear Cell RCC,MONDO:0005005,clear cell renal carcinoma
4,SI_19703,GSM4819729_SI_19703_filtered_gene_bc_matrices_...,d5cce50a-a6df-4e5d-b25e-3ead78676a49,h5,cell count matrix,data:3917,Count matrix,GEO,3674.0,GRCh38,...,69,year,UO:0000036,year,human adult,HsapDv:0000087,human adult stage,Clear Cell RCC,MONDO:0005005,clear cell renal carcinoma
5,SI_21255,GSM4819730_SI_21255_filtered_gene_bc_matrices_...,9b9f11b4-12ad-49e5-8697-ec04e8efa20b,h5,cell count matrix,data:3917,Count matrix,GEO,1097.0,GRCh38,...,65,year,UO:0000036,year,human adult,HsapDv:0000087,human adult stage,Clear Cell RCC,MONDO:0005005,clear cell renal carcinoma
6,SI_21256,GSM4819731_SI_21256_filtered_gene_bc_matrices_...,8dbffaf6-aea4-4637-b717-9be34884adfc,h5,cell count matrix,data:3917,Count matrix,GEO,578.0,GRCh38,...,65,year,UO:0000036,year,human adult,HsapDv:0000087,human adult stage,Clear Cell RCC,MONDO:0005005,clear cell renal carcinoma
7,SI_21561,GSM4819732_SI_21561_filtered_gene_bc_matrices_...,3b3cfc6d-09c9-44de-8f0e-a0f65d381a25,h5,cell count matrix,data:3917,Count matrix,GEO,2853.0,GRCh38,...,43,year,UO:0000036,year,human adult,HsapDv:0000087,human adult stage,Chromophobe RCC,MONDO:0017885,chromophobe renal cell carcinoma
8,SI_22369,GSM4819733_SI_22369_filtered_gene_bc_matrices_...,3452e082-8b6e-4ff7-aff7-a29a77907e39,h5,cell count matrix,data:3917,Count matrix,GEO,1644.0,GRCh38,...,76,year,UO:0000036,year,human adult,HsapDv:0000087,human adult stage,Clear Cell RCC,MONDO:0005005,clear cell renal carcinoma
9,SI_22368,GSM4819734_SI_22368_filtered_gene_bc_matrices_...,cba3be17-2e75-4991-a154-0b61e2d3c482,h5,cell count matrix,data:3917,Count matrix,GEO,2872.0,GRCh38,...,76,year,UO:0000036,year,human adult,HsapDv:0000087,human adult stage,Clear Cell RCC,MONDO:0005005,clear cell renal carcinoma


In [14]:
# Make sure the output folder exists first: to_csv does not create missing directories
os.makedirs(flat_path, exist_ok=True)
# Write the flattened metadata next to the other outputs, named after the input workbook
bc.to_csv(os.path.join(flat_path, file_name.replace(".xlsx", "_flat_biomaterial.csv")), index = False)