In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from metadata_dict import (hca_keys, dcp_to_tier1_mapping)

In [3]:
# Input/output folders for the selected bionetwork (the trailing slash is part of its name)
bionetwork = "pancreas/"
dcp_path = f"./dcp_spreadsheets/{bionetwork}"
flat_path = f"./flat_dcp/{bionetwork}"
tier1_path = f"./tier1_output/{bionetwork}"
# Default spreadsheet name; a later cell overwrites it with the dropdown selection
file_name = "ImmuneLandscapeccRCC_metadata_30-01-2023.xlsx"

In [4]:
import ipywidgets as widgets
from IPython.display import display
# Offer the spreadsheets found in the bionetwork folder.
# os.listdir returns entries in arbitrary order, so sort them to make the default
# (first) option the same on every run, and leave out hidden files and Excel lock
# files ("~$...") which are not workbooks.
available_spreadsheets = sorted(
    entry for entry in os.listdir(path=dcp_path)
    if not entry.startswith((".", "~$"))
)
file_name_dropdown = widgets.Dropdown(
    options=available_spreadsheets,
    # value=file_name,
    description='Report Entity:',
    disabled=False,
)
display(file_name_dropdown)

Dropdown(description='Report Entity:', options=('Healthy_and_type_2_diabetes_pancreas.xlsx', 'pancreasNormalIs…

In [5]:
# Take the spreadsheet currently selected in the dropdown and echo it, so the
# notebook output records which file was flattened.
# NOTE(review): this depends on interactive widget state; on an untouched "Run All"
# the value is presumably the dropdown's first option - confirm.
file_name = file_name_dropdown.value
print(file_name)

Healthy_and_type_2_diabetes_pancreas.xlsx


In [6]:
# Load every tab of the workbook: sheet_name=None returns a dict of DataFrames keyed by tab name.
# Rows 0, 1, 2 and 4 are skipped, so row 3 is the first row read and provides the column names.
spreadsheet = pd.read_excel(os.path.join(dcp_path, file_name), sheet_name=None, skiprows= [0,1,2,4])

## Programmatic names as join keys

In [7]:
# For each tab, keep the biomaterial/protocol id columns that hold at least one value;
# these are the candidate join keys. Tabs without any such column get no entry.

join_keys = {}
for sheet, frame in spreadsheet.items():
    id_columns = [
        column for column in frame.keys()
        if column.endswith(("biomaterial_id", "protocol_id")) and frame[column].notna().any()
    ]
    if id_columns:
        join_keys[sheet] = id_columns


# Every join key seen in any tab
join_set = set().union(*join_keys.values())

In [8]:
# Save keys that have double pipe in order to merge metadata together for those keys

keys_with_double_pipe = []
for sheet, sheet_keys in join_keys.items():
    for key in sheet_keys:
        # Cast to str before searching: with the .str accessor alone a non-string id
        # produces NaN, and NaN counts as True inside any(), which would flag the
        # column by mistake (a purely numeric column would raise instead).
        id_values = spreadsheet[sheet][key].dropna().astype(str)
        if id_values.str.contains("||", regex=False).any():
            print(key, sheet)
            keys_with_double_pipe.append(key)
keys_with_double_pipe

[]

## Spreadsheet tab order

In [9]:
# default ordering for simple experimental design & protocols
# ordered_sheets = ['Analysis file', 'Cell suspension', 'Specimen from organism', 'Donor organism', 
#                  'Analysis protocol', 'Sequencing protocol', 'Library preparation protocol', 'Dissociation protocol', 'Enrichment protocol', 'Collection protocol']

def field_id_to_tab(value):
    """Return the entity/tab name for a programmatic field name.

    The text before the first '.' is taken, underscores are turned into spaces and
    the result is capitalised (first letter upper case, the rest lower case), e.g.
    'donor_organism.biomaterial_core.biomaterial_id' -> 'Donor organism'.
    """
    entity, _, _ = value.partition('.')
    return ' '.join(entity.split('_')).capitalize()


# Walk the experimental design starting from the file tab: protocols referenced by the
# current tab are appended as they are met, and each newly met biomaterial tab becomes
# the next tab to walk. The walk ends at 'Donor organism' or once every tab with join
# keys has been ordered.
ordered_sheets = []
if 'Analysis file' in join_keys.keys():
    tab = 'Analysis file'
    ordered_sheets.append('Analysis file')
    if 'Sequence file' in join_keys.keys():
        ordered_sheets.append('Sequence file')
else:
    tab = 'Sequence file'
    ordered_sheets.append('Sequence file')


while len(ordered_sheets) < len(join_keys):
    if tab == 'Donor organism':
        break
    sheets_before = len(ordered_sheets)
    # .get: a referenced tab may have no join keys of its own (or not exist in the workbook)
    for key in join_keys.get(tab, []):
        if field_id_to_tab(key) in ordered_sheets:
            continue
        if key.endswith('protocol_id'):
            ordered_sheets.append(field_id_to_tab(key))
        elif key.endswith('biomaterial_id'):
            tab = field_id_to_tab(key)
            ordered_sheets.append(tab)
    if len(ordered_sheets) == sheets_before:
        # No new tab was reached, so `tab` did not change either and another pass
        # would do exactly the same: stop instead of looping forever.
        print("No further tabs linked from " + tab + ", stopping the tab ordering here")
        break

ordered_sheets

['Sequence file',
 'Sequencing protocol',
 'Library preparation protocol',
 'Cell suspension',
 'Dissociation protocol',
 'Specimen from organism',
 'Donor organism']

## Analysis file

In [10]:
if 'Analysis file' in spreadsheet:
    cs_column = 'cell_suspension.biomaterial_core.biomaterial_id'
    ontology_column = 'analysis_file.file_core.content_description.ontology'

    # Keep only analysis files whose content description matches one of the listed ontology
    # ids (avoids duplicates such as barcodes and features for the same CS) and that are
    # linked to a cell suspension.
    analysis_files = spreadsheet['Analysis file']
    wanted_content = analysis_files[ontology_column].str.contains('data:3917|data:3112|data:2082', na=False)
    analysis_files = analysis_files[wanted_content].dropna(subset=cs_column)

    # Pooled CS are written as "id1||id2": give every CS of a pooled file its own row
    analysis_pooledCS = analysis_files[cs_column].str.contains("\\|\\|")
    if any(analysis_pooledCS):
        print("analysis files are pooled")
        split_ids = analysis_files[cs_column]\
            .str.split('\\|\\|')\
            .apply(lambda parts: parts if len(parts) != 1 else parts[0])
        analysis_files = analysis_files.assign(**{cs_column: split_ids}).explode(cs_column)

    spreadsheet['Analysis file'] = analysis_files

In [11]:
# Each sheet holds several process fields, so the specific ones listed here are renamed
# to their Tier 1 field name, sheet by sheet.

process_fields = {
    'Specimen from organism': {'process.process_core.location': 'sample_collection_site'},
    'Sequence file': {'process.process_core.process_id': 'library_sequencing_run',
                      'process.insdc_experiment.insdc_experiment_accession': 'library_id_repository'}
}

for sheet, tier1_names in process_fields.items():
    # Sheets that are absent from this workbook are simply left out
    if sheet in spreadsheet:
        spreadsheet[sheet] = spreadsheet[sheet].rename(columns=tier1_names)


## Sequence file

In [12]:
def collapse_values(series):
    """Join the distinct values of `series`, converted to strings, into one ', '-separated string."""
    distinct_as_text = series.unique().astype(str)
    return ", ".join(distinct_as_text)
    
# If we have insdc run accessions, and custom process_id includes this accession, then use insdc run accession instead of process_id
if 'Sequence file' in spreadsheet and \
        'sequence_file.insdc_run_accessions' in spreadsheet['Sequence file'] and \
        'library_sequencing_run' in spreadsheet['Sequence file'] and \
        spreadsheet['Sequence file']['sequence_file.insdc_run_accessions'].notna().any():

    def _accession_in_process_id(row):
        """True when the row's run accession is a substring of its process id."""
        accession = row['sequence_file.insdc_run_accessions']
        process_id = row['library_sequencing_run']
        # Empty cells are float NaN, and a substring test on NaN raises TypeError,
        # so rows where either value is not text never match.
        return isinstance(accession, str) and isinstance(process_id, str) and accession in process_id

    insdc_in_process = spreadsheet['Sequence file'].apply(_accession_in_process_id, axis=1).astype(bool)
    spreadsheet['Sequence file'].loc[insdc_in_process, 'library_sequencing_run'] = \
        spreadsheet['Sequence file'].loc[insdc_in_process, 'sequence_file.insdc_run_accessions']

# Remove files from library_preparation_protocol that has to do with modality that will not be included in the count_matrix
if 'Sequence file' in spreadsheet:
    lib_prep_column = 'library_preparation_protocol.protocol_core.protocol_id'
    print(f"Select the library preparations that you want to include {spreadsheet['Sequence file'][lib_prep_column].unique()}\n")

    # Exclude these library_preparation_protocols <SELECT HERE>; an empty list excludes nothing
    exclude_lib_prep = ['TCR_library_prep']
    # Keep only these library_preparation_protocols <OR SELECT HERE>, e.g. ['10x_library_prep'];
    # an empty list keeps everything
    include_lib_prep = []

    if exclude_lib_prep:
        spreadsheet['Sequence file'] = spreadsheet['Sequence file'][~spreadsheet['Sequence file'][lib_prep_column].isin(exclude_lib_prep)]
    if include_lib_prep:
        spreadsheet['Sequence file'] = spreadsheet['Sequence file'][spreadsheet['Sequence file'][lib_prep_column].isin(include_lib_prep)]

    # From sequence file we record everything at the CS level
    spreadsheet['Sequence file'] = spreadsheet['Sequence file']\
        .groupby('cell_suspension.biomaterial_core.biomaterial_id')\
        .agg(collapse_values)\
        .reset_index()


Select the library preparations that you want to include ['library_preparation_protocol_1']



## Initialise output dataframe

In [13]:
# Initiate flat dataframe with analysis_files and unique cell_suspensions (bc for barcode)
if 'Analysis file' in spreadsheet:
    bc = spreadsheet['Analysis file'][['cell_suspension.biomaterial_core.biomaterial_id','analysis_file.file_core.file_name']]\
        .drop_duplicates('cell_suspension.biomaterial_core.biomaterial_id')

# If no analysis files are available, provide flattened metadata at the sample level based on the CS from the sequence file tab
# TODO unpool CS for sequence files too
elif 'Sequence file' in spreadsheet:
    bc = spreadsheet['Sequence file'].loc[:, ['cell_suspension.biomaterial_core.biomaterial_id']]\
        .drop_duplicates()

# Without either tab there is nothing to start from; stop here with a clear message
# instead of hitting a NameError on `bc` further down
else:
    raise ValueError("Spreadsheet has neither an 'Analysis file' nor a 'Sequence file' tab")

## Merge

In [14]:
# if we have pooled keys, edit the spreadsheet and add the pooled values as extra rows

def process_pooled_keys(sheet, bc, join_key):
    """Append one summary row to `sheet` for every pooled join key found in `bc`.

    A pooled key is a string value of ``bc[join_key]`` that contains "||", i.e. several
    ids joined together. For each pooled key, the rows of `sheet` whose `join_key` is
    one of the pooled ids are collapsed into a single row: per column the distinct
    non-null values (as strings) are kept, joined with "||" when there are several.
    Columns with no value in those rows are left empty. The row's `join_key` is set
    to the pooled key itself so that it matches when `bc` is merged with `sheet`.

    Parameters
    ----------
    sheet : pandas.DataFrame
        Tab to extend; must contain the column `join_key`.
    bc : pandas.DataFrame
        Frame whose `join_key` column may hold pooled ids.
    join_key : str
        Name of the column shared by `sheet` and `bc`.

    Returns
    -------
    pandas.DataFrame
        `sheet` followed by one summary row per pooled key (with a fresh index), or
        `sheet` unchanged when `bc` holds no pooled key.
    """
    pooled_keys = [key for key in bc[join_key].dropna().unique().tolist()
                   if isinstance(key, str) and "||" in key]
    print("We have pooled join keys", pooled_keys)
    if not pooled_keys:
        # Always hand back a frame: the caller stores the result as the sheet
        return sheet

    summary_rows = []
    for key in pooled_keys:
        unpooled_bool = sheet[join_key].isin(key.split("||"))
        summary_row = {}
        for col, contents in sheet.loc[unpooled_bool].items():
            values = list(contents.dropna().astype('str').unique())
            if values:
                # a single distinct value stays as is, several are pooled with "||"
                summary_row[col] = '||'.join(values)
        # The merge is done on join_key, so that is the column that must carry the pooled id
        summary_row[join_key] = key
        summary_rows.append(summary_row)

    # Return only after all pooled keys were handled, adding the rows in one go
    return pd.concat([sheet, pd.DataFrame(summary_rows)], ignore_index=True)



In [15]:
# in each sheet merge the flat df with the spreadsheet

for sheet in ordered_sheets:
    if sheet not in spreadsheet.keys():
        continue
    keys_in_tab = spreadsheet[sheet].keys()
    candidate_keys = [key for key in keys_in_tab if key in bc.keys() and key in join_set]

    if not candidate_keys:
        # The tab shares no join key with the rows collected so far, so it cannot be merged
        print("No join key found for sheet " + sheet + ", skipping")
        continue
    if len(candidate_keys) == 1:
        join_key = candidate_keys[0]
    else:
        print("Multiple join keys in " + sheet + ":\n\t" + "\n\t".join(candidate_keys))
        if sheet in ['Analysis file', 'Sequence file']:
            join_key = 'cell_suspension.biomaterial_core.biomaterial_id'
        else:
            # Join on the tab's own id column, i.e. the key whose entity prefix matches
            # the tab name ('Cell suspension' -> 'cell_suspension.…')
            entity = sheet.lower().replace(' ', '_')
            own_keys = [key for key in candidate_keys if key.split('.')[0] == entity]
            if not own_keys:
                raise ValueError("Cannot choose a join key for sheet " + sheet + " among: " + ", ".join(candidate_keys))
            join_key = own_keys[0]

    # Columns already present in bc are not merged again, except the join key itself
    keys_in_tab = [key for key in keys_in_tab if key not in bc.keys() or key == join_key]
    # astype(str): missing ids would otherwise give NaN, which counts as True in a plain any()
    if join_key in keys_with_double_pipe and bc[join_key].astype(str).str.contains("||", regex=False).any():
        spreadsheet[sheet] = process_pooled_keys(spreadsheet[sheet], bc, join_key)

    print("Merging sheet " + sheet + " on key " + join_key)
    bc = bc.merge(spreadsheet[sheet][keys_in_tab], on = join_key, how = "left")
    print("Shape is " + str(bc.shape))
bc

Merging sheet Sequence file on key cell_suspension.biomaterial_core.biomaterial_id
Shape is (3514, 15)
Merging sheet Sequencing protocol on key sequencing_protocol.protocol_core.protocol_id
Shape is (3514, 25)
Merging sheet Library preparation protocol on key library_preparation_protocol.protocol_core.protocol_id
Shape is (3514, 41)
Merging sheet Cell suspension on key cell_suspension.biomaterial_core.biomaterial_id
Shape is (3514, 58)
Merging sheet Dissociation protocol on key dissociation_protocol.protocol_core.protocol_id
Shape is (3514, 64)
Merging sheet Specimen from organism on key specimen_from_organism.biomaterial_core.biomaterial_id
Shape is (3514, 79)
Merging sheet Donor organism on key donor_organism.biomaterial_core.biomaterial_id
Shape is (3514, 99)


Unnamed: 0,cell_suspension.biomaterial_core.biomaterial_id,sequence_file.uuid,sequence_file.file_core.file_name,sequence_file.file_core.format,sequence_file.read_index,sequence_file.read_length,sequence_file.insdc_run_accessions,process.uuid,library_id_repository,library_sequencing_run,...,donor_organism.diseases.ontology,donor_organism.diseases.ontology_label,donor_organism.development_stage.text,donor_organism.development_stage.ontology,donor_organism.development_stage.ontology_label,donor_organism.organism_age,donor_organism.organism_age_unit.text,donor_organism.organism_age_unit.ontology,donor_organism.organism_age_unit.ontology_label,donor_organism.human_specific.body_mass_index
0,AZ_A1,cd3734cc-be6e-4ca5-b8d7-895be2a8d250,AZ_A1.fastq.gz,fastq.gz,read1,43,ERR1630013,6b38f344-d3c4-42f1-bf4a-636780738b1b,ERX1700346,process_id_5637,...,PATO:0000461,normal,adult,HsapDv:0000087,human adult stage,43,year,UO:0000036,year,30.8
1,AZ_A10,bb3c4911-f5e4-4407-a925-06b0f8a5170b,AZ_A10.fastq.gz,fastq.gz,read1,43,ERR1630014,2fcd9719-76ba-4201-99f5-81b6845b0d21,ERX1700347,process_id_4905,...,PATO:0000461,normal,adult,HsapDv:0000087,human adult stage,43,year,UO:0000036,year,30.8
2,AZ_A11,7968b49c-c91e-4fc7-97ec-8471448855b8,AZ_A11.fastq.gz,fastq.gz,read1,43,ERR1630015,da577196-83b3-48fd-ab94-de95c6a799d9,ERX1700348,process_id_3710,...,PATO:0000461,normal,adult,HsapDv:0000087,human adult stage,43,year,UO:0000036,year,30.8
3,AZ_A12,118a91d4-62b2-416e-9db2-27c1d755f616,AZ_A12.fastq.gz,fastq.gz,read1,43,ERR1630016,65d4e84e-1f60-4268-b359-aef79e8cfda8,ERX1700349,process_id_4906,...,PATO:0000461,normal,adult,HsapDv:0000087,human adult stage,43,year,UO:0000036,year,30.8
4,AZ_A2,43e879b7-1039-4da4-bb92-7fa08e617066,AZ_A2.fastq.gz,fastq.gz,read1,43,ERR1630017,eca2f4b3-d38c-44d0-a6ed-390e55e9e07e,ERX1700350,process_id_5428,...,PATO:0000461,normal,adult,HsapDv:0000087,human adult stage,43,year,UO:0000036,year,30.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3509,HP1526901T2D_P5,fba4b772-f9ee-4f75-b99a-45dfa349ecae,HP1526901T2D_P5.fastq.gz,fastq.gz,read1,43,ERR1633522,18a831b1-400a-4bd2-937b-182138cb7704,ERX1703855,process_id_6939,...,MONDO:0005148,type 2 diabetes mellitus,adult,HsapDv:0000087,human adult stage,55,year,UO:0000036,year,29.8
3510,HP1526901T2D_P6,692c950c-5429-4661-b7dc-86bf03bcfa8e,HP1526901T2D_P6.fastq.gz,fastq.gz,read1,43,ERR1633523,4fd648c5-d1b5-42dd-a4ee-5aaabe0f6390,ERX1703856,process_id_6940,...,MONDO:0005148,type 2 diabetes mellitus,adult,HsapDv:0000087,human adult stage,55,year,UO:0000036,year,29.8
3511,HP1526901T2D_P7,837e3e59-78b4-44b0-9c88-1364ad0241cd,HP1526901T2D_P7.fastq.gz,fastq.gz,read1,43,ERR1633524,442be1b0-c0b1-4c7b-b2fb-c9172467898e,ERX1703857,process_id_4864,...,MONDO:0005148,type 2 diabetes mellitus,adult,HsapDv:0000087,human adult stage,55,year,UO:0000036,year,29.8
3512,HP1526901T2D_P8,b768ced2-363f-4dd1-98da-115237eec156,HP1526901T2D_P8.fastq.gz,fastq.gz,read1,43,ERR1633525,f10bd522-753b-4747-b99d-2bbab330eb3a,ERX1703858,process_id_6941,...,MONDO:0005148,type 2 diabetes mellitus,adult,HsapDv:0000087,human adult stage,55,year,UO:0000036,year,29.8


In [16]:
# to_csv does not create missing folders, so make sure the output folder exists first
os.makedirs(flat_path, exist_ok=True)
# Swap the file extension for the output suffix; splitext only touches the real
# extension, whereas str.replace would also change ".xlsx" elsewhere in the name
flat_file_name = os.path.splitext(file_name)[0] + "_flat_biomaterial.csv"
bc.to_csv(os.path.join(flat_path, flat_file_name), index = False)