In [1]:
import os
import io
import requests

import pandas as pd
from numpy import nan

from tier1_to_dcp_dict import tier1_to_dcp

In [2]:
# Identifiers of the collection and dataset being converted. They are combined
# into the names of the metadata input files and of the output spreadsheet.
collection_id = 'bcb61471-2a44-4d00-a0af-ff085512674c'
dataset_id = '0b75c598-0893-4216-afe8-5414cab7739d'

In [3]:
# Both metadata files share the same "<collection>_<dataset>" file-name stem.
metadata_stem = f"metadata/{collection_id}_{dataset_id}"

# The study-level CSV is read without a header row and transposed, so that the
# first column of the file ends up as the column labels of a single-record frame.
study_metadata = pd.read_csv(f"{metadata_stem}_study_metadata.csv", header=None).T
study_metadata.columns = study_metadata.iloc[0]
# the row that supplied the labels is no longer needed as data
study_metadata = study_metadata.drop(index=0)

# Sample-level metadata: a regular CSV with a header row.
sample_metadata = pd.read_csv(f"{metadata_stem}_metadata.csv")

In [4]:
# NOTE(review): this replaces the sample_metadata frame loaded from the
# collection/dataset specific file with a hard-coded example file, while the
# output spreadsheet is still named after collection_id/dataset_id.
# Confirm that this override is intended and not a debugging leftover.
sample_metadata = pd.read_csv('example/ImmuneLandscapeccRCC_metadata_30-01-2023_tier1_obs.csv')

In [5]:
hca_template_url = 'https://github.com/ebi-ait/geo_to_hca/raw/master/template/hca_template.xlsx'

# Download the template once and parse it from memory: handing the URL to
# pd.read_excel twice would fetch the same workbook over the network twice.
template_response = requests.get(hca_template_url, timeout=60)
template_response.raise_for_status()

with pd.ExcelFile(io.BytesIO(template_response.content)) as template_workbook:
    # one DataFrame per tab; rows 0-2 and 4 are skipped so that row 3
    # (the programmatic names) becomes the column labels
    dcp_spreadsheet = pd.read_excel(template_workbook, sheet_name=None, skiprows=[0, 1, 2, 4])
    # the untouched rows of every tab, kept so they can be written back on export
    dcp_headers = pd.read_excel(template_workbook, sheet_name=None, header=None)

# label the raw template rows with the programmatic names found in row 3,
# so that they align with the columns of dcp_spreadsheet
for tab in dcp_headers:
    dcp_headers[tab] = dcp_headers[tab].rename(columns=dcp_headers[tab].iloc[3])

In [6]:
# Fill the project-level tabs from columns of sample_metadata. DOI, institute
# and PI are only used when all rows share a single value.

if 'doi' in sample_metadata and len(set(sample_metadata['doi'])) == 1:
    # Read the DOI from the frame that was just validated; the check and the
    # value previously came from two different frames.
    doi = sample_metadata['doi'].iloc[0]
    dcp_spreadsheet['Project - Publications'] = pd.DataFrame({
        key: [doi if key.endswith("doi") else nan]
        for key in dcp_spreadsheet['Project - Publications'].keys()
    })

if 'institute' in sample_metadata:
    # TODO add institute per sample
    if len(set(sample_metadata['institute'])) == 1:
        # NOTE(review): assigning a scalar only fills rows that already exist in
        # the tab; if the template tab has no data rows this has no lasting
        # effect - confirm the location reaches the exported sheet.
        dcp_spreadsheet['Cell suspension']['process.process_core.location'] = sample_metadata['institute'].iloc[0]

if 'title' in sample_metadata:
    if len(set(sample_metadata['title'])) != 1:
        print(f"We have multiple titles {set(sample_metadata['title'])}")
    # the first title is used even when several are present (see message above)
    project_title = sample_metadata['title'].iloc[0]
    dcp_spreadsheet['Project'] = pd.DataFrame({
        key: [project_title if key.endswith("project_title") else nan]
        for key in dcp_spreadsheet['Project'].keys()
    })

if 'study_pi' in sample_metadata.columns and 'institute' in sample_metadata:
    # TODO add fix for multiple institutes per sample
    if len(set(sample_metadata['study_pi'])) == 1 and \
            len(set(sample_metadata['institute'])) == 1:
        # positional access: does not rely on the index containing the label 0
        study_pi_dict = {
            'project.contributors.name': sample_metadata['study_pi'].iloc[0],
            'project.contributors.institution': sample_metadata['institute'].iloc[0],
            'project.contributors.corresponding_contributor': 'yes'
        }
        # every other field of the tab is left empty
        study_pi_dict.update({
            key: nan
            for key in dcp_spreadsheet['Project - Contributors'].keys()
            if key not in study_pi_dict
        })
        dcp_spreadsheet['Project - Contributors'] = pd.DataFrame(study_pi_dict, index=[0])



In [7]:
# Split the free-text relative collection time point into the timecourse
# value / relevance / unit fields.
if 'sample_collection_relative_time_point' in sample_metadata:
    # Digits with an optional decimal part, using '.' or ',' as separator.
    # The previous pattern also accepted '|' as separator and kept only a
    # single decimal digit, e.g. '2.75' was truncated to '2.7'.
    number_pattern = r'(\d+(?:[.,]\d+)?)'
    sample_metadata['specimen_from_organism.biomaterial_core.timecourse.value'] = \
        sample_metadata['sample_collection_relative_time_point'].str.extract(number_pattern, expand=False)
    # relevance is only set on rows that actually have a time point
    sample_metadata.loc[
        sample_metadata['sample_collection_relative_time_point'].notna(),
        'specimen_from_organism.biomaterial_core.timecourse.relevance'
    ] = 'relative time of collection'
    # first time unit keyword found in the text; NaN when none is present
    time_units_pattern = r'(hour|day|week|month|year)'
    sample_metadata['specimen_from_organism.biomaterial_core.timecourse.unit.text'] = \
        sample_metadata['sample_collection_relative_time_point'].str.extract(time_units_pattern, expand=False)

In [8]:
# Derive the NCBI taxon id by dropping the 'NCBITaxon:' prefix from the
# organism ontology term; values without the prefix are kept as they are.
organism_term_column = 'organism_ontology_term_id'
if organism_term_column in sample_metadata:
    taxon_ids = sample_metadata[organism_term_column].str.removeprefix('NCBITaxon:')
    sample_metadata['donor_organism.biomaterial_core.ncbi_taxon_id'] = taxon_ids

In [9]:
# PATO term -> sex label, nested under the name of the column it applies to.
sex_ontology_dict = {
    'donor_organism.sex': {'PATO:0000383': 'female', 'PATO:0000384': 'male'}
}

if 'sex_ontology_term_id' in sample_metadata:
    # translate the known PATO terms; any other value is copied through unchanged
    sex_labels = sex_ontology_dict['donor_organism.sex']
    sample_metadata['donor_organism.sex'] = sample_metadata['sex_ontology_term_id'].replace(sex_labels)

In [10]:
# Hardy scale scores, listed both as integers and as their string form.
hardy_scale = [*range(5), *(str(score) for score in range(5))]

# manner_of_death -> is_living label, nested under the column it applies to:
# any Hardy score and 'unknown' map to 'no', 'not applicable' maps to 'yes'.
manner_of_death_is_living_dict = {
    'donor_organism.is_living': {
        **dict.fromkeys(hardy_scale, 'no'),
        'unknown': 'no',
        'not applicable': 'yes',
    }
}

if 'manner_of_death' in sample_metadata:
    def _hardy_score_or_nan(row):
        """Return the row's manner_of_death when it is a Hardy score, else NaN."""
        death = row['manner_of_death']
        return death if death in hardy_scale else nan

    sample_metadata['donor_organism.death.hardy_scale'] = sample_metadata.apply(_hardy_score_or_nan, axis=1)
    # start from the raw manner_of_death values, then translate them in place
    sample_metadata['donor_organism.is_living'] = sample_metadata['manner_of_death']
    sample_metadata.replace(manner_of_death_is_living_dict, inplace=True)

In [11]:
if 'sample_source' in sample_metadata:
    # NOTE(review): the consistency check below spells sample_source values with
    # a space ('postmortem donor') while this comparison uses 'organ_donor' -
    # confirm the exact value expected here.
    sample_metadata['specimen_from_organism.transplant_organ'] = \
        sample_metadata['sample_source'].eq('organ_donor').map({True: 'yes', False: 'no'})

    # The cross-check needs manner_of_death; skip it when that column is absent
    # instead of failing with a KeyError.
    if 'manner_of_death' in sample_metadata:
        is_postmortem = sample_metadata['sample_source'] == 'postmortem donor'
        death_not_applicable = sample_metadata['manner_of_death'] == 'not applicable'
        conflict_masks = (
            # postmortem donor without any manner of death
            is_postmortem & death_not_applicable,
            # manner of death given for a donor that is not postmortem
            ~is_postmortem & ~death_not_applicable,
        )
        # The masks are computed once and kept out of the f-string, which also
        # keeps the cell valid on Python versions older than 3.12.
        for conflict_mask in conflict_masks:
            if conflict_mask.any():
                conflicting_rows = sample_metadata.loc[conflict_mask, ['sample_source', 'manner_of_death']]
                print(f'Conflicting metadata {conflicting_rows}')

In [12]:
# Relabel the columns through the tier1_to_dcp mapping; columns without an
# entry in the mapping keep their current name.
dcp_flat = sample_metadata.rename(tier1_to_dcp, axis='columns')

In [13]:
def _join_non_missing(values):
    """Join the non-missing values of a row with '||'; NaN when there are none.

    Returning NaN instead of an empty string keeps rows without any value
    recognisable as missing, so that dropna(how='all') can still remove them.
    """
    joined = '||'.join(values.dropna().astype(str))
    return joined if joined else nan


# Copy the flattened metadata into every template tab that shares fields with it.
for tab in dcp_spreadsheet:
    # fields of this tab that are also present in the flattened metadata
    keys_union = [key for key in dcp_spreadsheet[tab].keys() if key in dcp_flat.keys()]
    # if tab contains only the input biomaterial name, then skip the tab
    if (len(keys_union) == 1) and (tab.lower().replace(" ", "_") != keys_union[0].split(".")[0]):
        continue
    # collapse arrays in duplicated columns
    tab_columns = dcp_flat[keys_union].columns
    for dub_cols in set(tab_columns[tab_columns.duplicated()]):
        # selecting a duplicated label returns every column carrying that name
        df = dcp_flat[dub_cols]
        dcp_flat.drop(columns=dub_cols, inplace=True)
        dcp_flat[dub_cols] = df.apply(_join_non_missing, axis=1)

    # merge the two dataframes; this already removes empty and duplicated rows,
    # so no further de-duplication is needed below
    dcp_spreadsheet[tab] = pd.concat([dcp_spreadsheet[tab], dcp_flat[keys_union]])
    dcp_spreadsheet[tab] = dcp_spreadsheet[tab].dropna(how='all').drop_duplicates()

    # generate a unique protocol_id
    if tab.endswith('protocol') and keys_union:
        # there should be only 1 protocol_id in each protocol tab. we need a series to replace spaces
        protocol_id_col = [col for col in dcp_spreadsheet[tab].columns if col.endswith('protocol_core.protocol_id')][0]
        # ids are "<tab name in snake case>_<1-based row number>"
        dcp_spreadsheet[tab][protocol_id_col] = [tab.lower().replace(" ", "_") + "_" + str(n + 1) for n in range(len(dcp_spreadsheet[tab]))]




In [14]:
# Write every non-empty tab to the output workbook, with the original template
# rows placed above the data.
with pd.ExcelWriter(f"metadata/{collection_id}_{dataset_id}_dcp.xlsx") as writer:
    for tab in dcp_spreadsheet:
        if not dcp_spreadsheet[tab].empty:
            print(tab)
            # header=False: the template rows already contain the header lines.
            # index=False: otherwise the DataFrame index is written as an extra
            # first column, shifting every field relative to the template.
            pd.concat([dcp_headers[tab], dcp_spreadsheet[tab]]).to_excel(
                writer, sheet_name=tab, header=False, index=False
            )

Project
Project - Contributors
Donor organism
Specimen from organism
Cell suspension
Sequence file
Collection protocol
Enrichment protocol
Library preparation protocol
Sequencing protocol
Analysis file
Analysis protocol
