In [1]:
import pandas as pd
import numpy as np
import os
import itertools

In [2]:
from metadata_dict import (dcp_to_tier1_mapping, tier1)

In [3]:
dcp_path = "./dcp_spreadsheets/"
flat_path = "./flat_dcp/"
tier1_path = "./tier1_output/"
file_name = "ImmuneLandscapeccRCC_metadata_30-01-2023.xlsx"

In [4]:
dcp_spreadsheet = pd.read_excel(os.path.join(dcp_path, file_name), sheet_name=None, skiprows= [0,1,2,4])

## Uns tab

In [5]:
project_info = dcp_spreadsheet['Project']
contributors = pd.read_excel(os.path.join(dcp_path, file_name), sheet_name='Project - Contributors', skiprows= [0,1,2,4])


uns = {
    "title": project_info.iloc[0]['project.project_core.project_title'], 
    # select the last listed contributor that is corresponding
    "study_pi": contributors['project.contributors.name']\
                    [max(\
                        contributors[contributors['project.contributors.corresponding_contributor'].isin([True, 'yes'])]\
                            .index)]
    }

In [6]:
uns = pd.DataFrame(uns, index=[0])

## Obs tab

In [7]:
flat_biomaterial = pd.read_csv(os.path.join(flat_path, file_name.replace(".xlsx", "_flat_biomaterial.csv")))

In [8]:
flat_tier1 = flat_biomaterial.rename(columns=dcp_to_tier1_mapping)\
    .drop(columns=[col for col in flat_biomaterial if col not in dcp_to_tier1_mapping.keys()])

In [9]:
if 'sample_source_alive' in flat_tier1.keys():
    flat_tier1['sample_source'] = np.nan
    # fill living with surgical and deceaced with post-mortem, and if there is transplant donor information overwrite
    flat_tier1.loc[(flat_tier1['sample_source_alive'] == 'yes'), 'sample_source'] = 'surgical donor'
    flat_tier1.loc[(flat_tier1['sample_source_alive'] == 'no'), 'sample_source'] = 'postmortem donor'
    
    if  'sample_source_organ' in flat_tier1.keys():
        flat_tier1.loc[(flat_tier1['sample_source_organ'] == 'yes'), 'sample_source'] = 'organ donor'
        flat_tier1.drop(columns=['sample_source_organ'], inplace=True)

  flat_tier1.loc[(flat_tier1['sample_source_alive'] == 'yes'), 'sample_source'] = 'surgical donor'


In [10]:
if 'sex_ontology_term_id' in flat_tier1.keys():
    sex_ontology = {
        'sex_ontology_term_id':
            {
                'female': 'PATO:0000383',
                'male': 'PATO:0000384'
            }           
    }
    flat_tier1.replace(sex_ontology, inplace=True)

In [11]:
def library_to_tissue_type(library_id, dcp_spreadsheet):
    tissue_type_dict = {
        'specimen_from_organism.biomaterial_core.biomaterial_id': 'tissue',
        'cell_line.biomaterial_core.biomaterial_id': 'cell culture',
        'organoid.biomaterial_core.biomaterial_id': 'organoid'
    }
    row = dcp_spreadsheet['Cell suspension'][dcp_spreadsheet['Cell suspension']['cell_suspension.biomaterial_core.biomaterial_id'] == library_id]
    tissue_type = [tissue_type for dcp_type, tissue_type in tissue_type_dict.items() if dcp_type in row.columns and not any(row[dcp_type].isna())]
    if len(tissue_type) > 1:
        raise ValueError(f'Multiple input biomaterials for {library_id}')
    return tissue_type[0]

flat_tier1['tissue_type'] = flat_tier1['library_id'].apply(lambda x: library_to_tissue_type(x, dcp_spreadsheet))

In [12]:
age_to_dev_dict = {
    (0, 14): 'HsapDv:0000264',
    (15, 19): 'HsapDv:0000268',
    (20, 29): 'HsapDv:0000237',
    (30, 39): 'HsapDv:0000238',
    (40, 49): 'HsapDv:0000239',
    (50, 59): 'HsapDv:0000240',
    (60, 69): 'HsapDv:0000241',
    (70, 79): 'HsapDv:0000242',
    (80, 89): 'HsapDv:0000243'
}

def age_to_dev(age, age_to_dev_dict=age_to_dev_dict):
    # TODO add a way to record the following options
    # Unknown = unknown
    # Embryonic stage = A term from the set of Carnegie stages 1-23 = (up to 8 weeks after conception; e.g. HsapDv:0000003)
    # Fetal development = A term from the set of 9 to 38 week post-fertilization human stages = (9 weeks after conception and before birth; e.g. HsapDv:0000046)
    # Post natal =
    if isinstance(age, str) and '-' in age:
        age = [int(age) for age in age.split('-')]
        for age_range, label in age_to_dev_dict.items():
            if age_range[0] <= age[0] <= age_range[1] and \
                    age_range[0] <= age[1] <= age_range[1]:
                return label
            elif age_range[0] <= np.mean(age) <= age_range[1]:
                print(f"Given range {age} overlaps the acceptable ranges.",
                       f"We will use the mean age {np.mean(age)}.")
                return label
    elif isinstance(age, int):
        for age_range, label in age_to_dev_dict.items():
            if age_range[0] <= age <= age_range[1]:
                return label
    print(f"Age {age} could not be mapped to accepted ranges {['-'.join(map(str, age)) for age in age_to_dev_dict.keys()]}")


if flat_tier1['organism_ontology_term_id'].isin([9606]).all():
    flat_tier1['development_stage_ontology_term_id'] = \
        flat_tier1['development_stage_ontology_term_id'].apply(age_to_dev)
else:
    print("Not human organism. Please convert the age to developomental stage ontology manually.")

In [13]:
# For this dataset the preservation and storage method has not been submitted, but it is mentioned in the article that the samples were snap frozen with liquid nitrogen & stored at -80C
if 'sample_preservation_method' not in flat_tier1.keys():
    flat_tier1['sample_preservation_method'] = 'frozen in liquid nitrogen'

In [14]:
# change names to be in align with CELLxGENE schema
suspension_type_dict = {
    'suspension_type':
    {
        'single cell': 'cell',
        'single nucleus': 'nucleus'
    }
}
flat_tier1.replace(suspension_type_dict, inplace=True)

In [15]:
cell_enrichment_dict = {
    'cell_enrichment':
    {
        'DAPI-': 'na',
        np.nan: 'na'
    }
}
flat_tier1.replace(cell_enrichment_dict, inplace=True)

In [16]:
# Almost every dataset we have wrangled is primary data. No way to automate this one as DCP does not record this information
flat_tier1['is_primary_data'] = True

In [17]:
if 'alignment_software' in flat_tier1.keys() and 'alignment_software_version' in flat_tier1.keys():
    flat_tier1['alignment_software'] = flat_tier1['alignment_software'].astype(str) + flat_tier1['alignment_software_version'].astype(str)
else:
    flat_tier1['alignment_software'] = 'Cellranger v3.0.2'
# this field was not in DCP schema when we wrangled this dataset, therefore we add it manually

In [18]:
flat_tier1['organism_ontology_term_id'] = 'NCBITaxon:' + flat_tier1['organism_ontology_term_id'].astype(str)

### Manual inspection

In [19]:
if 'library_id_repository' in flat_tier1.keys():
    print("Are these the library IDs that are listed in the repository? Or just a generic name?")
    display(flat_tier1[['library_id', 'library_id_repository']].drop_duplicates())
if True:
    del flat_tier1['library_id_repository']

Are these the library IDs that are listed in the repository? Or just a generic name?


Unnamed: 0,library_id,library_id_repository
0,IpiNivo_Complete_kidney_CS,Cell suspension from normal kidney tissue
1,IpiNivo_Complete_PBMC_CS,PBMCs
2,NivoExposed_tumor_Near_CS,Cell suspension from tumor tissue (Near)
3,UT1_tumor_Center_CS,Cell suspension from tumor tissue (Center)
4,NivoExposed_tumor_Far_CS,Cell suspension from tumor tissue (Far)
5,UT2_tumor_Near_CS,Cell suspension from tumor tissue (Near)
6,UT2_kidney_CS,Cell suspension from normal kidney tissue
7,IpiNivo_Mixed_kidney_CS,Cell suspension from normal kidney tissue
8,IpiNivo_Mixed_PBMC_CS,PBMCs
9,IpiNivo_Mixed_tumor_Near_CS,Cell suspension from tumor tissue (Near)


In [20]:
# if we have pooled diseases, we would need to select one. by default select the first and print what was not selected
unique_diseases = flat_tier1['disease_ontology_term_id'].str.split("\\|\\|", expand=True, n=1).drop_duplicates().dropna()
if unique_diseases.shape[1] > 1:
    selected_disease = ", ".join(np.unique(unique_diseases[0]))
    unselected_diseases = " and ".join(unique_diseases[1])
    print(f"From pooled diseases, we will use {selected_disease}, instead of {unselected_diseases}")

    flat_tier1['disease_ontology_term_id'] = flat_tier1['disease_ontology_term_id'].str.split("\\|\\|").str[0]

if False:
    flat_tier1.loc[(flat_tier1['disease_ontology_term_id'] == 'POOLED DISEASES'), 'disease_ontology_term_id'] = 'SELECTED DISEASE'

From pooled diseases, we will use MONDO:0005005, instead of HP:0002716


In [21]:
# diseased donor and healthy specimen does not mean adjacent every time. This needs to be inspected manually
flat_tier1['sampled_site_condition'] = np.nan
flat_tier1.loc[(flat_tier1['sampled_site_condition_donor'] == 'normal') & \
               (flat_tier1['sampled_site_condition_specimen'] == 'normal'), 'sampled_site_condition'] = 'healthy'
flat_tier1.loc[(flat_tier1['sampled_site_condition_donor'] != 'normal') & \
               (flat_tier1['sampled_site_condition_specimen'] == 'normal'), 'sampled_site_condition'] = 'adjacent'
flat_tier1.loc[(flat_tier1['sampled_site_condition_specimen'] != 'normal'), 'sampled_site_condition'] = 'diseased'

if any(flat_tier1['sampled_site_condition'] == 'adjacent'):
    print("Please investigate if diseases of donor could be diseases in specimen,",
         "in order to define healthy or adjacent sampled_site_condition")
    display(flat_tier1.loc[flat_tier1['sampled_site_condition'] == 'adjacent', \
                         ['sampled_site_condition_donor', 'sampled_site_condition_specimen', 'tissue']])

if False:
    flat_tier1.loc[(flat_tier1['sampled_site_condition_donor'] == 'TYPE NOT ADJACENT DISEASE') & \
                   (flat_tier1['sampled_site_condition_specimen'] == 'normal'), 'sampled_site_condition'] = 'healthy'

Please investigate if diseases of donor could be diseases in specimen, in order to define healthy or adjacent sampled_site_condition


  flat_tier1.loc[(flat_tier1['sampled_site_condition_donor'] == 'normal') & \


Unnamed: 0,sampled_site_condition_donor,sampled_site_condition_specimen,tissue
0,clear cell renal carcinoma,normal,kidney
1,clear cell renal carcinoma,normal,blood
6,clear cell renal carcinoma,normal,kidney
7,clear cell renal carcinoma,normal,kidney
8,clear cell renal carcinoma,normal,blood
15,clear cell renal carcinoma,normal,kidney
18,clear cell renal carcinoma,normal,kidney
23,clear cell renal carcinoma||Lymphadenopathy,normal,kidney
24,clear cell renal carcinoma||Lymphadenopathy,normal,blood
27,clear cell renal carcinoma,normal,blood


In [22]:
if 'manner_of_death' not in flat_tier1.keys():
    flat_tier1['manner_of_death'] = 'unknown'
flat_tier1['manner_of_death'] = flat_tier1['manner_of_death'].apply(str)

if 'sample_source_alive' in flat_tier1.keys():
    flat_tier1.loc[(flat_tier1['sample_source_alive'] == 'yes'), 'manner_of_death'] = 'not applicable'
if 'manner_of_death_string' in flat_tier1.keys() and \
    not flat_tier1['manner_of_death'].isin(['0', '1', '2', '3', '4', 'unknown']).any():
    print("We have the following cause of death values. Could we convert that in hardy scale?")
    display(flat_tier1.loc[(pd.notna(flat_tier1['manner_of_death_string']),'manner_of_death_string')].unique())

if False:
    flat_tier1.loc[(flat_tier1['manner_of_death_string'] == 'SELECTED CAUSE OF DEATH'), 'manner_of_death'] = 'HARDY SCALE VALUE'

In [23]:
# after inspection drop redundant columns
flat_tier1.drop(columns=['sampled_site_condition_donor', 'sampled_site_condition_specimen', 
                         'tissue', 'sample_source_alive'], inplace=True)

In [24]:
# TODO add library_sequencing_run and library_sequencing_run_insdc

## Group by sample

In [25]:
# Template is at sample level, therefore all library level fields will be collapsed with comma
def collapse_values(series):
    return ", ".join(series.unique().astype(str))

flat_tier1_sample = flat_tier1.groupby('sample_id').agg(collapse_values)

# keep only tier 1 fields, remove helper fields
flat_tier1_sample = flat_tier1_sample.reindex(columns=[col for col in tier1['obs'] if col != 'sample_id'])

## Export to xlsx

In [26]:
tier1_output = tier1_path + file_name.replace('.xlsx', "_Tier1.xlsx")
! cp 'HCA_Tier 1_ Technical Metadata template_v0.1.xlsx' $tier1_output

In [27]:
tier1_spreadsheet = {}
tier1_spreadsheet['Tier 1_uns'] = uns
tier1_spreadsheet['Tier 1_obs'] = flat_tier1_sample

In [28]:
excel_dict = pd.read_excel(tier1_output, header=1, sheet_name=None)
for sheet_name, excel_df in excel_dict.items():
    if sheet_name in tier1_spreadsheet:
        py_df = tier1_spreadsheet[sheet_name]
        excel_dict[sheet_name] = pd.concat([excel_df, py_df])

In [29]:
with pd.ExcelWriter(tier1_output) as writer:
    for tab in excel_dict:
        excel_dict[tab].to_excel(writer, sheet_name=tab, index=False)

In [30]:
tier1_spreadsheet['Tier 1_uns'].to_csv(os.path.join(tier1_path, file_name.replace('.xlsx', '_tier1_uns.csv')))
tier1_spreadsheet['Tier 1_obs'].to_csv(os.path.join(tier1_path, file_name.replace('.xlsx', '_tier1_obs.csv')))