In [1]:
import pandas as pd
import numpy as np
import os
import itertools

In [2]:
from metadata_dict import dcp_to_tier1_mapping

In [3]:
path = "."
file_name = "ImmuneLandscapeccRCC_metadata_30-01-2023.xlsx"

In [4]:
dcp_spreadsheet = pd.read_excel(path + "/dcp_spreadsheets/" + file_name, sheet_name=None, skiprows= [0,1,2,4])

## Uns tab

In [5]:
project_info = dcp_spreadsheet['Project']
contributors = pd.read_excel(path + "/dcp_spreadsheets/" + file_name, sheet_name='Project - Contributors', skiprows= [0,1,2,4])


uns = {
    "title": project_info.iloc[0]['project.project_core.project_title'], 
    "study_pi": contributors['project.contributors.name']\
                    [max(\
                        contributors[contributors['project.contributors.corresponding_contributor'] == True]\
                            .index)]
    }

In [6]:
uns = pd.DataFrame(uns, index=[0])

## Obs tab

In [7]:
# flat_biomaterial = pd.read_csv(path + "/flat_dcp/"  + file_name.replace(".xlsx", "_flat_biomaterial.csv"))
flat_biomaterial = pd.read_csv(path + "/flat_dcp/"  + file_name.replace(".xlsx", "_denormalised_biomaterial.csv"))

In [8]:
flat_tier1 = flat_biomaterial.rename(columns=dcp_to_tier1_mapping)\
    .drop(columns=[col for col in flat_biomaterial if col not in dcp_to_tier1_mapping.keys()])

In [9]:
if 'sample_source' in flat_tier1.keys():
    # flat_tier1.loc[(flat_tier1['sample_source'] == 'yes') & \
    #     (flat_tier1['sample_collection_method'].isin(['brush', 'scraping', 'biopsy', 'blood draw', 'body fluid', 'other'])),\
    #         'sample_source'] = 'surgical donor'
    # Normally we would use the specimen_from_organism.transplant_organ field but since it is not populated yet, we use the sample_collection_method
    # In this case however, we have nephrectomy for all donors which is a surgical recession but is also a surgical donation. Therefore, we manually change that here
    flat_tier1['sample_source'] = 'surgical donor'

In [10]:
if 'sex_ontology_term_id' in flat_tier1.keys():
    sex_ontology = {
        'sex_ontology_term_id':
            {
                'female': 'PATO:0000383',
                'male': 'PATO:0000384'
            }           
    }
    flat_tier1.replace(sex_ontology, inplace=True)

In [11]:
def library_to_tissue_type(library_id, dcp_spreadsheet):
    tissue_type_dict = {
        'specimen_from_organism.biomaterial_core.biomaterial_id': 'tissue',
        'cell_line.biomaterial_core.biomaterial_id': 'cell culture',
        'organoid.biomaterial_core.biomaterial_id': 'organoid'
    }
    row = dcp_spreadsheet['Cell suspension'][dcp_spreadsheet['Cell suspension']['cell_suspension.biomaterial_core.biomaterial_id'] == library_id]
    tissue_type = [tissue_type for dcp_type, tissue_type in tissue_type_dict.items() if dcp_type in row.columns and not any(row[dcp_type].isna())]
    if len(tissue_type) > 1:
        raise ValueError(f'Multiple input biomaterials for {library_id}')
    return tissue_type[0]

flat_tier1['tissue_type'] = flat_tier1['library_id'].apply(lambda x: library_to_tissue_type(x, dcp_spreadsheet))

In [12]:
flat_tier1['sampled_site_condition'] = np.nan
flat_tier1.loc[(['sampled_site_condition_donor'] == 'normal') & (flat_tier1['sampled_site_condition_specimen'] == 'normal'), 'sampled_site_condition'] = 'healthy'
flat_tier1.loc[(['sampled_site_condition_donor'] != 'normal') & (flat_tier1['sampled_site_condition_specimen'] == 'normal'), 'sampled_site_condition'] = 'adjacent'
flat_tier1.loc[(['sampled_site_condition_donor'] != 'normal') & (flat_tier1['sampled_site_condition_specimen'] != 'normal'), 'sampled_site_condition'] = 'diseased'

  flat_tier1.loc[(['sampled_site_condition_donor'] == 'normal') & (flat_tier1['sampled_site_condition_specimen'] == 'normal'), 'sampled_site_condition'] = 'healthy'


In [13]:
def age_to_dev(age):
    # TODO add more options
    # Unknown = unknown
    # Embryonic stage = A term from the set of Carnegie stages 1-23 = (up to 8 weeks after conception; e.g. HsapDv:0000003)
    # Fetal development = A term from the set of 9 to 38 week post-fertilization human stages = (9 weeks after conception and before birth; e.g. HsapDv:0000046)
    # Post natal =
    age_to_dev_dict = {
        (0, 14): 'HsapDv:0000264',
        (15, 19): 'HsapDv:0000268',
        (20, 29): 'HsapDv:0000237',
        (30, 39): 'HsapDv:0000238',
        (40, 49): 'HsapDv:0000239',
        (50, 59): 'HsapDv:0000240',
        (60, 69): 'HsapDv:0000241',
        (70, 79): 'HsapDv:0000242',
        (80, 89): 'HsapDv:0000243'
    }
    for age_range, label in age_to_dev_dict.items():
        if age_range[0] <= age <= age_range[1]:
            return label

if (flat_tier1['organism_ontology_term_id'] == 9606).all():
    flat_tier1['development_stage_ontology_term_id'] = flat_tier1['development_stage_ontology_term_id'].apply(age_to_dev)
else:
    print("Please convert the age to developomental stage ontology")



In [14]:
# For this dataset the preservation and storage method has not been submitted, but it is mentioned in the article that the samples were snap frozen with liquid nitrogen & stored at -80C
flat_tier1['sample_preservation_method'] = 'frozen in liquid nitrogen'

In [15]:
suspension_type_dict = {
    'suspension_type':
    {
        'single cell': 'cell',
        'single nucleus': 'nucleus'
    }
}
flat_tier1.replace(suspension_type_dict, inplace=True)

In [16]:
cell_enrichment_dict = {
    'cell_enrichment':
    {
        'DAPI-': 'na',
        np.nan: 'na'
    }
}
flat_tier1.replace(cell_enrichment_dict, inplace=True)

In [17]:
flat_tier1['is_primary_data'] = True

In [18]:
# flat_tier1['alignment_software'] = flat_tier1['alignment_software'] & flat_tier1['alignment_software_version']
flat_tier1['alignment_software'] = 'Cellranger v3.0.2'

In [19]:
unique_diseases = flat_tier1['disease_ontology_term_id'].str.split("\\|\\|", expand=True, n=1).drop_duplicates().dropna()
if unique_diseases.shape[1] > 1:
    selected_disease = ", ".join(np.unique(unique_diseases[0]))
    unselected_diseases = " and ".join(unique_diseases[1])
    print(f"From pooled diseases, we will use {selected_disease}, instead of {unselected_diseases}")

    flat_tier1['disease_ontology_term_id'] = flat_tier1['disease_ontology_term_id'].str.split("\\|\\|").str[0]

From pooled diseases, we will use MONDO:0005005, instead of HP:0002716


## Fine-tune obs dataframe

In [20]:
flat_tier1.drop(columns=['sampled_site_condition_donor', 'sampled_site_condition_specimen'], inplace=True)

In [21]:
# Template is at sample level, therefore the library level fields will be collapsed with comma
def collapse_values(series):
    return ", ".join(series.astype(str))

flat_tier1_sample = flat_tier1.groupby('sample_id').agg(collapse_values)

## Export to xlsx and `csv`s

In [22]:
tier1_output = "tier1_output/" + file_name.replace(".xlsx", "_Tier1.xlsx")
! cp 'HCA_Tier 1_ Technical Metadata template_v0.1.xlsx' $tier1_output

In [23]:
tier1_spreadsheet = {}
tier1_spreadsheet['Tier 1_uns'] = uns
tier1_spreadsheet['Tier 1_obs'] = flat_tier1_sample

In [24]:
excel_dict = pd.read_excel(tier1_output, header=1, sheet_name=None)
for sheet_name, excel_df in excel_dict.items():
    if sheet_name in tier1_spreadsheet:
        py_df = tier1_spreadsheet[sheet_name]
        excel_dict[sheet_name] = pd.concat([excel_df, py_df])

In [25]:
with pd.ExcelWriter(tier1_output) as writer:
    for tab in excel_dict:
        excel_dict[tab].to_excel(writer, sheet_name=tab, index=False)

In [26]:
tier1_spreadsheet['Tier 1_uns'].to_csv("tier1_output/" + file_name.replace('.xlsx', '_tier1_uns.csv'))
tier1_spreadsheet['Tier 1_obs'].to_csv("tier1_output/" + file_name.replace('.xlsx', '_tier1_obs.csv'))