In [1]:
import pandas as pd
from numpy import nan
import os
import requests

from tier1_to_dcp_dict import tier1_to_dcp

In [2]:
collection_id = 'bcb61471-2a44-4d00-a0af-ff085512674c'
dataset_id = '0b75c598-0893-4216-afe8-5414cab7739d'

In [3]:
study_metadata = pd.read_csv("study_metadata.csv", header=None).T
study_metadata.columns = study_metadata.iloc[0]
study_metadata.drop(0, axis=0, inplace=True)
sample_metadata = pd.read_csv(f"metadata/{collection_id}_{dataset_id}_metadata.csv")

In [4]:
hca_template_url = 'https://github.com/ebi-ait/geo_to_hca/raw/master/template/hca_template.xlsx'
hca_template_file = requests.get(hca_template_url)
dcp_spreadsheet = pd.read_excel(hca_template_file.content, sheet_name=None, skiprows= [0,1,2,4])
dcp_headers = pd.read_excel(hca_template_file.content, sheet_name=None, header=None)
for tab in dcp_headers:
    dcp_headers[tab].rename(columns=dcp_headers[tab].iloc[3], inplace= True)

In [5]:
Project_Publications = {key:[nan] for key in dcp_spreadsheet['Project - Publications'].keys()}
Project_Publications['project.publications.doi'] = study_metadata['doi'].tolist()
dcp_spreadsheet['Project - Publications'] = pd.DataFrame(Project_Publications)

In [6]:
def reformat_column_values(column_name, value):
    if value == "Yes":
        return column_name
    else:
        return nan

for column in sample_metadata.columns:
    if "diabetes" in column.lower() or "hypertension" in column.lower():
        sample_metadata[column] = sample_metadata.apply(lambda row: reformat_column_values(column, row[column]), axis=1)

sample_dcp_fields = sample_metadata.rename(columns=tier1_to_dcp)

In [7]:
for tab in dcp_spreadsheet:
    keys_union = [key for key in dcp_spreadsheet[tab].keys() if key in sample_dcp_fields.keys()]
    # if tab contains only the input biomaterial name, then skip the tab
    if (len(keys_union) == 1) and (tab.lower().replace(" ", "_") != keys_union[0].split(".")[0]):
        continue
    # collapse arrays in duplicated columns
    if any(sample_dcp_fields[keys_union].columns.duplicated()):
        for dub_cols in set(sample_dcp_fields[keys_union].columns[sample_dcp_fields[keys_union].columns.duplicated()]):
            df = sample_dcp_fields[dub_cols]
            sample_dcp_fields.drop(columns=dub_cols, inplace=True)
            sample_dcp_fields[dub_cols] = df[dub_cols].apply(lambda x: '||'.join(x.dropna().astype(str)),axis=1)

    dcp_spreadsheet[tab] = pd.concat([dcp_spreadsheet[tab],sample_dcp_fields[keys_union]])
    dcp_spreadsheet[tab].dropna(how='all', inplace=True)

In [17]:
sample_metadata

Unnamed: 0,library,organism,organism_ontology_term_id,donor_id,sex,sex_ontology_term_id,development_stage,development_stage_ontology_term_id,self_reported_ethnicity,self_reported_ethnicity_ontology_term_id,...,eGFR.1,experiment.1,hypertension.1,id.1,region.l1.1,region.l2.1,specimen.1,percent.cortex.1,percent.medulla.1,obs count
0,"('KB1', 'KB1')",Homo sapiens,NCBITaxon:9606,31-10001,male,PATO:0000384,eighth decade human stage,HsapDv:0000242,European,HANCESTRO:0005,...,40-49,KPMP_20191204A_10X-R,hypertension.1,reference,Medulla,C-M,S-1908-000952-R1,40.0,60.0,
1,"('KB10', 'KB10')",Homo sapiens,NCBITaxon:9606,31-10000,male,PATO:0000384,sixth decade human stage,HsapDv:0000240,European,HANCESTRO:0005,...,20-29,KPMP_20200212B_10X-R,hypertension.1,reference,Medulla,C-M,S-1908-000905_R1,45.0,55.0,
2,"('KB11', 'KB11')",Homo sapiens,NCBITaxon:9606,33-10005,male,PATO:0000384,seventh decade human stage,HsapDv:0000241,European,HANCESTRO:0005,...,>60,KPMP_20200212C_10X-R,hypertension.1,reference,Medulla,C-M,S-1908-009843_R1,30.0,70.0,
3,"('KB12', 'KB12')",Homo sapiens,NCBITaxon:9606,32-10034,male,PATO:0000384,seventh decade human stage,HsapDv:0000241,European,HANCESTRO:0005,...,>60,KPMP_20200212D_10X-R,,reference,Cortex,C,S-1908-010125_R1,100.0,0.0,
4,"('KB13', 'KB13')",Homo sapiens,NCBITaxon:9606,33-10006,male,PATO:0000384,third decade human stage,HsapDv:0000237,European,HANCESTRO:0005,...,>60,KPMP_20200212E_10X-R,,reference,Cortex,C,S-1908-009890_R1,100.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,"('1158EO-1', '1158EO-1')",Homo sapiens,NCBITaxon:9606,Sample1158-EO1,female,PATO:0000383,48-year-old human stage,HsapDv:0000142,European,HANCESTRO:0005,...,>60,PREMIERE45,,query,,,Sample1158-EO1,,,
89,"('1158EO-2', '1158EO-2')",Homo sapiens,NCBITaxon:9606,Sample1158-EO2,male,PATO:0000384,56-year-old human stage,HsapDv:0000150,European,HANCESTRO:0005,...,>60,PREMIERE46,,query,,,Sample1158-EO2,,,
90,"('1158EO-3', '1158EO-3')",Homo sapiens,NCBITaxon:9606,Sample1158-EO3,female,PATO:0000383,35-year-old human stage,HsapDv:0000129,European,HANCESTRO:0005,...,>60,PREMIERE47,,query,,,Sample1158-EO3,,,
91,"('1162EO-1', '1162EO-1')",Homo sapiens,NCBITaxon:9606,Sample1162-EO1,male,PATO:0000384,39-year-old human stage,HsapDv:0000133,European,HANCESTRO:0005,...,>60,PREMIERE48,,query,,,Sample1162-EO1,,,


In [8]:
with pd.ExcelWriter(f"metadata/{collection_id}_{dataset_id}_dcp.xlsx") as writer:
    for tab in dcp_spreadsheet:
        if not dcp_spreadsheet[tab].empty:
            pd.concat([dcp_headers[tab], dcp_spreadsheet[tab]]).to_excel(writer, sheet_name=tab, index=False, header=False)


InvalidIndexError: Reindexing only valid with uniquely valued Index objects