In [None]:
import os 
import pandas as pd
from IPython.display import display, HTML

# set the output directory for the files written by this notebook;
# it is created here if it does not already exist
outdir = '../../results/geo_plus_cellosaurus/'
os.makedirs(outdir, exist_ok=True)

## Load tracker information

In [None]:
# read the human and mouse tracker sheets from the workbook in one call
tracker_sheets = pd.read_excel('../../results/tracker/HiChIP-Tracker.xlsx',
                               sheet_name=['Human Tracker', 'Mouse Tracker'])
human_tracker = tracker_sheets['Human Tracker']
mouse_tracker = tracker_sheets['Mouse Tracker']
tracker = pd.concat([human_tracker, mouse_tracker], axis=0)

# build the Cellosaurus ID <-> GSM ID lookup: one row per distinct pair,
# skipping rows whose Cellosaurus ID is missing or marked 'Not Available'
cello_to_gsm = tracker[['Cellosaurus ID', 'GSM ID']].drop_duplicates()
has_cellosaurus_id = cello_to_gsm['Cellosaurus ID'].notna() & \
                     (cello_to_gsm['Cellosaurus ID'] != 'Not Available')
cello_to_gsm = cello_to_gsm.loc[has_cellosaurus_id]
cello_to_gsm.columns = ['cellosaurus_id', 'gsm_id']

## Load GEO information

In [None]:
# read the GEO cell type query results
geo = pd.read_table('../../results/geo_celltypes/geo.query.cell_type.2023_02_27.tsv')
display(geo.shape)

# left-join the Cellosaurus ids onto the GEO rows (geo_id <-> gsm_id);
# the `_merge` indicator column records whether a tracker match was found
geo = pd.merge(geo, cello_to_gsm,
               how='left',
               left_on='geo_id',
               right_on='gsm_id',
               indicator=True)
display(geo.shape)

# split into rows with a Cellosaurus id (cell lines) and rows without one
matched_tracker = geo['_merge'] == 'both'
unmatched_tracker = geo['_merge'] == 'left_only'
geo_cell_lines = geo.loc[matched_tracker]
geo_others = geo.loc[unmatched_tracker]

display(geo_cell_lines.shape)
display(geo_others.shape)

## Load Cellosaurus information

In [None]:
# load cellosaurus metadata
cellosaurus = pd.read_table('../../results/cellosaurus/cellosaurus_metadata.tsv')

# rename the columns positionally and prefix them with 'cellosaurus_' so they
# cannot collide with the GEO column names when the tables are merged
cellosaurus_fields = ['id',
                      'cell_line_name',
                      'organ_andor_tissue',
                      'cell_type',
                      'disease',
                      'sex',
                      'age',
                      'category',
                      'species']
cellosaurus.columns = ['cellosaurus_' + field for field in cellosaurus_fields]

# keep only rows whose id starts with 'CVCL'; na=False makes a missing id
# count as "no match" instead of putting NaN into the boolean mask, which
# .loc would reject with an error
is_cvcl = cellosaurus['cellosaurus_id'].str.startswith('CVCL', na=False)
cellosaurus = cellosaurus.loc[is_cvcl]

## Merge GEO and Cellosaurus cell line information

Some information will be replaced with Cellosaurus values, but not all of it.
- Cellosaurus precedence
  - name
  - biomaterial
  - tissue
  - sex
  - age
  - cell type (the GEO value is kept when Cellosaurus reports 'Cell type not specified')
- GEO precedence
  - disease
  - organ

In [None]:
# attach the Cellosaurus annotations to every GEO cell line entry
geo_cl_metadata = pd.merge(geo_cell_lines, cellosaurus, how='left', on='cellosaurus_id')

# Cellosaurus takes precedence for the fields below
geo_cl_metadata['name'] = geo_cl_metadata['cellosaurus_cell_line_name']
# every entry here is a cell line; GEO sometimes labeled these as 'Other'
geo_cl_metadata['biomaterial'] = 'Cell Line'
cellosaurus_overrides = [('tissue', 'cellosaurus_organ_andor_tissue'),
                         ('sex', 'cellosaurus_sex'),
                         ('age', 'cellosaurus_age')]
for geo_col, cello_col in cellosaurus_overrides:
    geo_cl_metadata[geo_col] = geo_cl_metadata[cello_col]

# cell type data includes a check for 'Cell type not specified'
# because some cell lines may have been defined within geo.
# A missing (NaN) Cellosaurus value is treated the same way, so an entry
# without a Cellosaurus record keeps its GEO cell type.
def replace_celltype(sr):
    """Choose the cell type for one merged GEO/Cellosaurus row.

    Parameters
    ----------
    sr : mapping-like row (e.g. a pandas Series)
        Must provide the keys 'cellosaurus_cell_type' and 'celltype'.

    Returns
    -------
    The value of 'cellosaurus_cell_type', unless it is missing or equal to
    'Cell type not specified'; in those cases the value of 'celltype'.
    """
    cello_celltype = sr['cellosaurus_cell_type']
    if pd.isna(cello_celltype) or cello_celltype == 'Cell type not specified':
        return sr['celltype']
    return cello_celltype
# resolve the cell type row by row
geo_cl_metadata['celltype'] = geo_cl_metadata.apply(replace_celltype, axis=1)

# the cellosaurus_* columns are no longer needed; keep only cellosaurus_id
is_cellosaurus_col = geo_cl_metadata.columns.str.startswith('cellosaurus')
is_cellosaurus_id = geo_cl_metadata.columns.str.match('cellosaurus_id')
geo_cl_metadata = geo_cl_metadata.iloc[:, ~is_cellosaurus_col | is_cellosaurus_id]

In [None]:
# preview the first rows of the merged cell line metadata
geo_cl_metadata.head()

## Generate the final metadata files

In [None]:
# stack the entries without a Cellosaurus match and the cell line entries
final_metadata = pd.concat([geo_others, geo_cl_metadata])

# harmonize the sex labels: capitalize them, then map 'Sex unspecified'
# onto 'Undetermined'
final_metadata.loc[:, 'sex'] = (final_metadata['sex']
                                .str.capitalize()
                                .replace({'Sex unspecified': 'Undetermined'}))

# default values used wherever a column is NaN
nan_defaults = {
    'disease': 'N/A',
    'organ': 'Undetermined',
    'tissue': 'Undetermined',
    'celltype': 'Undetermined',
    'sex': 'Undetermined',
    'age': 'Undetermined',
}
for column, default in nan_defaults.items():
    final_metadata.loc[final_metadata[column].isna(), column] = default

# a missing strain becomes 'N/A' for human samples and 'Undetermined'
# for every other organism
is_human = final_metadata['organism'] == "Homo sapiens"
final_metadata.loc[final_metadata['strain'].isna() & is_human, 'strain'] = 'N/A'
final_metadata.loc[final_metadata['strain'].isna() & ~is_human, 'strain'] = 'Undetermined'

# the index is left as produced by the concat (labels from the two inputs
# may repeat); the reset below is disabled
# final_metadata.reset_index(drop=True, inplace=True)

In [None]:
# (rows, columns) of the combined metadata table
final_metadata.shape

In [None]:
# display the combined metadata table for inspection
final_metadata

## Create the celltype insert table

In [None]:
# columns of the celltype insert table, in output order
celltype_cols = [
    'name', 'organism', 'biomaterial',
    'disease', 'organ', 'tissue', 'celltype',
    'strain', 'sex', 'age',
    'cellosaurus_id', 'geo_id',
]

# rows are not de-duplicated here (a .drop_duplicates() call was left disabled)
celltype_insert = final_metadata.loc[:, celltype_cols]

In [None]:
# preview the first rows of the celltype insert table
celltype_insert.head()

#### Check the distribution of metadata values (per column)

In [None]:
# columns whose value distributions are reviewed below
check_cols = [
    'organism', 'biomaterial',
    'disease', 'organ', 'tissue', 'celltype',
    'strain', 'sex', 'age', 'cellosaurus_id',
]

# for each column: an HTML heading followed by its value counts as a table
for col in check_cols:
    heading = HTML('<h1> {} </h1>'.format(col))
    counts = celltype_insert[col].value_counts().to_frame()
    display(heading)
    display(counts)
    display()

In [None]:
# write the celltype insert table as a tab-separated file, without the index
celltype_fn = os.path.join(outdir, 'celltype_insert.with_geo.tsv')
celltype_insert.to_csv(path_or_buf=celltype_fn, index=False, sep='\t')

## Generate the celltype to hichip_sample insert table

We need a way to keep the GSM-to-celltype mapping. It was created using a temporary id, obtained by resetting the index and
saved when deduplicating the celltype table.
1) merge the celltype with hichip_sample_table using gsm_ids
2) save the celltype and hichip_sample UUIDs only and upload with a link

This work will have to be done within the loopcatalog repository: 
https://gitlab.lji.org/ay-lab-team/loopcatalog/-/tree/main/hichip_website/hichip_website/import_data/scripts
