In [1]:
import os 
import datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import requests
import re

import pandas as pd
pd.set_option('display.max_rows', 500)
import numpy as np
from lxml import etree
from Bio import Entrez
import GEOparse

import logging
logging.getLogger('GEOparse').setLevel(logging.WARNING)

import json
from IPython.display import display, HTML

# set a dummy email 
Entrez.email = "<your-email-address>"

outdir = '../../results/geo_celltypes/'
gsm_dir = os.path.join(outdir, 'individual_gsm/')
os.makedirs(gsm_dir, exist_ok=True)

In this notebook we are leveraging the Entrez library to query the **BioSamples** database which contains 
lots of metadata for the samples we are interested in. We start from the output of `../query_geo_v2.ipynb` which 

## Download the SOFT Report for Every GSM

This step is slow the first time but can easily be rerun once all of the data has been downloaded.

In [2]:
# open file and read the content in a list
gsm_ids = []
with open('../../results/geo_celltypes/gsm_list.human.tracker.txt', 'r') as fp:
    gsm_ids += sorted(set([x.strip() for x in fp.readlines()]))

with open('../../results/geo_celltypes/gsm_list.mouse.tracker.txt', 'r') as fp:
    gsm_ids += sorted(set([x.strip() for x in fp.readlines()]))

gsm_ids = [x for x in gsm_ids if x != 'N/A']

In [3]:
len(gsm_ids)

1092

In [4]:
# download full when avaiable
for gsm_id in gsm_ids:
    try:
        GEOparse.get_GEO_file(gsm_id, destdir=gsm_dir, how='full')
    except OSError as e:
        print(e)

# download brief when full is NOT available
for gsm_id in gsm_ids:
    try:
        gsm_fn = os.path.join(gsm_dir, '{}.txt'.format(gsm_id))
        if not os.path.exists(gsm_fn):
            print('Downloading: {}'.format(gsm_id))
            GEOparse.get_GEO_file(gsm_id, destdir=gsm_dir, how='brief')
    except OSError as e:
        print(e)

In [5]:
# load the gsm data
gsm_data = {}
for gsm_id in gsm_ids:
        gsm_data[gsm_id] = GEOparse.get_GEO(gsm_id, destdir=gsm_dir)

## Get Metadata Based on the GEO Entries

In [6]:
# load a dictionary of terms used to  classify organ, disease, and cell type
# as best as possible
with open('metadata_dictionary.json', 'r') as fp:
    celltype_dict = json.load(fp)

In [7]:
# creating an example dictionary to store the metadata for each GSM entry
empty_data = {'name': np.nan, # got it 
                  'organism': np.nan, # got it 
                  'biomaterial': np.nan, # not the best but ok, got it
                  'celltype': np.nan, # not the best but ok, got it
                  'extdb_name': np.nan, # use this to store the GEO Name
                  'extdb_uuid': np.nan, # use this to store the GEO ID
                  'gsm_id': np.nan, # use this to store the GEO ID explicitly for mapping purposes
                  'sra_id': np.nan,
                  'biosample_id': np.nan,
                  'disease': np.nan, # difficult, don't count on finding it 
                  'organ': np.nan, # difficult, don't count on finding it 
                  'treatment': np.nan, # difficult, don't count on finding it
                  'tissue': np.nan, # extremely difficult, don't count on finding it AT ALL
                  'sex': np.nan, # not found, can I find somewhere else?, don't count on finding it 
                  'age': np.nan, # not found, can I find somewhere else?, don't count on finding it 
                  'race': np.nan, # found from time to time
                  'strain': np.nan} # only for mouse} 

In [8]:
def check_for_celline_in_gsm(gsm_obj, celltype_dict):
    
    # case when None
    if gsm_obj is None:
        return(False)
    
    # found mention of cell line
    if 'characteristics_ch1' in gsm_obj.metadata:
        for x in gsm_obj.metadata['characteristics_ch1']:
            if 'cell line' in x.lower():
                return(True)
    
    # find mention within the title
    title = ' '.join(gsm_obj.metadata['title'])
    title = title.lower()
    for x in celltype_dict['biomaterial']['cell line']:
        if x in title:
            return(True)
    
    # find mention within the source
    source = ' '.join(gsm_obj.metadata['source_name_ch1'])
    source = source.lower()
    for x in celltype_dict['biomaterial']['cell line']:
        if x in source:
            return(True)
    
    return(False)

# helper function to capitalize words
def capitalize_words(s):
    return(' '.join([w.capitalize() for w in s.split()]))

In [9]:
# initialize lists to store the metadata
gsm_celltype_metadata = []

# load the gsm data
for i, gsm_id in enumerate(gsm_data.keys()):
        
    ##### init information #####

    # get the current GSM data
    gsm_obj = gsm_data[gsm_id]
        
    # initialize the dictionary to store the metadata
    gsm_entry_data = empty_data.copy()
    
    ##### defining easily accessible fields #####
    gsm_entry_data['name'] = gsm_obj.metadata['title'][0]
    gsm_entry_data['organism'] = gsm_obj.metadata['organism_ch1'][0]
    gsm_entry_data['disease'] = 'Undetermined'
    gsm_entry_data['tissue'] = 'Undetermined'
    gsm_entry_data['celltype'] = 'Undetermined'
    gsm_entry_data['strain'] = 'Undetermined'
    gsm_entry_data['sex'] = 'Undetermined'
    gsm_entry_data['age'] = 'Undetermined'
    gsm_entry_data['race'] = 'Undetermined'
    gsm_entry_data['extdb_name'] = ''
    gsm_entry_data['extdb_uuid'] = ''
    gsm_entry_data['geo_id'] = gsm_obj.metadata['geo_accession'][0]
    gsm_entry_data['sra_id'] = 'TBD'
    gsm_entry_data['biosample_id'] = 'Not Found' # holder until the merging where it'll become the default
    
    ##### adding the biomaterial #####
    if check_for_celline_in_gsm(gsm_obj, celltype_dict):
        biomaterial = 'Cell Line'
    else:
        biomaterial = 'Primary Sample (Under Review)'
    gsm_entry_data['biomaterial'] = biomaterial
    
    
    ##### identify candidate classes for the organ column #####

    # extract certain metadata for analysis
    gsm_str = ''
    if gsm_obj is not None:
        gsm_str = ' '.join(gsm_obj.metadata['title']) + ' '
        gsm_str += ' '.join(gsm_obj.metadata['source_name_ch1']) + ' '
        gsm_str += ' '.join(gsm_obj.metadata['characteristics_ch1'])
        gsm_str = gsm_str.lower()
        
    # check each term within an organ to assign candidate classes
    celltype_col = 'organ'
    candidate_classes = []
    for curr_class in celltype_dict[celltype_col]:
        synonyms = celltype_dict[celltype_col][curr_class]
        found = any(re.search(r'(?<!-)\b{}'.format(syn), gsm_str) for syn in synonyms) # regex version
        if found:
            candidate_classes.append(curr_class)

    # if there are no candidate classes, then assign as N/A
    if len(candidate_classes) == 0:
        gsm_entry_data[celltype_col] = np.nan      
    # if there is only one candidate class, then use it
    elif len(candidate_classes) == 1:
        gsm_entry_data[celltype_col] = capitalize_words(candidate_classes[0])
    else:
        # if there are multiple candidate classes, then all with commas
        multiple_classes = capitalize_words(', '.join(candidate_classes))
        gsm_entry_data[celltype_col] = multiple_classes     
            
    # append the GSM data to the table    
    gsm_celltype_metadata.append(gsm_entry_data)
        
    # indicate how many GSM are processed and save the table as the loop runs
    if (i % 100 == 0):
        print("Finished GSM index:", i)

Finished GSM index: 0
Finished GSM index: 100
Finished GSM index: 200
Finished GSM index: 300
Finished GSM index: 400


Finished GSM index: 500
Finished GSM index: 600
Finished GSM index: 700
Finished GSM index: 800
Finished GSM index: 900
Finished GSM index: 1000


In [10]:
cols = ['name', 'organism', 'biomaterial', 'disease',
        'organ', 'tissue', 'celltype', 'strain',
        'sex', 'age', 'race', 'extdb_name', 'extdb_uuid', 'geo_id', 'sra_id', 'biosample_id']
gsm_cell_type_df = pd.DataFrame(gsm_celltype_metadata, columns=cols)

In [11]:
gsm_cell_type_df

Unnamed: 0,name,organism,biomaterial,disease,organ,tissue,celltype,strain,sex,age,race,extdb_name,extdb_uuid,geo_id,sra_id,biosample_id
0,HiChIP GM biological replicate 1 technical rep...,Homo sapiens,Cell Line,Undetermined,Immune-associated,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,,,GSM2138324,TBD,Not Found
1,HiChIP GM biological replicate 1 technical rep...,Homo sapiens,Cell Line,Undetermined,Immune-associated,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,,,GSM2138325,TBD,Not Found
2,HiChIP GM biological replicate 2 technical rep...,Homo sapiens,Cell Line,Undetermined,Immune-associated,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,,,GSM2138326,TBD,Not Found
3,HiChIP GM biological replicate 2 technical rep...,Homo sapiens,Cell Line,Undetermined,Immune-associated,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,,,GSM2138327,TBD,Not Found
4,HiChIP-H3K27ac-control sgRNA-rep1,Homo sapiens,Cell Line,Undetermined,Mammary Gland,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,,,GSM2572593,TBD,Not Found
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1087,Foxa2_HiChIP_GW,Mus musculus,Primary Sample (Under Review),Undetermined,Liver,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,,,GSM7148379,TBD,Not Found
1088,"gg8tTAtetOP2, H3K27ac HiChIP, r1",Mus musculus,Primary Sample (Under Review),Undetermined,"Brain, Nose",Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,,,GSM7213981,TBD,Not Found
1089,"gg8tTAtetOP2, H3K27ac HiChIP, r2",Mus musculus,Primary Sample (Under Review),Undetermined,"Brain, Nose",Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,,,GSM7213982,TBD,Not Found
1090,L2-3 H3K27ac HiChIP,Mus musculus,Cell Line,Undetermined,,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,Undetermined,,,GSM7336686,TBD,Not Found


## Extract the BioSample IDs from GSM SOFT Files

In [12]:
# get biosample ids
def get_biosample(gsm_obj):
    biosample_id = None
    
    for x in gsm_obj.metadata['relation']:
        if x.startswith('BioSample'):
            biosample_id = x.split()[1].split('/')[-1]  
    return(biosample_id)

# extract the id and create a version without SAMN prefixes
biosample_ids = [get_biosample(g) for g in gsm_data.values()]
biosample_ids_nosamn = [x.replace('SAMN', '') for x in biosample_ids]

## Query the ESummary API for BioSample Entry

In [13]:
# define the base URL for the efetch command
base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'

# defining errors to catch
empty_result = b'<?xml version="1.0" encoding="UTF-8" ?>\n'
error_results = b'{"error":"error forwarding request","api-key":"8.37.117.239","type":"ip",\n"status":"ok"}'

biosample_tree = None
intervals = 100 # needs to stay 100, this api cannot handle much more than 100
for start in range(0, len(biosample_ids_nosamn), intervals):

    end = start + intervals
    print(start, end)

    # define the parameters for the efetch command
    params = {
        'db': 'biosample',
        'id': ','.join(biosample_ids_nosamn[start:end]),
        'retmode': 'xml',
        'rettype': 'DocumentSummarySet'
    }

    # send the request to the E-utilities API and get the response
    response = requests.get(base_url, params=params)
    
    if response  in [empty_result, '', error_results]:
        print('Count not find data for these samples. Moving on.')
        continue

    if biosample_tree == None:
        biosample_tree = etree.fromstring(response.content)
    else:
        new_tree = etree.fromstring(response.content)

        # join the second XML tree with the main one
        biosample_tree.extend(new_tree)
    

0 100


100 200
200 300
300 400
400 500
500 600
600 700
700 800
800 900
900 1000
1000 1100


## Find Metadata Based on the BioSample Entries

In [14]:
# helper function to extract an XML element or return an empty string
def get_xml_element_or_default(start_element, query, text=True):
    element = start_element.find(query)
    if element is not None:
        if text:
            return(element.text)
        else:
            return(element)
    return(np.nan)

In [15]:
# initialize lists to store the metadata
bs_celltype_metadata = []

# iterate through the list of biosample entries
for i, doc_sum in enumerate(biosample_tree.findall(".//DocumentSummary")):

    # initialize the dictionary to store the metadata
    biosample_data = empty_data.copy()

    ############ Extract Values from ESummary ############
    # transform the XML text into an XML tree, for some reason the "<" and ">"
    # are sometimes not parsed correctly, so we needed to reread the data
    # into a tree

    # extract the SampleData XML element
    sample_data = doc_sum.find('.//SampleData')

    # make sample data tree
    sample_tree = etree.fromstring(bytes(sample_data.text.encode('utf-8')))

    # # testing and printing xml values per sample
    # for elem in sample_tree:
    #     for subelem in elem:
    #         if subelem.tag == 'Attribute':
    #             print(subelem.tag, subelem.attrib, subelem.text)
    #     print()
    # if i > 1000:
    #     break
    # continue

    # extract the Ids for this sample
    ids = sample_tree.find("Ids")
    biosample_data['biosample_id'] = ids.find("Id[@db='BioSample']").text
    
    if ids.find("Id[@db='GEO']") is not None:
        biosample_data['geo_id'] = ids.find("Id[@db='GEO']").text
    else:
        biosample_data['geo_id'] = 'Undetermined' # may be able to determine using GEOparse results
        
    biosample_data['sra_id'] = ids.find("Id[@db='SRA']").text

    # extract the attributes for this sample
    attributes = sample_tree.find("Attributes")

    # obtain the biomaterial
    # first checking the biosample metadata
    # then checking the gsm metadata
    if attributes.find('Attribute[@harmonized_name="cell_line"]') is not None:
        biomaterial = 'Cell Line'
    else:
        biomaterial = 'Primary Sample (Under Review)'
    biosample_data['biomaterial'] = biomaterial

    # obtain the cell type 
    celltype = get_xml_element_or_default(attributes, 'Attribute[@harmonized_name="cell_type"]', text=True)
    biosample_data['celltype'] = celltype

    # obtain the treatment
    treatment = get_xml_element_or_default(attributes, 'Attribute[@harmonized_name="treatment"]', text=True)
    biosample_data['treatment'] = treatment

    # obtain the tissue
    tissue = get_xml_element_or_default(attributes, 'Attribute[@harmonized_name="tissue"]', text=True)
    biosample_data['tissue'] = tissue

    # obtain the sample source (CHECK OUT: not stored yet)
    sample_source = get_xml_element_or_default(attributes, 'Attribute[@harmonized_name="source_name"]', text=True)
    
    # obtain the name/title
    title = sample_tree.find("Description/Title").text
    biosample_data['name'] = title

    # obtain the organism
    organism = sample_tree.find("Description/Organism").get('taxonomy_name')
    biosample_data['organism'] = organism

    strain = get_xml_element_or_default(attributes, 'Attribute[@harmonized_name="strain"]', text=True)
    biosample_data['strain'] = strain

    #organ = sample_tree.find('Attribute [@harmonized_name="cell_type"]').text
    sex = get_xml_element_or_default(attributes, 'Attribute[@harmonized_name="sex"]', text=True)
    biosample_data['sex'] = sex
    
    
    ############ Assign Metadata Based on Dictionary ############
    # attempt to automatically classify 'disease' and 'organ' columns by doing a simple 
    # search of keywords within the whole SampleData XML text
    data_str = sample_data.text.lower()

    # extract the title, attributes and description lowercase strings
    title_str = doc_sum.find("Title").text.lower()
    attributes_str = etree.tostring(attributes).decode().lower()
    description = sample_tree.find("Description")
    description_str = etree.tostring(description).decode().lower()

    # final string to check
    check_str = ' '.join([title_str, attributes_str, description_str])
            
    for celltype_col in  ['disease', 'organ']:

        # identify candidate classes for each meta column
        candidate_classes = []
        for curr_class in celltype_dict[celltype_col]:
            synonyms = celltype_dict[celltype_col][curr_class]
            found = any(re.search(r'(?<!-)\b{}'.format(syn), check_str) for syn in synonyms) # regex version
            # simple version found = any(syn in check_str for syn in synonyms)
            if found:
                candidate_classes.append(curr_class)
        
        # if there are no candidate classes, then assign as N/A
        if len(candidate_classes) == 0:
            biosample_data[celltype_col] = np.nan      
        # if there is only one candidate class, then use it
        elif len(candidate_classes) == 1:
            biosample_data[celltype_col] = capitalize_words(candidate_classes[0])
        else:
            # if there are multiple candidate classes, then all with commas
            multiple_classes = capitalize_words(', '.join(candidate_classes))
            biosample_data[celltype_col] = multiple_classes     
            
    # append the GSM data to the table    
    bs_celltype_metadata.append(biosample_data)
    
    # indicate how many GSM are processed and save the table as the loop runs
    if (i % 100 == 0):
        print("Finished Biosample number:", i)

Finished Biosample number: 0
Finished Biosample number: 100
Finished Biosample number: 200
Finished Biosample number: 300
Finished Biosample number: 400
Finished Biosample number: 500
Finished Biosample number: 600
Finished Biosample number: 700
Finished Biosample number: 800
Finished Biosample number: 900
Finished Biosample number: 1000


In [16]:
cols = ['name', 'organism', 'biomaterial', 'disease',
        'organ', 'tissue', 'celltype', 'strain',
        'sex', 'age', 'race', 'extdb_name', 'extdb_uuid', 'geo_id', 'sra_id', 'biosample_id']
bs_cell_type_df = pd.DataFrame(bs_celltype_metadata, columns=cols)

## Merge and Harmonize the GEO + BioSample Entries

In [17]:
# get unique gsm ids and data
gsm_id_set = set(gsm_cell_type_df.geo_id)
bs_id_set = set(bs_cell_type_df.geo_id)
gsm_only = gsm_id_set.difference(bs_id_set)
only_gsm_cell_type_df = gsm_cell_type_df.loc[gsm_cell_type_df['geo_id'].isin(gsm_only)]

In [18]:
only_gsm_cell_type_df.shape

(12, 16)

In [19]:
# get shared data
shared = gsm_id_set.intersection(bs_id_set)
shared_gsm_cell_type_df = bs_cell_type_df.loc[bs_cell_type_df['geo_id'].isin(shared)]

In [20]:
shared_gsm_cell_type_df.shape

(1080, 16)

In [21]:
# harmonize the organ and biomaterial columns
def harmonize_organ(gsm_id):
    
    full_list = []
    
    # add gsm data
    if type(gsm_tdf.loc[gsm_id].organ) == str:
        gsm_list = gsm_tdf.loc[gsm_id].organ.split(', ')
        full_list.extend(gsm_list)
    
    if type(bs_tdf.loc[gsm_id].organ) == str:
        bs_list = bs_tdf.loc[gsm_id].organ.split(', ')
        full_list.extend(bs_list)
        
    full_list = sorted(set(full_list))
    final_list = ', '.join(full_list)
    return(final_list)

def harmonize_biomaterial(gsm_id):
    
    # get gsm_biomaterial
    gsm_biomaterial = 'Primary Sample (Under Review)'
    if type(gsm_tdf.loc[gsm_id].biomaterial) == str:
        gsm_biomaterial = gsm_tdf.loc[gsm_id].biomaterial
    
    # get bs_biomaterial
    bs_biomaterial = 'Primary Sample (Under Review)'
    if type(bs_tdf.loc[gsm_id].biomaterial) == str:
        bs_biomaterial = bs_tdf.loc[gsm_id].biomaterial
        
    # harmonize
    if gsm_biomaterial == 'Cell Line' or bs_biomaterial == 'Cell Line':
        return('Cell Line')
    else:
        return(gsm_biomaterial)

In [22]:
gsm_tdf = gsm_cell_type_df.set_index('geo_id')
bs_tdf = bs_cell_type_df.set_index('geo_id')

shared_gsm_cell_type_df.loc[:, 'organ'] = shared_gsm_cell_type_df.geo_id.apply(harmonize_organ)
shared_gsm_cell_type_df.loc[:, 'biomaterial'] = shared_gsm_cell_type_df.geo_id.apply(harmonize_biomaterial)

In [23]:
cell_type_df = pd.concat([shared_gsm_cell_type_df, only_gsm_cell_type_df])

## Make Adjustments for the Final DF

#### Set default values for NaN values

In [24]:
cell_type_df.loc[cell_type_df.disease.isna(), 'disease'] = 'N/A'
cell_type_df.loc[cell_type_df.organ.isna(), 'organ'] = 'Undetermined'
cell_type_df.loc[cell_type_df.organ == '', 'organ'] = 'Undetermined'
cell_type_df.loc[cell_type_df.tissue.isna(), 'tissue'] = 'Undetermined'
cell_type_df.loc[cell_type_df.celltype.isna(), 'celltype'] = 'Undetermined'
cell_type_df.loc[(cell_type_df.strain.isna()) & (cell_type_df.organism == "Homo sapiens"), 'strain'] = 'N/A'
cell_type_df.loc[(cell_type_df.strain.isna()) & (cell_type_df.organism != "Homo sapiens"), 'strain'] = 'Undetermined'
cell_type_df.loc[cell_type_df.sex.isna(), 'sex'] = 'Undetermined'
cell_type_df.loc[cell_type_df.age.isna(), 'age'] = 'Undetermined'
cell_type_df.loc[cell_type_df.age.isna(), 'race'] = 'Undetermined'

#### Change Mus to Mus musculus

In [25]:
cell_type_df.loc[:, 'organism'] = cell_type_df.loc[:, 'organism'].str.replace('^Mus$', 'Mus musculus', regex=True)

## Review the Parsing Procedure

In [26]:
review_cols = ['organism',
               'biomaterial',
               'disease',
               'organ',
               'tissue',
               'celltype',
               'strain',
               'sex']
review_cols_simple = ['organism',
               'biomaterial',
               'organ']

In [27]:
for col in review_cols_simple:
    tdf = cell_type_df.value_counts(col)
    display(HTML('<h3>{}</h3>'.format(col)))
    display(tdf.to_frame())

Unnamed: 0_level_0,count
organism,Unnamed: 1_level_1
Homo sapiens,790
Mus musculus,302


Unnamed: 0_level_0,count
biomaterial,Unnamed: 1_level_1
Cell Line,742
Primary Sample (Under Review),350


Unnamed: 0_level_0,count
organ,Unnamed: 1_level_1
"Embryo, Stem-associated",121
Skeletal System,98
Undetermined,92
Immune-associated,89
Heart,60
Brain,55
"Pancreas, Stem-associated",54
Blood,54
Prostate,50
Stem-associated,50


## Checker

In [28]:
# Define the base URL for the efetch command
base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'

# define the parameters for the efetch command
params = {
    'db': 'biosample',
    'id': '17391129', 
    'retmode': 'xml',
    'rettype': 'DocumentSummarySet'
}

# send the request to the E-utilities API and get the response
response = requests.get(base_url, params=params)
check_tree = etree.fromstring(response.content)


In [29]:
sample_data = check_tree.find('.//DocumentSummary/SampleData')

In [30]:
sample_tree = etree.fromstring(bytes(sample_data.text.encode('utf-8')))

In [31]:
etree.dump(sample_tree, pretty_print=True)

<BioSample access="public" publication_date="2021-05-20T00:00:00.000" last_update="2021-05-20T02:19:18.203" submission_date="2021-01-20T15:16:07.327" id="17391129" accession="SAMN17391129">   <Ids>     <Id db="BioSample" is_primary="1">SAMN17391129</Id>     <Id db="SRA">SRS8083433</Id>     <Id db="GEO">GSM5028232</Id>   </Ids>   <Description>     <Title>CD34 HiChIP</Title>     <Organism taxonomy_id="9606" taxonomy_name="Homo sapiens">       <OrganismName>Homo sapiens</OrganismName>     </Organism>   </Description>   <Owner>     <Name>Mullighan, Pathology, St Jude Children's Research Institute</Name>     <Contacts>       <Contact email="chunxu.qu@stjude.org">         <Name>           <First>Chunxu</First>           <Last>Qu</Last>         </Name>       </Contact>     </Contacts>   </Owner>   <Models>     <Model>Generic</Model>   </Models>   <Package display_name="Generic">Generic.1.0</Package>   <Attributes>     <Attribute attribute_name="source_name" harmonized_name="source_name" displ

### Save

In [32]:
# setting the output filename
today = datetime.date.today()
date_str = today.strftime("%Y_%m_%d")
now = datetime.datetime.now()

# set output name (old version is deprecated)
# time_str = now.strftime("%H_%M")
# output = os.path.join(outdir, 'geo.query.cell_type.{}_{}.tsv'.format(date_str, time_str))
output = os.path.join(outdir, 'geo.query.cell_type.{}.tsv'.format(date_str))

print("output file: ", output)
cell_type_df.to_csv(output, index=False, sep='\t')

output file:  ../../results/geo_celltypes/geo.query.cell_type.2024_03_07.tsv


In [35]:
cell_type_df.loc[cell_type_df.name.str.startswith('THP-1')]

Unnamed: 0,name,organism,biomaterial,disease,organ,tissue,celltype,strain,sex,age,race,extdb_name,extdb_uuid,geo_id,sra_id,biosample_id
482,THP-1 CTCF HiChIP CTL N1,Homo sapiens,Cell Line,Leukemia,"Blood, Immune-associated",Undetermined,Monocyte,,Undetermined,Undetermined,,,,GSM5678428,SRS10979744,SAMN22978778
483,THP-1 CTCF HiChIP CTL N2,Homo sapiens,Cell Line,Leukemia,"Blood, Immune-associated",Undetermined,Monocyte,,Undetermined,Undetermined,,,,GSM5678429,SRS10979746,SAMN22978786
484,THP-1 CTCF HiChIP CTL N3,Homo sapiens,Cell Line,Leukemia,"Blood, Immune-associated",Undetermined,Monocyte,,Undetermined,Undetermined,,,,GSM5678430,SRS10979748,SAMN22978790
485,"THP-1 CTCF HiChIP 1,25D N1",Homo sapiens,Cell Line,Leukemia,"Blood, Immune-associated",Undetermined,Monocyte,,Undetermined,Undetermined,,,,GSM5678431,SRS10979750,SAMN22978791
486,"THP-1 CTCF HiChIP 1,25D N2",Homo sapiens,Cell Line,Leukemia,"Blood, Immune-associated",Undetermined,Monocyte,,Undetermined,Undetermined,,,,GSM5678432,SRS10979752,SAMN22978792
487,"THP-1 CTCF HiChIP 1,25D N3",Homo sapiens,Cell Line,Leukemia,"Blood, Immune-associated",Undetermined,Monocyte,,Undetermined,Undetermined,,,,GSM5678433,SRS10979754,SAMN22978793
