In [1]:
import argparse
from Bio import Entrez

import os 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from datetime import date
from datetime import datetime
import numpy as np
import re
from metapub import FindIt
from metapub.convert import pmid2doi
from metapub.convert import doi2pmid
from metapub import PubMedFetcher

import xml.etree.ElementTree as ET
import ssl

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import argparse
from Bio import Entrez

ssl._create_default_https_context = ssl._create_unverified_context



## Query the NCBI database

In [2]:
# use one of the following search queries/filters
# filters = "HiChIP[All Fields] AND \"gse\"[Filter]" # the whole notebook will take around 7 min to run
# filters = "HiChIP[All Fields] AND (\"gse\"[Filter] AND (\"Homo sapiens\"[Organism] OR \"Mus musculus\"[Organism]))
# filters = "HiChIP[All Fields] AND (\"Homo sapiens\"[Organism] AND \"gse\"[Filter])"
# filters = "HiChIP[All Fields] AND (\"Homo sapiens\"[Organism] AND \"published last year\"[Filter]) AND \"gse\"[Filter]" # 2 min
# filters = "HiChIP[All Fields] AND (\"Mus musculus\"[Organism] AND \"gse\"[Filter])"

filters = "HiChIP[All Fields] AND (\"gse\"[Filter] AND (\"Homo sapiens\"[Organism] OR \"Mus musculus\"[Organism]))"

# set a dummy email 
Entrez.email = "zjiang@lji.org"

# query the NCBI database 
search_result = Entrez.esearch(db="gds", retmax=10000, term=filters)
result = Entrez.read(search_result)

# creating a regex to extract pubmed IDs
PubMedIds_pattern = re.compile('[0123456789]+')

In [3]:
# parse the results into a dataframe 
dictionary_lst = []
for Id in result["IdList"]:
    handle = Entrez.esummary(db="gds", id=Id, retmode="xml") # get summary of this entry on GEO datasets
    entry = Entrez.parse(handle)
    for column in entry:
        dictionary_lst.append(column)
df = pd.DataFrame(dictionary_lst)

In [4]:
df.head()

Unnamed: 0,Item,Id,Accession,GDS,title,summary,GPL,GSE,taxon,entryType,...,ExtRelations,n_samples,SeriesTitle,PlatformTitle,PlatformTaxa,SamplesTaxa,PubMedIds,Projects,FTPLink,GEO2R
0,[],200205218,GSE205218,,Auxin-inducible degron 2 system deciphers func...,This SuperSeries is composed of the SubSeries ...,24676;18573,205218,Homo sapiens,GSE,...,[],54,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE205nn...,no
1,[],200201911,GSE201911,,Auxin-inducible degron 2 system deciphers func...,Background: CTCF is a well-established chromat...,18573,201911,Homo sapiens,GSE,...,[],4,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE201nn...,no
2,[],200207828,GSE207828,,Oct1 recruits the histone lysine demethylase U...,The pathways used by cells to transition betwe...,24247,207828,Mus musculus,GSE,...,[],4,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE207nn...,yes
3,[],200160943,GSE160943,,Oct1 recruits the histone lysine demethylase U...,This SuperSeries is composed of the SubSeries ...,24247,160943,Mus musculus,GSE,...,[],32,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE160nn...,yes
4,[],200123645,GSE123645,,Super-enhancer driven MYST family histone lysi...,Nasopharyngeal carcinoma (NPC) is causally lin...,15228;18573,123645,synthetic construct; Homo sapiens,GSE,...,"[{'RelationType': 'SRA', 'TargetObject': 'SRP1...",9,,,,,[],[],ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE123nn...,yes


In [5]:
df.shape

(238, 29)

## Cleaning the Paper Names and Converting from PMID to DOI

In [6]:
dictionary_lst = []
for Id in result["IdList"]:
    
    handle = Entrez.esummary(db="gds", id=Id, retmode="xml")
    entry = Entrez.parse(handle)
    
    for column in entry:
        paper_title = column['title']
        new_paper_title = ''
        last_word = re.split('\s|[.]', paper_title)[-1]
        last_two_words = re.split('\s|[.]', paper_title)[-2:]
        
        # remove ".[HiChIP]", ". [HiChIP]", or "(HiChIP)" in paper titles using steps below
        if ('[' in last_word and ']' in last_word) or \
                ('(' in last_word and ')' in last_word):
            new_paper_title = re.split('\s|[.]', paper_title)[0:-1]
            new_paper_title = ' '.join(new_paper_title)
            new_paper_title = new_paper_title.strip()
            if new_paper_title.endswith('.'): # remove the last period in paper title
                new_paper_title = new_paper_title[:-1]
                
        # remove ".[Hi ChIP]", ". [Hi ChIP]", or "[Bead Array]" in paper titles using steps below
        elif ('[' in last_two_words[0] and ']' in last_two_words[1]) or \
                    ('(' in last_two_words[0] and ')' in last_two_words[1]):
            new_paper_title = re.split('\s|[.]', paper_title)[0:-2]
            new_paper_title = ' '.join(new_paper_title)
            new_paper_title = new_paper_title.strip()
            if new_paper_title.endswith('.'): # remove the last period in paper title
                new_paper_title = new_paper_title[:-1]
                
        elif paper_title.endswith('.'):
            new_paper_title = paper_title[:-1] 
            
        else:
            new_paper_title = paper_title
            
        new_paper_title = new_paper_title.replace("\xa0", " ") # remove no-break space
        column['title'] = new_paper_title
        
        # convert PMID to DOI, some have no PMID so empty string
        string = str(column['PubMedIds'])
        pdf_url = ""
        PubMedIds = PubMedIds_pattern.findall(string)
        
        if len(PubMedIds) > 0:
            DOI = pmid2doi(PubMedIds[0])
            column['PubMedIds'] = DOI
        else: column['PubMedIds'] = ""
        dictionary_lst.append(column)
        
df = pd.DataFrame(dictionary_lst)

## Reformat the Columns for Google Sheet Compatibility

In [7]:
# drop extra columns
df = pd.DataFrame(dictionary_lst)

drop_cols = ['Item', 'Id', 'GDS', 'GPL', 'GSE', 'entryType', 'ptechType',
             'valType', 'SSInfo', 'subsetInfo', 'suppFile', 'Relations', 'ExtRelations',
             'n_samples', 'SeriesTitle', 'PlatformTitle', 'PlatformTaxa', 'SamplesTaxa',
             'Projects', 'FTPLink', 'GEO2R']
df.drop(drop_cols, inplace=True, axis=1)

# rename columns
rename_cols = {"Accession":"GEO / Data link", "PubMedIds":"DOI", "title":"Paper Title",
               "taxon":"Organism", "gdsType":"Any other information", "PDAT":"Year",
               "Samples":"Other matched data"}
df.rename(columns=rename_cols, inplace=True)

# extract just the year
df["Year"] = df["Year"].str[:4]

# add index for merging
df['index'] = np.arange(len(df))

# fill na with empty values
df.fillna("",inplace=True)

# adding missing columns with empty values
df["Journal"] = ""
df["Authors"] = ""
df["Tissue/Cell Line"] = ""
df["Presenter"] = ""
df["Potential HiChIP"] = ""

## Assigning Potential HiChIP Samples

Filter out samples with no HiChIP mention and assigned Yes or Maybe to all others 

In [8]:
col = df["Other matched data"]
lst = []
GSM_IDs = [] ######## new for cell type
index = 0
for row in col:
    
    temp = []
    state = False
    
    # Check for the term HiChIP in the title
    for ele in row: 
        if ("HiChIP".casefold() in ele["Title"].casefold()) or \
                ("Hi-ChIP".casefold() in ele["Title"].casefold()):
            state = True
    
    # keep samples with HiChIP and mark "Yes"
    if state == True: 
        for ele in row:
            GSM_IDs.append(ele["Accession"]) ######## new for cell type
            if ("HiChIP".casefold() in ele["Title"].casefold()) or \
                    ("Hi-ChIP".casefold() in ele["Title"].casefold()):
                temp.append(ele["Accession"] + ": " + ele["Title"])
        temp_str = "\n".join(temp)
        df.at[index, "Potential HiChIP"] = "Yes"

    # otherwise, keep all GSM samples and mark "Maybe"
    else: 
        for ele in row:
            GSM_IDs.append(ele["Accession"]) ######## new for cell type
            temp.append(ele["Accession"] + ": " + ele["Title"])
        temp_str = "\n".join(temp)
        df.at[index,"Potential HiChIP"]="Maybe"
    lst.append(temp_str)
    index += 1
    
df["Other matched data"] = list(lst)

## Merge Rows with the Same Paper Title

In [9]:
aggregation_functions = {'Paper Title':'first', 'DOI':'max', 'Journal':'first', 'Authors':'first', 'Year':'first', 'GEO / Data link':lambda x: '\n'.join(x),
       'Any other information':lambda x: '\n'.join(x), 'Organism':'first', 'Tissue/Cell Line':'first',
       'Potential HiChIP':'first', 'Other matched data':lambda x: '\n'.join(x), 'Presenter':'first'}
df_grouped = df.groupby(df['Paper Title']).aggregate(aggregation_functions)

## Fetch Journal Name and First Author of Each Paper and Add https to DOI

In [10]:
jounrnal_lst = []
authors_lst = []
fetch = PubMedFetcher()
for DOI in df_grouped['DOI']:
    try:
        PMID = doi2pmid(DOI)
        article = fetch.article_by_pmid(PMID)
        jounrnal_lst.append(article.journal)
        authors_lst.append(article.authors[0].split()[0] + " et al.")
    except:
        jounrnal_lst.append("")
        authors_lst.append("")
df_grouped['Journal'] = jounrnal_lst
df_grouped['Authors'] = authors_lst

In [11]:
# add https address to DOI (for easy access)
df_grouped.loc[(df_grouped['DOI'] != ''), 'DOI'] = 'https://doi.org/' + df_grouped.loc[(df_grouped['DOI'] != ''), 'DOI']

## Note the Date Added, Set Output Path, and Save the Final Output File

In [12]:
# make an output directory
######## new for cell type
outdir = ''
# outdir = 'results/hichip_db/'
# os.makedirs(outdir, exist_ok=True)

In [13]:
# determining the current year, month and day
today = date.today()
date_str = today.strftime("%Y_%m_%d")

# determining current hour and minute
now = datetime.now()
time_str = now.strftime("%H_%M")

# setting the output filename
output = os.path.join(outdir, "GEO_Query.{}_{}".format(date_str, time_str))

In [14]:
# add a column that notes the added date
df_grouped['Date Added'] = date_str.replace('_', '-')
reorder = ['Paper Title', 'Journal', 'Authors', 'Year', 'DOI', 'GEO / Data link', 'Any other information',
           'Organism', 'Tissue/Cell Line', 'Potential HiChIP', 'Other matched data', 'Presenter', 'Date Added']
df_grouped = df_grouped.loc[:, reorder]

In [15]:
df_grouped.to_excel(output+".xlsx", index=False)

## Save the GSM IDs line by line

In [16]:
GSM_IDs = list(set(GSM_IDs))
with open(r'gsm_list.txt', 'w') as fp:
    for gsm in GSM_IDs:
        # write each item on a new line
        fp.write("%s\n" % gsm)

## Query GEO BioSample using the GSM IDs

In [20]:
GSM_IDs_from_file = []
# open file and read the content in a list
with open(r'gsm_list.txt', 'r') as fp:
    for line in fp:
        # remove linebreak from a current name
        # linebreak is the last character of each line
        gsm = line[:-1]

        # add current item to the list
        GSM_IDs_from_file.append(gsm)

GSM_filters = ' [All Fields] OR '.join(GSM_IDs_from_file[0:1000])
GSM_filters

'GSM5066603 [All Fields] OR GSM4027536 [All Fields] OR GSM5269989 [All Fields] OR GSM2572636 [All Fields] OR GSM3059347 [All Fields] OR GSM5029775 [All Fields] OR GSM3103921 [All Fields] OR GSM3930325 [All Fields] OR GSM4214149 [All Fields] OR GSM5533509 [All Fields] OR GSM5269818 [All Fields] OR GSM5270760 [All Fields] OR GSM4513989 [All Fields] OR GSM6528198 [All Fields] OR GSM5746056 [All Fields] OR GSM4613476 [All Fields] OR GSM3263143 [All Fields] OR GSM5746007 [All Fields] OR GSM3376557 [All Fields] OR GSM2572625 [All Fields] OR GSM4214139 [All Fields] OR GSM4333041 [All Fields] OR GSM2816631 [All Fields] OR GSM4441813 [All Fields] OR GSM4551849 [All Fields] OR GSM4061152 [All Fields] OR GSM5066600 [All Fields] OR GSM4836043 [All Fields] OR GSM3426275 [All Fields] OR GSM6318728 [All Fields] OR GSM3229488 [All Fields] OR GSM3018513 [All Fields] OR GSM6528183 [All Fields] OR GSM5777101 [All Fields] OR GSM4851920 [All Fields] OR GSM5270747 [All Fields] OR GSM5533469 [All Fields] OR 

In [21]:
Entrez.email = "zjiang@lji.org"

search_result = Entrez.esearch(db="biosample", retmax=10000, term=GSM_filters)
result = Entrez.read(search_result)

### Filter query result by looping through XML

In [None]:
name
organism
biomaterial
disease
organ
celltype
sex
age
extdb_name
extdb_uuid

In [22]:
cell_type_df = pd.DataFrame(columns=['GEO', 'organism', 'biomaterial', 'disease', 'organ',
                                     'tissue', 'celltype', 'strain', 'sex', 'age',
                                     'genotype', 'treatment', 'antibody', 'extdb_name',
                                     'extdb_uuid', 'other'])

for Id in result["IdList"]:
    handle = Entrez.esummary(db="biosample", id=Id, retmode="xml") # get summary of this entry on GEO datasets
    entry = Entrez.read(handle)
    record = entry["DocumentSummarySet"]["DocumentSummary"][0]["SampleData"]
    myroot = ET.fromstring(record)
    gsm_data = {
        "GEO":"",
        "organism":"",
        "biomaterial":"",
        "disease": "",
        "organ":"",
        "tissue":"",
        "celltype":"",
        "strain":"",
        "sex":"",
        "age":"",
        "genotype":"",
        "treatment":"",
        "antibody":"",
        "extdb_name":"",
        "extdb_uuid":"",
        "other":[]
        }

    for elem in myroot:
        for subelem in elem:
            if ('db\': \'GEO' in str(subelem.attrib)):
                gsm_data["GEO"] = subelem.text
            if "taxonomy_name" in str(subelem.attrib):
                gsm_data["organism"] = subelem.attrib["taxonomy_name"]
            if ('source_name' in str(subelem.attrib)):
                gsm_data["biomaterial"] = subelem.text
            if ('cell_type' in str(subelem.attrib)):
                gsm_data["tissue"] = subelem.text
            if ('cell_line' in str(subelem.attrib)):
                gsm_data["celltype"] = subelem.text
            if ('strain' in str(subelem.attrib)):
                gsm_data["strain"] = subelem.text
            if ('age' in str(subelem.attrib)):
                gsm_data["age"] = subelem.text
            if ('genotype' in str(subelem.attrib)):
                gsm_data["genotype"] = subelem.text
            if ('treatment' in str(subelem.attrib)):
                gsm_data["treatment"] = subelem.text
            if ('antibody' in str(subelem.attrib)):
                gsm_data["antibody"] = subelem.text
            if (len(str(subelem.attrib)) > 2 and \
                'url' not in str(subelem.attrib) and \
                'BioSample' not in str(subelem.attrib) and \
                'SRA' not in str(subelem.attrib) and \
                'bioproject' not in str(subelem.attrib) and \
                'db\': \'GEO' not in str(subelem.attrib) and \
                'taxonomy_name' not in str(subelem.attrib) and \
                'source_name' not in str(subelem.attrib) and \
                'cell_type' not in str(subelem.attrib) and \
                'cell_line' not in str(subelem.attrib) and \
                'strain' not in str(subelem.attrib) and \
                'age' not in str(subelem.attrib) and \
                'genotype' not in str(subelem.attrib) and \
                'treatment' not in str(subelem.attrib) and \
                'antibody' not in str(subelem.attrib)
               ):
                gsm_data["other"].append(str(subelem.attrib) + "-->" + subelem.text)
    gsm_data["other"] = '|'.join(gsm_data["other"])
    cell_type_df = cell_type_df.append(gsm_data, ignore_index=True)
    
    
# 	GSM4027555 and more
#     <Attribute attribute_name="type">Tumor</Attribute>
# 		<Attribute attribute_name="relapse">No</Attribute>
# 		<Attribute attribute_name="gender" harmonized_name="sex" display_name="sex">male</Attribute>
# 		<Attribute attribute_name="age_days">2338</Attribute> or timepoint
# 		<Attribute attribute_name="Stage">4</Attribute>

# GSM6528200 and more
# <Attribute attribute_name="disease state" harmonized_name="disease" display_name="disease">HCC</Attribute>
# <Attribute attribute_name="tissue" harmonized_name="tissue" display_name="tissue">tumour</Attribute>

# GSM3930313 and more
# {'attribute_name': 'biological sample'}-->FHC should replace celltype

SyntaxError: invalid syntax (2181387067.py, line 16)

In [None]:
# cell_type_df = pd.DataFrame(columns=["GEO", 'organism', 'biomaterial', 'tissue', 'celltype', 'strain', 'age', 'genotype', 'treatment', 'antibody', 'other'])

# for Id in result["IdList"]:
#     handle = Entrez.esummary(db="biosample", id=Id, retmode="xml") # get summary of this entry on GEO datasets
#     entry = Entrez.read(handle)
#     record = entry["DocumentSummarySet"]["DocumentSummary"][0]["SampleData"]
#     myroot = ET.fromstring(record)
#     gsm_data = {
#         "GEO":"",
#         "organism":"",
#         "biomaterial":"",
#         "tissue":"",
#         "celltype":"",
#         "strain":"",
#         "age":"",
#         "genotype":"",
#         "treatment":"",
#         "antibody":"",
#         "other":[]
#         }

#     for elem in myroot:
#         for subelem in elem:
#             if ('db\': \'GEO' in str(subelem.attrib)):
#                 gsm_data["GEO"] = subelem.text
#             if "taxonomy_name" in str(subelem.attrib):
#                 gsm_data["organism"] = subelem.attrib["taxonomy_name"]
#             if ('source_name' in str(subelem.attrib)):
#                 gsm_data["biomaterial"] = subelem.text
#             if ('cell_type' in str(subelem.attrib)):
#                 gsm_data["tissue"] = subelem.text
#             if ('cell_line' in str(subelem.attrib)):
#                 gsm_data["celltype"] = subelem.text
#             if ('strain' in str(subelem.attrib)):
#                 gsm_data["strain"] = subelem.text
#             if ('age' in str(subelem.attrib)):
#                 gsm_data["age"] = subelem.text
#             if ('genotype' in str(subelem.attrib)):
#                 gsm_data["genotype"] = subelem.text
#             if ('treatment' in str(subelem.attrib)):
#                 gsm_data["treatment"] = subelem.text
#             if ('antibody' in str(subelem.attrib)):
#                 gsm_data["antibody"] = subelem.text
#             if (len(str(subelem.attrib)) > 2 and \
#                 'url' not in str(subelem.attrib) and \
#                 'BioSample' not in str(subelem.attrib) and \
#                 'SRA' not in str(subelem.attrib) and \
#                 'bioproject' not in str(subelem.attrib) and \
#                 'db\': \'GEO' not in str(subelem.attrib) and \
#                 'taxonomy_name' not in str(subelem.attrib) and \
#                 'source_name' not in str(subelem.attrib) and \
#                 'cell_type' not in str(subelem.attrib) and \
#                 'cell_line' not in str(subelem.attrib) and \
#                 'strain' not in str(subelem.attrib) and \
#                 'age' not in str(subelem.attrib) and \
#                 'genotype' not in str(subelem.attrib) and \
#                 'treatment' not in str(subelem.attrib) and \
#                 'antibody' not in str(subelem.attrib)
#                ):
#                 gsm_data["other"].append(str(subelem.attrib) + "-->" + subelem.text)
#     gsm_data["other"] = '|'.join(gsm_data["other"])
#     cell_type_df = cell_type_df.append(gsm_data, ignore_index=True)
    
    
# # 	GSM4027555 and more
# #     <Attribute attribute_name="type">Tumor</Attribute>
# # 		<Attribute attribute_name="relapse">No</Attribute>
# # 		<Attribute attribute_name="gender" harmonized_name="sex" display_name="sex">male</Attribute>
# # 		<Attribute attribute_name="age_days">2338</Attribute> or timepoint
# # 		<Attribute attribute_name="Stage">4</Attribute>

# # GSM6528200 and more
# # <Attribute attribute_name="disease state" harmonized_name="disease" display_name="disease">HCC</Attribute>
# # <Attribute attribute_name="tissue" harmonized_name="tissue" display_name="tissue">tumour</Attribute>

# # GSM3930313 and more
# # {'attribute_name': 'biological sample'}-->FHC should replace celltype

### Filter query result using root.find()

In [None]:
# root.find("./Ids/Id[@db='GEO']" too specific, some attributes have a slightly different synynom as name
# cell_type_df = pd.DataFrame(columns=["GEO", 'organism', 'biomaterial', 'tissue', 'celltype', 'strain', 'age', 'genotype', 'treatment', 'antibody', 'other'])

# for Id in result["IdList"]:
#     handle = Entrez.esummary(db="biosample", id=Id, retmode="xml") # get summary of this entry on GEO datasets
#     entry = Entrez.read(handle)
#     record = entry["DocumentSummarySet"]["DocumentSummary"][0]["SampleData"]
#     root = ET.fromstring(record)
#     gsm_data = {
#         "GEO":"",
#         "organism":"",
#         "biomaterial":"",
#         "tissue":"",
#         "celltype":"",
#         "strain":"",
#         "age":"",
#         "genotype":"",
#         "treatment":"",
#         "antibody":"",
#         "other":[]
#         }
    
#     try:
#         gsm_data["GEO"] = root.find("./Ids/Id[@db='GEO']").text
#     except:
#         gsm_data["GEO"] = ""
        
        
#     try:
#         gsm_data["organism"] = root.find("./Description/Organism/OrganismName").text
#     except:
#         gsm_data["organism"] = ""
        
#     try:
#         gsm_data["biomaterial"] = root.find("./Attributes/Attribute[@attribute_name='source_name']").text
#     except:
#         gsm_data["biomaterial"] = ""
        
#     try:
#         gsm_data["tissue"] = root.find("./Attributes/Attribute[@attribute_name='cell type']").text
#     except:
#         gsm_data["tissue"] = ""
        
        
#     try:
#         gsm_data["celltype"] = root.find("./Attributes/Attribute[@attribute_name='cell line']").text
#     except:
#         gsm_data["celltype"] = ""
        
#     try:
#         gsm_data["strain"] = root.find("./Attributes/Attribute[@attribute_name='strain']").text
#     except:
#         gsm_data["strain"] = ""
        
#     try:
#         gsm_data["age"] = root.find("./Attributes/Attribute[@attribute_name='age']").text
#     except:
#         gsm_data["age"] = ""
        
#     try:
#         gsm_data["genotype"] = root.find("./Attributes/Attribute[@attribute_name='genotype']").text
#     except:
#         gsm_data["genotype"] = ""
        
#     try:
#         gsm_data["treatment"] = root.find("./Attributes/Attribute[@attribute_name='treatment']").text
#     except:
#         gsm_data["treatment"] = ""
        
#     try:
#         gsm_data["antibody"] = root.find("./Attributes/Attribute[@attribute_name='chip antibody']").text
#     except:
#         gsm_data["antibody"] = ""
    
#     for elem in myroot:
#         for subelem in elem:
#             if (len(str(subelem.attrib)) > 2 and \
#                 'url' not in str(subelem.attrib) and \
#                 'BioSample' not in str(subelem.attrib) and \
#                 'SRA' not in str(subelem.attrib) and \
#                 'bioproject' not in str(subelem.attrib) and \
#                 'db\': \'GEO' not in str(subelem.attrib) and \
#                 'taxonomy_name' not in str(subelem.attrib) and \
#                 'source_name' not in str(subelem.attrib) and \
#                 'cell_type' not in str(subelem.attrib) and \
#                 'cell_line' not in str(subelem.attrib) and \
#                 'strain' not in str(subelem.attrib) and \
#                 'age' not in str(subelem.attrib) and \
#                 'genotype' not in str(subelem.attrib) and \
#                 'treatment' not in str(subelem.attrib) and \
#                 'antibody' not in str(subelem.attrib)
#                ):
#                 gsm_data["other"].append(str(subelem.attrib) + "-->" + subelem.text)
#     gsm_data["other"] = '|'.join(gsm_data["other"])
    
#     cell_type_df = cell_type_df.append(gsm_data, ignore_index=True)

### Save query result filtered table

In [None]:
cell_type_df.to_excel("test2.xlsx", index=False)

Sample ID: This is the unique identifier for each sample in the database. You can search for samples by their ID using this field.

Organism: This field allows you to search for samples from a specific organism, such as human, mouse, or yeast.

Tissue: This field allows you to search for samples based on their tissue type, such as liver, brain, or heart.

Disease state: This field allows you to search for samples based on their disease state, such as cancer, diabetes, or autoimmune disease.

Cell type: This field allows you to search for samples based on their cell type, such as fibroblasts, macrophages, or stem cells.

Developmental stage: This field allows you to search for samples based on their developmental stage, such as embryonic, fetal, or adult.

Experimental design: This field allows you to search for samples based on the type of experiment they were used in, such as microarray, RNA-seq, or ChIP-seq.

### GEO BioSample query result for one GSM

In [None]:
Entrez.email = "your_email@example.com"  # Provide your email address

# Retrieve all the BioSample records
handle = Entrez.esearch(db="biosample", term="GSM5029770")
records = Entrez.read(handle)

# Iterate through all the records and print the fields and values
for record in records["IdList"]:
    handle = Entrez.esummary(db="biosample", id=record)
    record = Entrez.read(handle)
    data = record["DocumentSummarySet"]["DocumentSummary"][0]["SampleData"]
    print(data, "\n")
    myroot = ET.fromstring(data)
    for elem in myroot:
        for subelem in elem:
            if "taxonomy_name" in str(subelem.attrib):
                print(subelem.attrib, "-->", subelem.attrib["taxonomy_name"])
            else:
                print(subelem.attrib, "-->", subelem.text)

### Dict of organs and their synynoms for classification

In [None]:
# organ names are based on ENCODE > data > experiment matrix and common organ names returned by ChatGPT
dic = {
    "blood": ["vessel"],
    "blodily fluid": [],
    "epithelium": ["skin", "epi"],
    "brain": [],
    "endocrine gland": ["thyroid gland"],
    "exocrine gland": ["lymph node"],
    "liver": [],
    "heart": [],
    "embryo": [],
    "connective tissue": [],
    "musculature of body": ["muscul"],
    "lung": [],
    "intestine": [],
    "kidney": [],
    "bone marrow": [],
    "colon": [],
    "spleen": [],
    "stomach": [],
    "placenta": [],
    "pancreas": [],
    "bladder": [],
    "reproductive organs": ["uterus", "penis", "ovar", "test"],
}
# nerves?