In [8]:
import os 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import requests

import pandas as pd
import numpy as np
from lxml import etree
from Bio import Entrez

import json

# set a dummy email 
Entrez.email = "zjiang@lji.org"

## Query GEO BioSample using the GSM IDs

In [9]:
# open file and read the content in a list
gsm_ids = []
with open('../geo_queries/gsm_list.txt', 'r') as fp:
    gsm_ids = [x.strip() for x in fp.readlines()]


In [10]:
gsm_filters = '[All Fields] OR '.join(gsm_ids)
gsm_query = Entrez.esearch(db="biosample", retmax=10000, term=gsm_filters)
gsm_result = Entrez.read(gsm_query)

### Filter query result by looping through XML

#### Define a dictionary of organs and their synynoms for classification

In [11]:
# order the classifcations from general to specific, so that as the code goes
# down the ordered dict, general classifcations are replaced with specific ones

In [12]:
with open('metadata_dictionary.json', 'r') as fp:
    class_dict = json.load(fp)

In [13]:
# Initialize an empty dataframe with the desired column names
cell_type_df = pd.DataFrame(columns=['name', 'organism', 'biomaterial', 'disease',
                                     'organ', 'tissue', 'celltype', 'strain',
                                     'sex', 'age', 'extdb_name', 'extdb_uuid', 'other'])



In [18]:
gsm_id_list = sorted(set(gsm_result["IdList"]))

# Define the base URL for the efetch command
base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'

root = None
intervals = 100
for start in range(0, len(gsm_id_list), intervals):

    end = start + intervals
    print(start, end)

    # define the parameters for the efetch command
    params = {
        'db': 'biosample',
        'id': ','.join(gsm_id_list[start:end]),
        'retmode': 'xml',
        'rettype': 'DocumentSummarySet'
    }

    # send the request to the E-utilities API and get the response
    response = requests.get(base_url, params=params)

    if root == None:
        root = etree.fromstring(response.content)
    else:
        root2 = etree.fromstring(response.content)
        root.extend(root2)


0 100
100 200
200 300
300 400
400 500
500 600
600 700
700 800
800 900
900 1000
1000 1100
1100 1200


In [10]:
num_of_gsm_done = 0

# Iterate through the list of GSM IDs
for doc_sum in root.findall(".//DocumentSummary"):

    sample_data = doc_sum.find('.//SampleData')

    gsm_data = {
        'name': '',
        'organism': '',
        'biomaterial': '',
        'disease': '',
        'organ': '',
        'tissue': '',
        'celltype': '',
        'strain': '',
        'sex': '',
        'age': '',
        'extdb_name': '',
        'extdb_uuid': '',
        'other': ''}
    celltype_infos = []
    
    # Classify organ and biomaterial by checking synonyms in the SampleData XML text
    data_str = sample_data.text
    for key in class_dict:
        for classification in class_dict[key]:
            synonyms = class_dict[key][classification]
            found = any(syn in data_str.lower() for syn in synonyms)

            # edge cases
            if "\">RD" in data_str:
                gsm_data["organ"] = "musculature of body"

            if "RH3" in data_str:
                gsm_data["organ"] = "exocrine gland"

            if classification == "liver": # sometimes hcc is actually breast, like hcc1599 and hcc19** cell lines
                if "breast" in data_str.lower():
                    gsm_data["organ"] = "breast"

            if found:
                gsm_data[key] = "|" + classification
                
        gsm_data[key] = gsm_data[key].lstrip("|")
    
    for elem in sample_data:
        for subelem in elem:

            if ('db\': \'GEO' in str(subelem.attrib)):
                gsm_data["name"] = subelem.text

            if "taxonomy_name" in str(subelem.attrib):
                gsm_data["organism"] = subelem.attrib["taxonomy_name"]

            if ('disease' in str(subelem.attrib) or 'cancer type' in str(subelem.attrib)):
                gsm_data["disease"] = subelem.text # many organ info are from here

            try:
                if (gsm_data["disease"] == "" and \
                    ("oma" in str(sample_data.find("./Attributes/Attribute[@attribute_name='tissue']").text).lower() or \
                    "tumor" in str(sample_data.find("./Attributes/Attribute[@attribute_name='tissue']").text).lower())
                   ):
                    gsm_data["disease"] = sample_data.find("./Attributes/Attribute[@attribute_name='tissue']").text
            except:
                gsm_data["disease"] = gsm_data["disease"] + ""

            if (gsm_data["disease"] == "" and \
                ("tumor" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "tumour" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "normal" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower())
               ):
                    gsm_data["disease"] = sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text
            if ('organ' in str(subelem.attrib)): # no organ attributes at all
                gsm_data["organ"] = subelem.text
            if ('tissue' in str(subelem.attrib) or \
                'brain region' in str(subelem.attrib)
               ):
                gsm_data["tissue"] = subelem.text  # most organ info are from here # can have disease info
            if ('cell_line' in str(subelem.attrib) or \
                'biological sample' in str(subelem.attrib) or \
                'neuronal subtype' in str(subelem.attrib) or \
                '\'cell' in str(subelem.attrib)
               ):
                if ((not str(subelem.text).startswith("BRD")) and (not str(subelem.text).startswith("MGH"))): # these are gene names
                    celltype_infos.append(str(subelem.attrib['attribute_name']) + "-->" + subelem.text)

            if ('parental cell line' in str(subelem.attrib)):
                celltype_infos.append(str(subelem.attrib['attribute_name']) + "-->" + subelem.text)
                gsm_data["biomaterial"] = "cell line"

            if ("mll-" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "bl6" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "hct" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "vcap" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "h9" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "cell line" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                ("rim" not in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() and \
                 "cyte" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower())
               ):
                celltype_infos.append("{source_name}" + "-->" + str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text))
                gsm_data["biomaterial"] = "cell line"

            if ('cell_line' in str(subelem.attrib)):
                if (subelem.text != "--"):
                    gsm_data["biomaterial"] = "cell line"

            if ('strain' in str(subelem.attrib) or 'mouse model' in str(subelem.attrib)):
                gsm_data["strain"] = subelem.text

            if ('p14' in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower()):
                gsm_data["strain"] = str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text)

            if ('gender' in str(subelem.attrib) or \
                '\'sex\'' in str(subelem.attrib)):
                gsm_data["sex"] = subelem.text

            if ("day" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower()):
                gsm_data["age"] = str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text)

            if ('\'age\'' in str(subelem.attrib) or \
                'age_day' in str(subelem.attrib)):
                gsm_data["age"] = subelem.text

            if ('\'time' in str(subelem.attrib) or \
                '\'differentiation' in str(subelem.attrib) or \
                'differentiation\'' in str(subelem.attrib) or \
                'developmental stage' in str(subelem.attrib) or \
                ('erythropoiesis' in str(subelem.attrib) and 'no' not in str(subelem.attrib))):
                if gsm_data["age"] == "":
                    gsm_data["age"] = subelem.text
            
    gsm_data["celltype"] = '|'.join(set(celltype_infos))

    other = sample_data.find("./Attributes/Attribute[@attribute_name='source_name']")
    if other is not None:
        gsm_data["other"] = "{source_name}" + "-->" + str(other.text)
    else:
        gsm_data["other"] = ""
    
    cell_type_df = cell_type_df.append(gsm_data, ignore_index=True)
    
    # indicate how many GSM are processed and save the table as the loop runs
    num_of_gsm_done += 1
    if (num_of_gsm_done % 100 == 0):
            print("finished GSM number", num_of_gsm_done)

finished GSM number 100
finished GSM number 200
finished GSM number 300
finished GSM number 400
finished GSM number 500
finished GSM number 600
finished GSM number 700
finished GSM number 800
finished GSM number 900
finished GSM number 1000
finished GSM number 1100
finished GSM number 1200
finished GSM number 1300
finished GSM number 1400
finished GSM number 1500
finished GSM number 1600
finished GSM number 1700
finished GSM number 1800
finished GSM number 1900
finished GSM number 2000
finished GSM number 2100
finished GSM number 2200
finished GSM number 2300
finished GSM number 2400
finished GSM number 2500
finished GSM number 2600
finished GSM number 2700
finished GSM number 2800
finished GSM number 2900
finished GSM number 3000
finished GSM number 3100
finished GSM number 3200
finished GSM number 3300
finished GSM number 3400
finished GSM number 3500
finished GSM number 3600
finished GSM number 3700
finished GSM number 3800
finished GSM number 3900
finished GSM number 4000
finished 

In [11]:
cell_type_df

Unnamed: 0,name,organism,biomaterial,disease,organ,tissue,celltype,strain,sex,age,extdb_name,extdb_uuid,other
0,,,primary,,,,,,,,,,
1,,,primary,,,,,,,,,,
2,,,primary,,,,,,,,,,
3,,,primary,,,,,,,,,,
4,,,primary,,embryo,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9164,,,primary,,embryo,,,,,,,,
9165,,,primary,,embryo,,,,,,,,
9166,,,primary,,embryo,,,,,,,,
9167,,,primary,,embryo,,,,,,,,


### Save

In [None]:
# setting the output filename
today = date.today()
date_str = today.strftime("%Y_%m_%d")
now = datetime.now()
time_str = now.strftime("%H_%M")
output = "GEO_Query_cell_type.{}_{}".format(date_str, time_str)
print("output file: ", output)
cell_type_df.to_excel(output + ".xlsx", index=False)

output file:  GEO_Query_cell_type.2022_12_23_18_16


Sample ID: This is the unique identifier for each sample in the database. You can search for samples by their ID using this field.

Organism: This field allows you to search for samples from a specific organism, such as human, mouse, or yeast.

Tissue: This field allows you to search for samples based on their tissue type, such as liver, brain, or heart.

Disease state: This field allows you to search for samples based on their disease state, such as cancer, diabetes, or autoimmune disease.

Cell type: This field allows you to search for samples based on their cell type, such as fibroblasts, macrophages, or stem cells.

Developmental stage: This field allows you to search for samples based on their developmental stage, such as embryonic, fetal, or adult.

Experimental design: This field allows you to search for samples based on the type of experiment they were used in, such as microarray, RNA-seq, or ChIP-seq.