In [1]:
import os 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import pandas as pd
import numpy as np
from lxml import etree
from Bio import Entrez

# set a dummy email 
Entrez.email = "zjiang@lji.org"



## Query GEO BioSample using the GSM IDs

In [10]:
# open file and read the content in a list
gsm_ids = []
with open('gsm_list.txt', 'r') as fp:
    gsm_ids = [x.strip() for x in fp.readlines()]


In [12]:
gsm_filters = '[All Fields] OR '.join(gsm_ids)
gsm_query = Entrez.esearch(db="biosample", retmax=10000, term=gsm_filters)
gsm_result = Entrez.read(gsm_query)

### Filter query result by looping through XML

#### Definte a dictionary of organs and their synynoms for classification

In [13]:
# order the classifcations from general to specific, so that as the code goes
# down the ordered dict, general classifcations are replaced with specific ones
class_dict = {
    "biomaterial": {
        "cell line": ["cellline", "cell line", "soxgfp", "cell line established from", "hESC"], # pay attention to the last two
        "primary": ["primary", "primi", "attribute_name=\"patient\""]
    },
    "disease": {
        "HCC": [">hcc<"],
        "GIST": ["gist"],
        "Colorectal Adenocarcinoma": ["cw2", ">rko", "sw403", "sw480", "sw620", "sw837", "sw948", "dld1", "hct15", "snu-c5", "hcc116", "ls174t", "colon cancer", "colorectal cancer", "coloO320-dm"],
        "Leukemia": ["leukemia"],
        "Acute myeloid leukemia": ["mll-af9"],
        "Acute monocytic leukemia": ["thp-1"],
        "Acute lymphoblastic leukaemia": [">sem"],
        "Mantle cell lymphoma": ["mantle cell lymphoma"],
        "ALL": [">all<"],
        "Burkitt's lymphoma": ["ramos b"],
        "Non-Hodgkin lymphoma": ["oci-ly7", "sudh4"],
        "Prostate carcinoma": ["prostate cancer", "lncap", "vcap"],
        "Normal": [">fhc"],
        "Gastrointestinal stromal tumour": ["gist"],
        "Mycosis fungoides": ["my-la", "myla"],
        "Breast cancer": ["breast cancer", "t47d"],
        "Invasive Breast Carcinoma": ["hcc1599"],
        "Triple-negative breast cancer": ["triple-negative breast cancer"],
        "ER-positive, PR-positive, HER2-negative breast cancer": ["mcf7"],
        "ER-positive breast cancer": ["mda_mb134vi"],
        "ER-positive, PR-negative and ERBB2-negative breast cancer": ["sum44pe"],
        "Esophageal squamous cancer": ["esophageal squamous cancer"],
        "Non-Small Cell Lung Cancer": [">hcc15<"],
        "Lung squamous cancer": ["lung squamous cancer"],
        "Small cell lung cancer": ["small cell lung cancer"],
        "Head and neck squamous cancer": ["head and neck squamous cancer"],
        "Human endometrial adenocarcinoma": ["human endometrial adenocarcinoma"],
        "Ewing sarcoma": ["ewing sarcoma"],
        "Multiple myeloma": ["multiple myeloma"],
        "Neuroblastoma": ["neuroblastoma", "tr-14", "tr14"],
        "Gastric carcinoma": ["snu16", "snu-16"],
        "Rhabdoid tumor": ["g401"],
        "Alveolar rhabdomyosarcoma": [">rh4<", ">rh3<", ">rh30<"],
        "Embryonal rhabdomyosarcoma": [">sms"],
    },
    # QUESTION: what do I do with ARMS, which has RH3, 30, 4, 41 metasized to different organs
     # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3713458/?
    "organ": {
        "embryo": ["embryo", "mesc", "hesc", "\">e14", "\">esc"],
        "skeletal system": ["bone", "marrow", "ewing sarcoma", "t-all", "myeloma", "plb", "tc71", "\">rh30", "pre-b"],
        "blood": ["vessel", "leukemia", "blood", "ramos b", "haematopoietic", "b-all", "\">sem", "jurkat", "cutll1", "gm12878", "oci-ly7", "thp-1", "hudep1", "hudep-1", "hudep2", "hudep-2", "hudep3", "hudep-3"], # GSM6206927 bone or blood? # Hudep is from umbellical cord blood
        "bodily fluid": ["pleural", "effusion", "sudh4"],
        "skin": ["skin", "epi", "squamous", "breast", "melanoma", "erythematosus", "sle", "keratinocyte", "melanocyte", "nasopharyngeal"], # sle might cause problem
        "mammary gland": ["mammar", "breast", "mda-mb-231"],
        "thyroid": ["thyroid gland", "thyroid"],
        "thymus": ["thymus", "thymic", "treg", "t-reg", "my-la", "myla"], # GSM5680696 thyroid or epithelium
        "lymph node": ["lymph node", "lymphoma", "\">rh3\""], # RH3 is derived from metastatic site, lymph node
        "tonsil": ["tonsil"],
        "liver": ["liver", "hcc"],
        "musculature of body": ["muscula", "soft", "connective", "rhabdomyosarcoma", "angiomatoid fibrous histiocytoma", "\">sms", "a673"],
        "lung": ["lung", "\">rh4"],
        "esophagus": ["esophagus", "esophageal"], # GSM5680726 esophagus or epithelium
        "intestine": ["indestin", "fhc", "sw480"],
        "kidney": ["kidney", "podocyte", "g401"],
        "colon": ["colon", "colorectal", "hct116", "ls174t", ">rko", "coloO320-dm"],
        "spleen": ["spleen"],
        "stomach": ["stomach", "gist", "snu16", "snu-16"],
        "eye": ["retina"],
        "placenta": ["placenta"],
        "pancreas": ["pancrea"],
        "bladder": ["bladder"],
        "uturus": ["umbilical", "uterus", "endometrial", "cervic", "hela"],
        "penis": ["penis"],
        "testis": ["test"],
        "ovary": ["ovar"],
        "nerve": ["neuroblastoma", "nerve", "spinal", "cortex", "\">npc", "tr-14", "tr14"],
        "brain": ["brain", "caudate", "chla-10"],
        "prostate": ["prostate"],
        "heart": ["heart", "cardi", "ventricle", "atrium", "aortic"]
    }
}

In [14]:
# 50 s per 100 GSMs
# Initialize an empty dataframe with the desired column names
cell_type_df = pd.DataFrame(columns=['name', 'organism', 'biomaterial', 'disease',
                                     'organ', 'tissue', 'celltype', 'strain',
                                     'sex', 'age', 'extdb_name', 'extdb_uuid', 'other'])



In [6]:
gsm_id_list = sorted(set(gsm_result["IdList"]))

# Define the base URL for the efetch command
base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'

root = None
intervals = 100
for start in range(0, len(gsm_id_list), intervals):

    end = start + intervals
    print(start, end)

    # define the parameters for the efetch command
    params = {
        'db': 'biosample',
        'id': ','.join(gsm_id_list[start:end]),
        'retmode': 'xml',
        'rettype': 'DocumentSummarySet'
    }

    # send the request to the E-utilities API and get the response
    response = requests.get(base_url, params=params)

    if root == None:
        root = etree.fromstring(response.content)
    else:
        root2 = etree.fromstring(response.content)
        root.extend(root2)


0 100
100 200
200 300
300 400
400 500
500 600
600 700
700 800
800 900
900 1000
1000 1100
1100 1200
1200 1300
1300 1400
1400 1500
1500 1600
1600 1700
1700 1800
1800 1900
1900 2000
2000 2100
2100 2200
2200 2300
2300 2400
2400 2500
2500 2600
2600 2700
2700 2800
2800 2900
2900 3000
3000 3100
3100 3200
3200 3300
3300 3400
3400 3500
3500 3600
3600 3700
3700 3800
3800 3900
3900 4000
4000 4100
4100 4200
4200 4300
4300 4400
4400 4500
4500 4600
4600 4700
4700 4800
4800 4900
4900 5000
5000 5100
5100 5200
5200 5300
5300 5400
5400 5500
5500 5600
5600 5700
5700 5800
5800 5900
5900 6000
6000 6100
6100 6200
6200 6300
6300 6400
6400 6500
6500 6600
6600 6700
6700 6800
6800 6900
6900 7000
7000 7100
7100 7200
7200 7300
7300 7400
7400 7500
7500 7600
7600 7700
7700 7800
7800 7900
7900 8000
8000 8100
8100 8200
8200 8300
8300 8400
8400 8500
8500 8600
8600 8700
8700 8800
8800 8900
8900 9000
9000 9100
9100 9200


In [16]:
num_of_gsm_done = 0

# Iterate through the list of GSM IDs
for doc_sum in root.findall(".//DocumentSummary"):

    sample_data = doc_sum.find('.//SampleData')

    gsm_data = {
        "name":"",
        "organism":"",
        "biomaterial":"",
        "disease": "",
        "organ":"",
        "tissue":"",
        "celltype":"",
        "strain":"",
        "sex":"",
        "age":"",
        "extdb_name":"",
        "extdb_uuid":"",
        "other":""
        }
    celltype_infos = []
    
    # Classify organ and biomaterial by checking synonyms in the SampleData XML text
    # check GSM4956217 and GSM4447223
    data_str = sample_data.text
    for key in class_dict:
        for classification in class_dict[key]:
            synonyms = class_dict[key][classification]
            found = any(syn in data_str.lower() for syn in synonyms)

            # edge cases
            if "\">RD" in data_str:
                gsm_data["organ"] = "musculature of body"

            if "RH3" in data_str:
                gsm_data["organ"] = "exocrine gland"

            if classification == "liver": # sometimes hcc is actually breast, like hcc1599 and hcc19** cell lines
                if "breast" in data_str.lower():
                    gsm_data["organ"] = "breast"

            if found:
                gsm_data[dic] = "|" + classification
                
        gsm_data[dic] = gsm_data[dic].lstrip("|")
    
    
    for elem in sample_data:
        for subelem in elem:

            if ('db\': \'GEO' in str(subelem.attrib)):
                gsm_data["name"] = subelem.text

            if "taxonomy_name" in str(subelem.attrib):
                gsm_data["organism"] = subelem.attrib["taxonomy_name"]

            if ('disease' in str(subelem.attrib) or 'cancer type' in str(subelem.attrib)):
                gsm_data["disease"] = subelem.text # many organ info are from here

            try:
                if (gsm_data["disease"] == "" and \
                    ("oma" in str(sample_data.find("./Attributes/Attribute[@attribute_name='tissue']").text).lower() or \
                    "tumor" in str(sample_data.find("./Attributes/Attribute[@attribute_name='tissue']").text).lower())
                   ):
                    gsm_data["disease"] = sample_data.find("./Attributes/Attribute[@attribute_name='tissue']").text
            except:
                gsm_data["disease"] = gsm_data["disease"] + ""

            if (gsm_data["disease"] == "" and \
                ("tumor" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "tumour" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "normal" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower())
               ):
                    gsm_data["disease"] = sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text
            if ('organ' in str(subelem.attrib)): # no organ attributes at all
                gsm_data["organ"] = subelem.text
            if ('tissue' in str(subelem.attrib) or \
                'brain region' in str(subelem.attrib)
               ):
                gsm_data["tissue"] = subelem.text  # most organ info are from here # can have disease info
            if ('cell_line' in str(subelem.attrib) or \
                'biological sample' in str(subelem.attrib) or \
                'neuronal subtype' in str(subelem.attrib) or \
                '\'cell' in str(subelem.attrib)
               ):
                if ((not str(subelem.text).startswith("BRD")) and (not str(subelem.text).startswith("MGH"))): # these are gene names
                    celltype_infos.append(str(subelem.attrib['attribute_name']) + "-->" + subelem.text)

            if ('parental cell line' in str(subelem.attrib)):
                celltype_infos.append(str(subelem.attrib['attribute_name']) + "-->" + subelem.text)
                gsm_data["biomaterial"] = "cell line"

            if ("mll-" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "bl6" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "hct" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "vcap" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "h9" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                "cell line" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() or \
                ("rim" not in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower() and \
                 "cyte" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower())
               ):
                celltype_infos.append("{source_name}" + "-->" + str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text))
                gsm_data["biomaterial"] = "cell line"

            if ('cell_line' in str(subelem.attrib)):
                if (subelem.text != "--"):
                    gsm_data["biomaterial"] = "cell line"

            if ('strain' in str(subelem.attrib) or 'mouse model' in str(subelem.attrib)):
                gsm_data["strain"] = subelem.text

            if ('p14' in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower()):
                gsm_data["strain"] = str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text)

            if ('gender' in str(subelem.attrib) or \
                '\'sex\'' in str(subelem.attrib)):
                gsm_data["sex"] = subelem.text

            if ("day" in str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text).lower()):
                gsm_data["age"] = str(sample_data.find("./Attributes/Attribute[@attribute_name='source_name']").text)

            if ('\'age\'' in str(subelem.attrib) or \
                'age_day' in str(subelem.attrib)):
                gsm_data["age"] = subelem.text

            if ('\'time' in str(subelem.attrib) or \
                '\'differentiation' in str(subelem.attrib) or \
                'differentiation\'' in str(subelem.attrib) or \
                'developmental stage' in str(subelem.attrib) or \
                ('erythropoiesis' in str(subelem.attrib) and 'no' not in str(subelem.attrib))):
                if gsm_data["age"] == "":
                    gsm_data["age"] = subelem.text
            
    gsm_data["celltype"] = '|'.join(set(celltype_infos))

    other = sample_data.find("./Attributes/Attribute[@attribute_name='source_name']")
    if other is not None:
        gsm_data["other"] = "{source_name}" + "-->" + str(other.text)
    else:
        gsm_data["other"] = ""
    
    cell_type_df = cell_type_df.append(gsm_data, ignore_index=True)
    
    # indicate how many GSM are processed and save the table as the loop runs
    num_of_gsm_done += 1
    if (num_of_gsm_done % 100 == 0):
            print("finished GSM number", num_of_gsm_done)

finished GSM number 100
finished GSM number 200
finished GSM number 300
finished GSM number 400
finished GSM number 500
finished GSM number 600
finished GSM number 700
finished GSM number 800
finished GSM number 900
finished GSM number 1000
finished GSM number 1100
finished GSM number 1200
finished GSM number 1300
finished GSM number 1400
finished GSM number 1500
finished GSM number 1600
finished GSM number 1700
finished GSM number 1800
finished GSM number 1900
finished GSM number 2000
finished GSM number 2100
finished GSM number 2200
finished GSM number 2300
finished GSM number 2400
finished GSM number 2500
finished GSM number 2600
finished GSM number 2700
finished GSM number 2800
finished GSM number 2900
finished GSM number 3000
finished GSM number 3100
finished GSM number 3200
finished GSM number 3300
finished GSM number 3400
finished GSM number 3500
finished GSM number 3600
finished GSM number 3700
finished GSM number 3800
finished GSM number 3900
finished GSM number 4000
finished 

In [17]:
cell_type_df

Unnamed: 0,name,organism,biomaterial,disease,organ,tissue,celltype,strain,sex,age,extdb_name,extdb_uuid,other
0,,,,,primary,,,,,,,,
1,,,,,primary,,,,,,,,
2,,,,,primary,,,,,,,,
3,,,,,primary,,,,,,,,
4,,,,,embryo,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9164,,,,,embryo,,,,,,,,
9165,,,,,embryo,,,,,,,,
9166,,,,,embryo,,,,,,,,
9167,,,,,embryo,,,,,,,,


### Save

In [None]:
# setting the output filename
today = date.today()
date_str = today.strftime("%Y_%m_%d")
now = datetime.now()
time_str = now.strftime("%H_%M")
output = "GEO_Query_cell_type.{}_{}".format(date_str, time_str)
print("output file: ", output)
cell_type_df.to_excel(output + ".xlsx", index=False)

output file:  GEO_Query_cell_type.2022_12_23_18_16


Sample ID: This is the unique identifier for each sample in the database. You can search for samples by their ID using this field.

Organism: This field allows you to search for samples from a specific organism, such as human, mouse, or yeast.

Tissue: This field allows you to search for samples based on their tissue type, such as liver, brain, or heart.

Disease state: This field allows you to search for samples based on their disease state, such as cancer, diabetes, or autoimmune disease.

Cell type: This field allows you to search for samples based on their cell type, such as fibroblasts, macrophages, or stem cells.

Developmental stage: This field allows you to search for samples based on their developmental stage, such as embryonic, fetal, or adult.

Experimental design: This field allows you to search for samples based on the type of experiment they were used in, such as microarray, RNA-seq, or ChIP-seq.