In [2]:
# Example PubMed IDs (PMIDs) used as input for the GEO lookup.
# The raw list repeats a few IDs, so it is de-duplicated right after the
# literal; dict.fromkeys keeps the first occurrence and preserves order.
pmids = [
    "30530648", "31820734", "31018141", "38539015", "33763704", "32572264", "31002671",
    "33309739", "21057496", "27716510", "34059805", "34941412", "33879573", "35440059",
    "33879573", "29462153", "29794063", "25939354", "30322904", "36879017", "35419551",
    "31501549", "23042784", "22219169", "20670891", "20385583", "19723310", "20602769",
    "26566685", "38030723", "30498128", "20485568", "21613409", "31076851", "37169753",
    "39800688", "36510023", "38177678", "36539615", "37871105", "35235788", "27799057",
    "30820472", "31666070", "34686734", "22384383", "34033742", "33589615", "31792364",
    "29576475", "39762647", "32084358", "38977847", "25493933", "31125107", "26749252",
    "39587714", "30333487", "33743111", "35172154", "32025611", "31136284", "26740022",
    "37989753", "39838364", "39367016", "36650381", "35511946", "38641753", "38287646",
    "36840360", "36544018", "36840360", "39637179", "35767948", "31801092", "38909241",
    "36544018", "32384151", "26280576", "38379415", "29550329", "19211887", "36803569",
    "30320226", "35920937", "37958987", "25340342", "37277533", "24223949"
]
pmids = list(dict.fromkeys(pmids))


In [3]:
import requests
from xml.etree import ElementTree

def fetch_geo_ids_bulk(pubmed_ids, timeout=30):
    """Return the GEO DataSets (gds) UIDs linked to the given PubMed IDs.

    All IDs are sent to the NCBI E-utilities ELink endpoint in a single
    request (dbfrom=pubmed, db=gds, retmode=xml) and the XML answer is
    scanned for the linked IDs.

    Args:
        pubmed_ids: Iterable of PubMed IDs; each item is converted with str().
        timeout: Seconds to wait for the HTTP response before requests
            raises a timeout error.

    Returns:
        A list with the text of every ``Link/Id`` element found under a
        ``LinkSetDb`` whose ``DbTo`` is ``gds``. The list is empty when no
        IDs were supplied or when the server does not answer with HTTP 200.

    Raises:
        requests.RequestException: on connection problems or timeouts.
        xml.etree.ElementTree.ParseError: if the response body is not
            well-formed XML.
    """
    joined_ids = ",".join(str(pmid) for pmid in pubmed_ids)
    if not joined_ids:
        # Nothing to look up: avoid sending a request with an empty id list.
        return []

    # Using elink to get all the connected links
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
    query = {
        "dbfrom": "pubmed",
        "db": "gds",
        "id": joined_ids,
        "retmode": "xml",
    }

    # Let requests build and encode the query string; the timeout keeps the
    # cell from hanging forever if the server stops responding.
    response = requests.get(base_url, params=query, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    root = ElementTree.fromstring(response.content)
    return [link.text for link in root.findall(".//LinkSetDb[DbTo='gds']/Link/Id")]

In [15]:
# Look up the GEO DataSets (gds) IDs linked to the example PubMed IDs.
geo_ids = fetch_geo_ids_bulk(pmids)

In [18]:
import re
def get_accession(text):
    """Extract the GEO series accession (``GSE`` + digits) from a text record.

    The text is searched for ``Series``, whitespace, ``Accession:``,
    whitespace and then the accession itself.

    Args:
        text: String to search.

    Returns:
        The accession string of the first match (e.g. ``"GSE249493"``), or
        None when the pattern does not occur in ``text``.
    """
    found = re.search(r"Series\s+Accession:\s+(GSE\d+)", text)
    if found is None:
        return None
    return found.group(1)

In [19]:
# Sanity check: run get_accession on a sample GEO text record; the cell
# output shows the extracted accession ('GSE249493').
get_accession('''1. An atlas of small non-coding RNAs in Human Preimplantation Development [RNA-seq]
(Submitter supplied) Our current understanding of the molecular circuitries that govern early embryogenesis remains limited, particularly in the human. Small non-coding RNAs (sncRNAs) regulate gene expression transcriptionally and post-transcriptionally, however, the expression of specific biotypes and their dynamics during preimplantation development remains to be determined. Using Small-seq, we identified the abundance of and dynamic expression of piRNA, rRNA, snoRNA, tRNA, and miRNA in human embryos from day 3 to 7. more...
Organism:	Homo sapiens
Type:		Expression profiling by high throughput sequencing
Platform: GPL24676 103 Samples
FTP download: GEO (TSV) ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE249nnn/GSE249493/
Series		Accession: GSE249493	ID: 200249493''')

'GSE249493'

In [29]:
def extract_data(page_html):
    """Placeholder extractor: ignores ``page_html`` and returns an empty string.

    Args:
        page_html: Page text to extract from (currently unused).

    Returns:
        Always ``""``.
    """
    # TODO: no extraction logic is implemented yet.
    nothing_extracted = ""
    return nothing_extracted

In [30]:
import time
def data_from_geoid(geo_ids, delay=0.5, timeout=30):
    """Fetch the GEO series page for each GEO id and extract data from it.

    For every id the function:
      1. requests the record from the E-utilities EFetch endpoint (db=gds),
      2. pulls the GSE accession out of the response text with
         ``get_accession``,
      3. requests the GEO ``acc.cgi`` page for that accession, and
      4. appends ``extract_data(page text)`` to the result list.

    Ids whose EFetch request or GEO page request does not return HTTP 200,
    or whose record contains no GSE accession, are reported with ``print``
    and skipped, so the result may be shorter than ``geo_ids``.

    Args:
        geo_ids: Iterable of GEO DataSets ids.
        delay: Seconds to pause between consecutive ids (rate limiting).
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A list with one ``extract_data`` result per successfully fetched page.

    Raises:
        requests.RequestException: on connection problems or timeouts.
    """
    # Loop-invariant endpoints, built once instead of on every iteration.
    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    geo_page_url = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"

    data = []
    for index, geo_id in enumerate(geo_ids):
        # Pause between ids only; no pointless sleep after the last one.
        if index:
            time.sleep(delay)

        response = requests.get(
            efetch_url,
            params={"db": "gds", "id": geo_id, "retmode": "xml"},
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            continue

        accession_code = get_accession(response.text)
        if accession_code is None:
            # Without an accession the page URL would be built with acc=None.
            print(f"Error: no GSE accession found for GEO id {geo_id}")
            continue

        page_response = requests.get(
            geo_page_url, params={"acc": accession_code}, timeout=timeout
        )
        # Check the status of the page request itself, not the EFetch one.
        if page_response.status_code != 200:
            print(f"Error: {page_response.status_code}")
            continue

        data.append(extract_data(page_response.text))

    return data

In [28]:
# Run the per-id fetch for every GEO id; as the last expression of the cell,
# the returned list is shown as the cell output.
data_from_geoid(geo_ids)

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<HTML>
  <HEAD>
    
    <style type="text/css">
      a { text-decoration: none; }
	.bordergc {background-color: #6699CC;}
	.bordergd {background-color: #B6C7E5;}
	.borderge {background-color: #EEF3FB;}
	.bordergf {background-color: #FFFFFF;}
	.bordergg {background-color: #CCCCCC;}
      .small8b { font-size:8pt;
                font-family: ariel,helvetica,sans-serif;
                color:#6633cc;
              }
      .small8db { font-size:8pt;
                font-family: ariel,helvetica,sans-serif;
                color:#4411aa;
              }

    </style>
    <META http-equiv="Content-Type"
      content="text/html; charset=UTF-8">
    <META name="keywords"
      CONTENT="NCBI GEO Gene Expression Omnibus microarray oligonucleotide array SAGE">
    <META name="description"
      content="NCBI's Gene Expression Omnibus (GEO) is a public archive and resource for gene expression data.">

<meta name="ncbi_app" content=

KeyboardInterrupt: 