In [1]:
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import re

In [262]:
def get_all_gse_urls(search_result_url, max_results=1e5, base_url="https://www.ncbi.nlm.nih.gov"):
    """Collect GEO 'Accession Display' URLs for every GSE hit of a GDS search.

    Opens ``search_result_url`` in a Firefox WebDriver session, scrapes the
    result links from each page, and follows the "Next" link until there is
    none left, a page yields no GSE links, or ``max_results`` links have been
    gathered.

    Parameters
    ----------
    search_result_url : str
        URL of the search result page to start from.
    max_results : int or float, optional
        Paging stops once at least this many result links were collected.
        The check happens between pages, so the last page scraped is kept
        in full and the total may exceed this value.
    base_url : str, optional
        Site root prepended to the site-relative paths. It used to be read
        from a module-level variable defined in a later cell; passing it in
        keeps the function usable on a fresh kernel.

    Returns
    -------
    list of str
        Unique accession-page URLs, in the order they were first seen.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the first result page shows no result link within 10 seconds.
        The browser is closed before the exception propagates.
    """
    result_link_selector = "a[href^='/gds/']"
    result_link_present = EC.presence_of_element_located(
        (By.CSS_SELECTOR, result_link_selector)
    )

    def extract_urls():
        """Return absolute URLs of the GSE result links on the current page."""
        soup = BeautifulSoup(driver.page_source, "html.parser")
        hrefs = (link.get("href") for link in soup.select(result_link_selector))
        return [
            base_url + href
            for href in hrefs
            if href and href.startswith("/gds/") and "GSE" in href
        ]

    # initialize the Firefox WebDriver
    driver = webdriver.Firefox(service=Service())
    result_urls = []
    try:
        driver.get(search_result_url)

        # wait for the first page to load, then scrape it
        WebDriverWait(driver, 10).until(result_link_present)
        result_urls.extend(extract_urls())

        # loop through the remaining pages
        while len(result_urls) < max_results:
            try:
                # a missing "Next" link raises and ends the loop below
                next_button = driver.find_element(By.CSS_SELECTOR, "a.next")
                # Keep a handle on a link of the *current* page: result links
                # are already present, so waiting only for their presence
                # would return before the next page has replaced them.
                previous_first_link = driver.find_element(
                    By.CSS_SELECTOR, result_link_selector
                )
                next_button.click()

                WebDriverWait(driver, 10).until(EC.staleness_of(previous_first_link))
                WebDriverWait(driver, 10).until(result_link_present)

                page_urls = extract_urls()
                if not page_urls:
                    print("No more results found.")
                    break
                result_urls.extend(page_urls)

                # Add a small delay to avoid overloading the server
                time.sleep(1)

            except Exception as e:
                # Best effort: keep what was collected so far. This is also
                # the normal exit when the last page has no "Next" link.
                print(f"All pages were scaned: {e}")
                break
    finally:
        # Close the browser even when scraping fails part-way through
        driver.quit()

    # A result link looks like ".../gds/?term=GSE12345[Accession]"; the
    # accession is the text between the first '=' and the following '['.
    gse_urls = [
        base_url + f"/geo/query/acc.cgi?acc={x.split('=')[1].split('[')[0]}"
        for x in result_urls
        if '[' in x and '=' in x
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order, so slices
    # of the returned list are stable from run to run (a set is not ordered).
    unique_gse_urls = list(dict.fromkeys(gse_urls))
    print(len(gse_urls))
    print(len(unique_gse_urls))

    return unique_gse_urls


def get_sar_sel_url(gse_url, base_url="https://www.ncbi.nlm.nih.gov"):
    """Return the 'SRA Run Selector' URL linked from a GSE accession page.

    Parameters
    ----------
    gse_url : str
        URL of the page to download and search.
    base_url : str, optional
        Prefix added to the link target when it is not already an absolute
        http(s) URL.

    Returns
    -------
    str or None
        URL of the first anchor whose text matches 'SRA Run Selector', or
        ``None`` when the page has no such anchor (or it carries no href).

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched, including when no response arrives
        within the 30-second timeout.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(gse_url, timeout=30)

    soup = BeautifulSoup(response.text, 'html.parser')
    link = soup.find('a', string=re.compile('SRA Run Selector'))
    if link is None:
        return None

    # Read the attribute itself rather than slicing the tag's string form:
    # that relied on href being the first quoted attribute and returned the
    # HTML-escaped text (e.g. '&amp;' instead of '&').
    href = link.get('href')
    if not href:
        return None
    if href.startswith(('http://', 'https://')):
        return href
    return base_url + href


def get_srr_ids(srr_url):
    """Scrape the link texts of the run table on an SRA Run Selector page.

    Loads ``srr_url`` in a Firefox WebDriver session, waits up to 10 seconds
    for the link in the first table row to appear, then reads one link text
    per row until a row lookup fails.

    Parameters
    ----------
    srr_url : str
        URL of the page to open.

    Returns
    -------
    list of str
        Text of the matched link for rows 1, 2, ... in order (the sample
        run in this notebook yields IDs such as 'SRR31856351').

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the first row's link does not appear within 10 seconds. The
        browser is closed before the exception propagates.
    """
    def row_link_xpath(row):
        # Link inside the 'is_s' cell of the row at 1-based position `row`.
        return f"//tr[{row}]//td[contains(@class, 'is_s')]//div//a"

    driver = webdriver.Firefox()
    srr_ids = []
    try:
        driver.get(srr_url)

        # The table is filled in after page load; wait for its first row.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, row_link_xpath(1)))
        )

        row = 0
        while True:
            row += 1
            try:
                srr_ids.append(driver.find_element(By.XPATH, row_link_xpath(row)).text)
            except Exception:
                # Best effort: the first row that cannot be read (normally
                # the one past the end of the table) ends the scan.
                print("Done retriving all SRR IDs")
                break
    finally:
        # Close the browser on every exit path, including a wait timeout.
        driver.quit()
    return srr_ids

In [264]:
# Root URL of the NCBI website; site-relative paths are appended to it to form absolute URLs.
base_url = "https://www.ncbi.nlm.nih.gov"

In [265]:
# Earlier query without the organism filter, kept for reference:
#search_result_url="https://www.ncbi.nlm.nih.gov/gds/?term=human+single-cell+RNA-seq+10x"
# Active query: the same terms AND-ed with the URL-encoded filter "Homo sapiens"[porgn:__txid9606]
search_result_url="https://www.ncbi.nlm.nih.gov/gds/?term=(human+single-cell+RNA-seq+10x)+AND+%22Homo+sapiens%22%5Bporgn%3A__txid9606%5D"

### Step 1: Get a list of urls to GSE series pages (Search results -> GSE 'Accession Display' pages)
# max_results=1 limits this run to the first result page: the limit is checked
# after that page has been scraped, so paging stops as soon as it yields a link.
gse_urls = get_all_gse_urls(search_result_url, max_results=1)

20
20


In [267]:
### Step 2: Get a list of SRA urls (GSE 'Accession Display' pages -> 'SRA Run Selector' pages)
# Resolve the first four GSE pages only; an entry is None when a page has no Run Selector link
sra_urls = [get_sar_sel_url(gse_url=gse) for gse in gse_urls[:4]]

In [271]:
# Show only the lookups that produced a Run Selector URL
[url for url in sra_urls if url is not None]

['https://www.ncbi.nlm.nih.gov/Traces/study/?acc=PRJNA1204337',
 'https://www.ncbi.nlm.nih.gov/Traces/study/?acc=PRJNA1143054',
 'https://www.ncbi.nlm.nih.gov/Traces/study/?acc=PRJNA988514']

In [270]:
### Step 3: Get a list of SRR IDs ('SRA Run Selector' pages -> SRR IDs)

# Open the SRA Run Selector page
# The URL is hard-coded to the first entry of the Step 2 output rather than
# taken from sra_urls, so this cell scrapes a single project only.
sra_url = 'https://www.ncbi.nlm.nih.gov/Traces/study/?acc=PRJNA1204337'
get_srr_ids(sra_url)

Done retriving all SRR IDs


['SRR31856351', 'SRR31856352']