In [2]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException # To catch driver-level errors
from selenium.common.exceptions import WebDriverException, TimeoutException as SeleniumTimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
import time
import os
import re
from bs4 import BeautifulSoup, Tag, NavigableString
from fake_useragent import UserAgent
import pandas as pd 
from urllib.parse import urljoin, urlparse, parse_qs, urlencode 

In [3]:
ua = UserAgent()

In [6]:
SUBJECTS_LIST =['https://www.cell.com/heliyon/ear-nose-throat',
 'https://www.cell.com/heliyon/earth-science',
 'https://www.cell.com/heliyon/economics',
 'https://www.cell.com/heliyon/education',
 'https://www.cell.com/heliyon/emergency-medicine',
 'https://www.cell.com/heliyon/endocrinology',
 'https://www.cell.com/heliyon/energy',
 'https://www.cell.com/heliyon/engineering',
 'https://www.cell.com/heliyon/environment',
 'https://www.cell.com/heliyon/evolution-ecology',
 'https://www.cell.com/heliyon/finance',
 'https://www.cell.com/heliyon/food-science-and-nutrition',
 'https://www.cell.com/heliyon/gastroenterology',
 'https://www.cell.com/heliyon/genetics',
 'https://www.cell.com/heliyon/haematology',
 'https://www.cell.com/heliyon/hepatology',
 'https://www.cell.com/heliyon/immunology',
 'https://www.cell.com/heliyon/infectious-diseases']
PAGE_SIZE = 100
BASE_URL = 'https://www.cell.com'

In [5]:
def initialize_uc_driver(headless=False, chrome_version_major=134, page_load_timeout_seconds=60): # Added timeout
    print("DEBUG: Initializing driver...")
    options = uc.ChromeOptions()
    if headless:
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = uc.Chrome(options=options, version_main=chrome_version_major, use_subprocess=True)
    try:
        if driver: # Set timeout only if driver was initialized
            driver.set_page_load_timeout(page_load_timeout_seconds)
            print(f"DEBUG: Driver initialized. Page load timeout set to {page_load_timeout_seconds}s.")
    except Exception as e:
        print(f"CRITICAL: Driver initialization failed: {e}")
    return driver


In [7]:
def build_subject_page_url(base_subject_url, page_number, page_size):
    parsed_url = urlparse(base_subject_url)
    query_params = parse_qs(parsed_url.query)

        
    query_params['startPage'] = [str(page_number)]
    query_params['pageSize'] = [str(page_size)]

    # Rebuild url
    new_query_string = urlencode(query_params, doseq=True)
    return parsed_url._replace(query=new_query_string).geturl()

    

In [9]:
def get_article_hrefs_from_page(page_html_source):
    soup = BeautifulSoup(page_html_source, "lxml")
    main_content = soup.find(class_='rlist search-result__body items-results items-results--articles')
    href_list = []
    abs_href_for_append = None 

    title_h4_tags = main_content.find_all(class_='meta__title')

    for h4_tag in title_h4_tags:
        link_tag = h4_tag.find('a', class_='full-title-multi-search')
        if link_tag and link_tag.has_attr('href'):
            relative_href = link_tag.get('href')
            if relative_href:
                abs_href_for_append = urljoin(BASE_URL, relative_href)
                href_list.append(abs_href_for_append)
        #     else:
        #         print(f"    Found <a> tag with empty href: {h4_tag.get_text(strip=True)}")
        # else:
        #     print(f"    No valid <a> tag found in h4: {h4_tag.get_text(strip=True)}")
    return href_list



In [10]:
def get_total_pages(page_html_source):
    soup = BeautifulSoup(page_html_source, "lxml")
    pagination_info_tag = soup.find(class_='current-page') 
    if pagination_info_tag:
        text = pagination_info_tag.get_text(strip=True) 
        match = re.search(r'of\s*(\d+)', text, re.IGNORECASE)
        if match:
            try:
                total_pages = int(match.group(1))
                return total_pages
            except ValueError:
                print(f"    Warning: Could not parse total pages from text: '{text}'")

    else:
        print("    Warning: Pagination info tag (class 'current-page') not found. Assuming 1 page.")
        main_content_check = soup.find(class_='rlist search-result__body items-results items-results--articles')
        if main_content_check and main_content_check.find(class_='meta__title'):
             return 1
        else:
             return 0 

    return 1



In [45]:
driver = initialize_uc_driver(headless=False)

KeyboardInterrupt: 

In [39]:
all_collected_article_hrefs = {}
for subject_base_url in SUBJECTS_LIST[0:2]:
    subject_name = subject_base_url.split('/')[-1] 
    all_collected_article_hrefs[subject_name] = []
    current_subject_hrefs = []  
    page_zero_url = build_subject_page_url(subject_base_url, 0, PAGE_SIZE)
    print(f"  Navigating to initial page: {page_zero_url}")

    driver.get(page_zero_url)
    time.sleep(2) 
    page_html = driver.page_source

    hrefs_on_page_zero = get_article_hrefs_from_page(page_html)
    current_subject_hrefs.extend(hrefs_on_page_zero)
    print(f"    Scraped {len(hrefs_on_page_zero)} hrefs from page 0.")

    total_pages_for_subject = get_total_pages(page_html)
    print(f"    Detected {total_pages_for_subject} total pages for this subject.")


    for page_num in range(1, total_pages_for_subject):
        print(f"  Processing page {page_num + 1}/{total_pages_for_subject} (startPage={page_num})")
        next_page_url = build_subject_page_url(subject_base_url, page_num, PAGE_SIZE)
        print(f"    Navigating to: {next_page_url}")

        driver.get(next_page_url)
        time.sleep(2)

        page_html_next = driver.page_source

        hrefs_on_next_page = get_article_hrefs_from_page(page_html_next)
        current_subject_hrefs.extend(hrefs_on_next_page)
        print(f"      Scraped {len(hrefs_on_next_page)} hrefs.")
        time.sleep(1) # Politeness delay

    all_collected_article_hrefs[subject_name] = list(set(current_subject_hrefs)) # Store unique hrefs
    print(f"  Finished subject {subject_name}. Total unique hrefs collected: {len(all_collected_article_hrefs[subject_name])}")

  Navigating to initial page: https://www.cell.com/heliyon/ear-nose-throat?ContentItemType=fla&startPage=0&pageSize=100
    Scraped 27 hrefs from page 0.
    Detected 1 total pages for this subject.
  Finished subject ear-nose-throat. Total unique hrefs collected: 27
  Navigating to initial page: https://www.cell.com/heliyon/earth-science?ContentItemType=fla&startPage=0&pageSize=100
    Scraped 100 hrefs from page 0.
    Detected 7 total pages for this subject.
  Processing page 2/7 (startPage=1)
    Navigating to: https://www.cell.com/heliyon/earth-science?ContentItemType=fla&startPage=1&pageSize=100
      Scraped 100 hrefs.
  Processing page 3/7 (startPage=2)
    Navigating to: https://www.cell.com/heliyon/earth-science?ContentItemType=fla&startPage=2&pageSize=100
      Scraped 100 hrefs.
  Processing page 4/7 (startPage=3)
    Navigating to: https://www.cell.com/heliyon/earth-science?ContentItemType=fla&startPage=3&pageSize=100
      Scraped 100 hrefs.
  Processing page 5/7 (startPa

In [40]:
all_collected_article_hrefs

{'ear-nose-throat': ['https://www.cell.com/heliyon/fulltext/S2405-8440(24)17424-5',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(24)10131-4',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(23)10909-1',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(22)01878-3',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(22)02029-1',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(25)01046-1',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(23)05063-6',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(22)00243-2',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(23)06130-3',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(22)00008-1',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(24)14813-X',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(24)11768-9',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(24)05701-3',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(22)00005-6',
  'https://www.cell.com/heliyon/fulltext/S2405-8440(22)03258-3',
  'htt

In [46]:
def run_pipeline():
    driver = initialize_uc_driver(headless=False)


    all_collected_article_hrefs = {}
    for subject_base_url in SUBJECTS_LIST:
        subject_name = subject_base_url.split('/')[-1] 
        all_collected_article_hrefs[subject_name] = []
        current_subject_hrefs = []  
        page_zero_url = build_subject_page_url(subject_base_url, 0, PAGE_SIZE)
        print(f"  Navigating to initial page: {page_zero_url}")

        driver.get(page_zero_url)
        time.sleep(2) 
        page_html = driver.page_source

        hrefs_on_page_zero = get_article_hrefs_from_page(page_html)
        current_subject_hrefs.extend(hrefs_on_page_zero)
        print(f"    Scraped {len(hrefs_on_page_zero)} hrefs from page 0.")

        total_pages_for_subject = get_total_pages(page_html)
        print(f"    Detected {total_pages_for_subject} total pages for this subject.")


        for page_num in range(1, total_pages_for_subject):
            print(f"  Processing page {page_num + 1}/{total_pages_for_subject} (startPage={page_num})")
            next_page_url = build_subject_page_url(subject_base_url, page_num, PAGE_SIZE)
            print(f"    Navigating to: {next_page_url}")

            driver.get(next_page_url)
            time.sleep(2)

            page_html_next = driver.page_source

            hrefs_on_next_page = get_article_hrefs_from_page(page_html_next)
            current_subject_hrefs.extend(hrefs_on_next_page)
            print(f"      Scraped {len(hrefs_on_next_page)} hrefs.")
            time.sleep(1) 

        all_collected_article_hrefs[subject_name] = list(set(current_subject_hrefs)) # Store unique hrefs
        print(f"  Finished subject {subject_name}. Total unique hrefs collected: {len(all_collected_article_hrefs[subject_name])}")

    driver.quit()
    print("\nWebDriver closed.")

    return all_collected_article_hrefs

if __name__ == '__main__':
    ariticle_links = run_pipeline()

  Navigating to initial page: https://www.cell.com/heliyon/ear-nose-throat?startPage=0&pageSize=100
    Scraped 34 hrefs from page 0.
    Detected 1 total pages for this subject.
  Finished subject ear-nose-throat. Total unique hrefs collected: 34
  Navigating to initial page: https://www.cell.com/heliyon/earth-science?startPage=0&pageSize=100
    Scraped 100 hrefs from page 0.
    Detected 7 total pages for this subject.
  Processing page 2/7 (startPage=1)
    Navigating to: https://www.cell.com/heliyon/earth-science?startPage=1&pageSize=100
      Scraped 100 hrefs.
  Processing page 3/7 (startPage=2)
    Navigating to: https://www.cell.com/heliyon/earth-science?startPage=2&pageSize=100
      Scraped 100 hrefs.
  Processing page 4/7 (startPage=3)
    Navigating to: https://www.cell.com/heliyon/earth-science?startPage=3&pageSize=100
      Scraped 100 hrefs.
  Processing page 5/7 (startPage=4)
    Navigating to: https://www.cell.com/heliyon/earth-science?startPage=4&pageSize=100
      S

In [None]:
import json
output_file = "heliyon.json"

try:
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(ariticle_links, f, ensure_ascii=False, indent=4)
    print(f"Dictionary saved to '{output_file}'")
except IOError as e:
    print(f"Error saving dictionary to JSON file: {e}")

Dictionary saved to 'heliyon.json'


# Main subjects scraper

In [51]:
driver = uc.Chrome()

SessionNotCreatedException: Message: session not created: cannot connect to chrome at 127.0.0.1:33461
from session not created: This version of ChromeDriver only supports Chrome version 136
Current browser version is 134.0.6998.165
Stacktrace:
#0 0x5ae93134471a <unknown>
#1 0x5ae930de70a0 <unknown>
#2 0x5ae930e27a47 <unknown>
#3 0x5ae930e26935 <unknown>
#4 0x5ae930e1c373 <unknown>
#5 0x5ae930e6cd75 <unknown>
#6 0x5ae930e6c296 <unknown>
#7 0x5ae930e5e173 <unknown>
#8 0x5ae930e2ad4b <unknown>
#9 0x5ae930e2b9b1 <unknown>
#10 0x5ae9313098cb <unknown>
#11 0x5ae93130d7ca <unknown>
#12 0x5ae9312f1622 <unknown>
#13 0x5ae93130e354 <unknown>
#14 0x5ae9312d645f <unknown>
#15 0x5ae9313324f8 <unknown>
#16 0x5ae9313326d6 <unknown>
#17 0x5ae931343586 <unknown>
#18 0x7714cff6a7eb <unknown>
#19 0x7714cffee18c <unknown>


In [31]:
page_html_2 = driver.page_source

In [33]:
soup = BeautifulSoup(page_html_2, "lxml")

In [35]:
subjects = soup.find(class_='subjList col4')

In [45]:
subjects_li = subjects.find_all("li")

In [47]:
len(subjects_li)

71

In [48]:
subjects_li

[<li><a href="/heliyon/agriculture">Agriculture</a></li>,
 <li><a href="/heliyon/animal-science">Animal Science</a></li>,
 <li><a href="/heliyon/applied-psychology">Applied Psychology</a></li>,
 <li><a href="/heliyon/biochem-mol-cell-bio">Biochemistry, Molecular and Cell Biology</a></li>,
 <li><a href="/heliyon/bioinformatics-computational-biology">Bioinformatics and Computational Biology</a></li>,
 <li><a href="/heliyon/business-management">Business and Management</a></li>,
 <li><a href="/heliyon/cancer-research">Cancer Research</a></li>,
 <li><a href="/heliyon/cardiology-cardiovascular-medicine">Cardiology and Cardiovascular Medicine</a></li>,
 <li><a href="/heliyon/chemical-engineering">Chemical Engineering</a></li>,
 <li><a href="/heliyon/chemistry">Chemistry</a></li>,
 <li><a href="/heliyon/civil-engineering">Civil Engineering</a></li>,
 <li><a href="/heliyon/clinical-psychology-psychiatry">Clinical Psychology and Psychiatry</a></li>,
 <li><a href="/heliyon/clinical-research">Clin

In [49]:
main_subjects_links = []
base_url = 'https://www.cell.com/'
for subject in subjects_li:
    link_subject = subject.find('a')
    if link_subject and link_subject.has_attr('href'):
        relative_href = link_subject['href'] 
        
        abs_href = urljoin(base_url,relative_href)
    
    main_subjects_links.append(abs_href)


In [None]:
my_subjects_links = main_subjects_links[17:35]

['https://www.cell.com/heliyon/ear-nose-throat',
 'https://www.cell.com/heliyon/earth-science',
 'https://www.cell.com/heliyon/economics',
 'https://www.cell.com/heliyon/education',
 'https://www.cell.com/heliyon/emergency-medicine',
 'https://www.cell.com/heliyon/endocrinology',
 'https://www.cell.com/heliyon/energy',
 'https://www.cell.com/heliyon/engineering',
 'https://www.cell.com/heliyon/environment',
 'https://www.cell.com/heliyon/evolution-ecology',
 'https://www.cell.com/heliyon/finance',
 'https://www.cell.com/heliyon/food-science-and-nutrition',
 'https://www.cell.com/heliyon/gastroenterology',
 'https://www.cell.com/heliyon/genetics',
 'https://www.cell.com/heliyon/haematology',
 'https://www.cell.com/heliyon/hepatology',
 'https://www.cell.com/heliyon/immunology',
 'https://www.cell.com/heliyon/infectious-diseases']

## Single article scrape abstract + kwords

In [52]:
driver =  initialize_uc_driver(headless=False)

In [53]:
driver.get('https://www.cell.com/heliyon/fulltext/S2405-8440(22)02029-1')

In [54]:
page_html = driver.page_source

In [None]:
soup = BeautifulSoup(page_html, 'lxml')

In [57]:
abstract = soup.find(id='abspara0010')

In [75]:
abstract.text

'Granulomatous formation in the nose and paranasal sinuses still presents an unmet clinical challenge as it affects both the physical health and personality of patients, and the lack of a systematic diagnostic and disease management approach has further complicated the scenario. Occurrence of granulomatous nasal disorder in the rural Indian population is associated with several factors such as lack of proper medical care, lack of hygienic and clean working and living conditions, and limited financial abilities to access the already overburdened primary healthcare system. This study aims to understand the correlation disease incidence, manifestation of signs and symptoms and associated socio-epidemiological parameters for 104,000 patients over a period of 23 months in Odisha. Primarily the study used socio-epidemiological surveys collected, annotated, and curated independently for granulomatous nasal disorder patients and compared this with their clinical records for signs & symptoms an

In [68]:
kw = soup.find(id = 'keywords')

In [70]:
li_kw = kw.find_all('li')

In [73]:
for tag in li_kw:
    print(tag.text)

Granulomatous
Nose
Paranasal sinuses
Males
Rural area
Poor living condition
Rhinosporidiosis
Fungal granuloma
Rhinoscleroma
Tuberculosis
Leprosy


In [13]:
import json
input_file = "heliyon.json"  # Replace with your file name

with open(input_file, 'r', encoding='utf-8') as f:
    data_dict = json.load(f)

In [15]:
# --- Function to Scrape Article Details ---
def scrape_article_details_with_uc_and_bs4(article_url):

    print(f"  Navigating to article: {article_url}")
    article_data = {
        "url": article_url,
        "title": "N/A",
        "abstract": "N/A",
        "keywords": [] 
    }

    try:
        driver.get(article_url)
        page_html = driver.page_source
        # time.sleep(1) 
        soup = BeautifulSoup(page_html, 'lxml') 

        # --- Extract Title ---
        title_tag = soup.find('h1',  attrs={'property': 'name'})
        if not title_tag:
            print('FUCK THERE IS NO TITLE')
        article_data['title'] = title_tag.text.strip()

        # --- Extract Abstract ---
        abstract_text = "N/A"
        abstract_element = soup.find(id='abspara0010')
        if not abstract_element or 'withdrawn' in abstract_element.text:
            print("NO ABSTRACT, HELP!!!!@@@!!")
        else:
            abstract_text = abstract_element.get_text(strip=True)
        article_data['abstract'] = abstract_text if abstract_text else "N/A"


        # --- Extract Keywords ---
        keywords_list = []
        # Try your specific ID first
        keywords_section = soup.find(id='keywords')
        if not keywords_section:
            keywords_section = soup.find(id ='index terms')
        if keywords_section:
            # Check for <li> items
            li_items = keywords_section.find_all('li')
            if li_items:
                for li in li_items:
                    kw_text = li.get_text(strip=True)
                    if kw_text:
                        keywords_list.append(kw_text)



        article_data['keywords'] = keywords_list if keywords_list else []

        print(f"    Title: {article_data['title'][:60]}...")
        print(f"    Abstract found: {True if article_data['abstract'] not in ['N/A', ''] else False} (Length: {len(article_data['abstract'])})")
        print(f"    Keywords found: {True if article_data['keywords'] else False} (Count: {len(article_data['keywords'])}) -> {article_data['keywords'][:5]}")



    except SeleniumTimeoutException: # This is caught if driver.get() exceeds the page_load_timeout
        print(f"  !!! SeleniumTimeoutException (Page Load Timeout) for {article_url}")
        article_data['title'] = "ERROR_PAGE_LOAD_TIMEOUT"
    except WebDriverException as e:
        print(f"  !!! WebDriverException for {article_url}: {e}")
        article_data['title'] = f"ERROR_WEBDRIVER_EXCEPTION: {type(e).__name__}"
    except Exception as e: # Catch other errors during parsing
        print(f"  An error occurred while processing {article_url} (parsing): {e}")
        if not article_data['title'].startswith("ERROR"):
            article_data['title'] = f"ERROR_PARSING: {type(e).__name__}"
    return article_data



In [34]:
driver = initialize_uc_driver(headless=False)

In [None]:
for i, article_url in enumerate(article_url_list):
    overall_processed_article_index += 1

In [12]:
flat_url_list = []
for subject, urls in data_dict.items():
    for url in urls:
        flat_url_list.append({'subject': subject, 'url': url})


NameError: name 'data_dict' is not defined

In [None]:
START_INDEX_FOR_RESUME = 0


In [None]:
all_extracted_data = [] # To store results from all articles
for subject, article_url_list in data_dict.items():
    print(f"\nProcessing Subject: {subject}")
    for i, article_url in enumerate(article_url_list):
        print(f" Scraping article {i+1}/{len(article_url_list)} for subject '{subject}'")
        details = scrape_article_details_with_uc_and_bs4(article_url)
        details['subject'] = subject # Add subject info to the record
        all_extracted_data.append(details)



Processing Subject: ear-nose-throat
 Scraping article 1/34 for subject 'ear-nose-throat'
  Navigating to article: https://www.cell.com/heliyon/fulltext/S2405-8440(24)17424-5
    Title: Establishment of reference audiometric norms for the elderly...
    Abstract found: True (Length: 164)
    Keywords found: True (Count: 10) -> ['Audiometry', 'Pure-tone', 'Hearing loss', 'Age-related', 'Hearing threshold']
 Scraping article 2/34 for subject 'ear-nose-throat'
  Navigating to article: https://www.cell.com/heliyon/fulltext/S2405-8440(24)10131-4
    Title: Correlation of Apo B/A1 ratio with hemodynamics and hearing ...
    Abstract found: True (Length: 189)
    Keywords found: True (Count: 5) -> ['Elderly sudden sensorineural hearing loss', 'Apolipoprotein B/A1 ratio', 'Hemodynamics', 'Degree of hearing impairment', 'Correlation']
 Scraping article 3/34 for subject 'ear-nose-throat'
  Navigating to article: https://www.cell.com/heliyon/fulltext/S2405-8440(23)10909-1
    Title: Tracheobronch

KeyboardInterrupt: 

In [16]:
OUTPUT_CSV_FILE = "heliyon_flatlist_time_retry.csv"
PAGE_LOAD_TIMEOUT_SECONDS = 30
URL_PROCESSING_WATCHDOG_SECONDS = 70 # Max time for one URL's scrape_article_details... call


if __name__ == "__main__":
    driver = initialize_uc_driver(headless=False, page_load_timeout_seconds=PAGE_LOAD_TIMEOUT_SECONDS)
    if not driver:
        print("Initial driver failed. Exiting.")
        exit()

    all_collected_data_this_run = []

    # --- Flatten the dictionary as you requested ---
    flat_url_list = []
    for subject, urls_for_subject in data_dict.items():
        for u in urls_for_subject:
            flat_url_list.append({'subject': subject, 'url': u})
    # --- End of flattening ---

    START_INDEX_FOR_RESUME = 3000

    current_flat_list_index = START_INDEX_FOR_RESUME
    while current_flat_list_index < len(flat_url_list):
        item_info = flat_url_list[current_flat_list_index]
        current_subject = item_info['subject']
        current_article_url = item_info['url']

        print(f"\n--- Attempting URL {current_flat_list_index + 1}/{len(flat_url_list)} ({current_subject}) ---")
        
        processing_start_time = time.time()
        details_dict = None
        driver_considered_ok = True

        try:
            details_dict = scrape_article_details_with_uc_and_bs4(current_article_url)
        except Exception as e_scraper_crash:
            print(f"  !!! UNEXPECTED CRASH IN scrape_article_details for {current_article_url}: {e_scraper_crash} !!!")
            details_dict = {"url": current_article_url, "title": f"ERROR_SCRAPER_CRASH: {type(e_scraper_crash).__name__}",
                            "abstract": "N/A", "keywords": [] }
            driver_considered_ok = False

        if details_dict:
            details_dict['subject'] = current_subject
            all_collected_data_this_run.append(details_dict)

        # --- Save to CSV after every 3000 items ---
        if len(all_collected_data_this_run) % 3000 == 0 and len(all_collected_data_this_run) > 0:
            df = pd.DataFrame(all_collected_data_this_run)
            csv_filename = f"heliyon_scraped_{current_flat_list_index+1}_rows.csv"
            df.to_csv(csv_filename, index=False, encoding='utf-8')
            print(f"\nSaved {len(all_collected_data_this_run)} items to '{csv_filename}'")

        processing_time_for_url = time.time() - processing_start_time

        if not driver_considered_ok or processing_time_for_url > URL_PROCESSING_WATCHDOG_SECONDS:
            if not driver_considered_ok:
                print(f"\n!!! Driver flagged as not OK for URL {current_article_url} (Result: {details_dict.get('title', 'N/A')}). Restarting. !!!")
            else:
                print(f"\n!!! WATCHDOG: URL {current_article_url} processing took {processing_time_for_url:.1f}s (>{URL_PROCESSING_WATCHDOG_SECONDS}s). Restarting driver. !!!")
            
            if driver:
                try: driver.quit()
                except: pass
            
            driver = initialize_uc_driver(headless=False, page_load_timeout_seconds=PAGE_LOAD_TIMEOUT_SECONDS)
            if not driver:
                print("CRITICAL: Failed to restart driver. Saving collected data and exiting.")
                break

            print("Driver restarted. Will re-try the current URL.")
            continue

        current_flat_list_index += 1
        time.sleep(0.1)


DEBUG: Initializing driver...
DEBUG: Driver initialized. Page load timeout set to 30s.

--- Attempting URL 3001/12212 (energy) ---
  Navigating to article: https://www.cell.com/heliyon/fulltext/S2405-8440(23)01248-3
    Title: Optimization of anaerobic digestion parameters for biogas pr...
    Abstract found: True (Length: 1460)
    Keywords found: False (Count: 0) -> []

--- Attempting URL 3002/12212 (energy) ---
  Navigating to article: https://www.cell.com/heliyon/fulltext/S2405-8440(24)07150-0
    Title: Optimizing energy storage plant discrete system dynamics ana...
    Abstract found: True (Length: 1876)
    Keywords found: True (Count: 4) -> ['GCN', 'Packet switching', 'Temporal depth-separated convolutional modules', 'Energy storage plants']

--- Attempting URL 3003/12212 (energy) ---
  Navigating to article: https://www.cell.com/heliyon/fulltext/S2405-8440(24)12693-X
    Title: COVID-19 impact on wind and solar energy sector and cost of ...
    Abstract found: True (Length: 14

In [18]:
df = pd.DataFrame(all_collected_data_this_run)
csv_filename = f"heliyon_scraped.csv"
df.to_csv(csv_filename, index=False, encoding='utf-8')
print(f"\nSaved {len(all_collected_data_this_run)} items to '{csv_filename}'")



Saved 9212 items to 'heliyon_scraped.csv'


In [20]:
len(all_collected_data_this_run)

9212