In [42]:
import time, random, csv, re
from urllib.parse import quote
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from httpx import get
import numpy as np
from selectolax.parser import HTMLParser
import string

# REsearch scrapping

In [2]:
url1 = "https://pubmed.ncbi.nlm.nih.gov/?term=research&filter=pubt.classicalarticle&filter=pubt.clinicalstudy&filter=pubt.clinicaltrial&filter=pubt.news&filter=pubt.newspaperarticle&filter=years.2015-2025&sort=date"

driver = webdriver.Chrome()
driver.get(url1)

In [3]:
data = []
page = 1
max_pages = 50

try:
    while True:
        
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a.docsum-title"))
        )
        
       
        for article in driver.find_elements(By.CSS_SELECTOR, "div.docsum-content"):
            try:
                headline = article.find_element(
                    By.CSS_SELECTOR, "a.docsum-title"
                ).text.strip()
            except:
                headline = 0
            
            
            try:
                citation = article.find_element(
                    By.CSS_SELECTOR, "span.docsum-journal-citation.full-journal-citation"
                ).text
            except:
                try:
                    citation = article.find_element(
                        By.CSS_SELECTOR, "span.docsum-journal-citation"
                    ).text
                except:
                    citation = ""
            
            match = re.search(r"\b(19|20)\d{2}\b", citation)
            year = match.group(0) if match else "N/A"
            
            data.append({
                "Category": "research",
                "Headline": headline,
                "Year": year
            })
        
        
        if page >= max_pages:
            print(f"Reached page {page} (limit), stopping.")
            break
        
        
        try:
            next_btn = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button.load-button.next-page"))
            )
            next_btn.click()
            page += 1
            time.sleep(1)
        except:
            
            break

finally:
    driver.quit()

df = pd.DataFrame(data)
print(df)

Reached page 50 (limit), stopping.
       Category                                           Headline  Year
0      research  The effectiveness of a mindfulness-based stres...  2025
1      research  Association of urbanization-related factors wi...  2025
2      research  Endocrine dysfunction in patients with juvenil...  2025
3      research  Disparities in cardiovascular disease outcomes...  2025
4      research        FP10 - the Competitiveness Fund for Europe.  2025
...         ...                                                ...   ...
12735  research  Effect of Airway Devices on Emergence Delirium...  2025
12736  research  Fluid balance and outcome in cardiac arrest pa...  2025
12737  research  Vitrectomy, subretinal Tissue plasminogen acti...  2025
12738  research  Clinical effectiveness of drop-in mental healt...  2025
12739  research  Efficacy of an internet-based, therapist-guide...  2025

[12740 rows x 3 columns]


# treatments scrapping

In [4]:
url2 = "https://pubmed.ncbi.nlm.nih.gov/?term=new+treatments&filter=pubt.classicalarticle&filter=pubt.clinicalstudy&filter=pubt.clinicaltrial&filter=pubt.news&filter=pubt.newspaperarticle&filter=years.2015-2025&sort=date"

driver = webdriver.Chrome()
driver.get(url2)

In [5]:
page = 1
max_pages = 50

try:
    while True:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a.docsum-title"))
        )
        

        for article in driver.find_elements(By.CSS_SELECTOR, "div.docsum-content"):
            # Headline
            try:
                headline = article.find_element(
                    By.CSS_SELECTOR, "a.docsum-title"
                ).text.strip()
            except:
                headline = 0
            

            try:
                citation = article.find_element(
                    By.CSS_SELECTOR,
                    "span.docsum-journal-citation.full-journal-citation"
                ).text
            except:
                try:
                    citation = article.find_element(
                        By.CSS_SELECTOR,
                        "span.docsum-journal-citation"
                    ).text
                except:
                    citation = ""
            
            match = re.search(r"\b(19|20)\d{2}\b", citation)
            year = match.group(0) if match else "N/A"
            
            data.append({
                "Category": "treatments",
                "Headline":  headline,
                "Year":      year
            })
        

        if page >= max_pages:
            print(f"Reached page {page}, stopping scrape.")
            break
        

        try:
            next_btn = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button.load-button.next-page"))
            )
            next_btn.click()
            page += 1
            time.sleep(1)
        except:

            break

finally:
    driver.quit()

df = pd.DataFrame(data)
print(df)

Reached page 50, stopping scrape.
         Category                                           Headline  Year
0        research  The effectiveness of a mindfulness-based stres...  2025
1        research  Association of urbanization-related factors wi...  2025
2        research  Endocrine dysfunction in patients with juvenil...  2025
3        research  Disparities in cardiovascular disease outcomes...  2025
4        research        FP10 - the Competitiveness Fund for Europe.  2025
...           ...                                                ...   ...
25455  treatments  Phase 1 study of IMCnyeso, a T cell receptor b...  2025
25456  treatments  Effects of tDCS with concurrent cognitive perf...  2025
25457  treatments  Web-Based Application for Reducing Methampheta...  2025
25458  treatments  The Effect of Predicted Compliance With a Web-...  2025
25459  treatments  Interim analysis of the long-term efficacy and...  2025

[25460 rows x 3 columns]


# tech scrapping

In [6]:
url3 = "https://pubmed.ncbi.nlm.nih.gov/?term=health+technology&filter=pubt.classicalarticle&filter=pubt.clinicalstudy&filter=pubt.clinicaltrial&filter=pubt.news&filter=pubt.newspaperarticle&filter=years.2015-2025&sort=date"

driver = webdriver.Chrome()
driver.get(url3)

In [7]:
page = 1
max_pages = 50

try:
    while True:

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a.docsum-title"))
        )
        

        for article in driver.find_elements(By.CSS_SELECTOR, "div.docsum-content"):

            try:
                headline = article.find_element(
                    By.CSS_SELECTOR, "a.docsum-title"
                ).text.strip()
            except:
                headline = 0
            

            try:
                citation = article.find_element(
                    By.CSS_SELECTOR,
                    "span.docsum-journal-citation.full-journal-citation"
                ).text
            except:
                try:
                    citation = article.find_element(
                        By.CSS_SELECTOR,
                        "span.docsum-journal-citation"
                    ).text
                except:
                    citation = ""
            
            match = re.search(r"\b(19|20)\d{2}\b", citation)
            year = match.group(0) if match else "N/A"
            
            data.append({
                "Category": "technology",
                "Headline":  headline,
                "Year":      year
            })
        

        if page >= max_pages:
            print(f"Reached page {page}, stopping scrape.")
            break
        

        try:
            next_btn = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button.load-button.next-page"))
            )
            next_btn.click()
            page += 1
            time.sleep(1)
        except:

            break

finally:
    driver.quit()

df = pd.DataFrame(data)
print(df)

Reached page 50, stopping scrape.
         Category                                           Headline  Year
0        research  The effectiveness of a mindfulness-based stres...  2025
1        research  Association of urbanization-related factors wi...  2025
2        research  Endocrine dysfunction in patients with juvenil...  2025
3        research  Disparities in cardiovascular disease outcomes...  2025
4        research        FP10 - the Competitiveness Fund for Europe.  2025
...           ...                                                ...   ...
38195  technology  A prospective, single-blinded, non-inferiority...  2025
38196  technology  Biomarkers of inflammation in sweat after myoc...  2025
38197  technology  Chemo-conization in Early-sTage cERvical caNce...  2025
38198  technology  Efficacy and safety of nerinetide in acute isc...  2025
38199  technology  Visual Function Benefit After Treatment With P...  2025

[38200 rows x 3 columns]


In [8]:
df.to_csv("treatments.csv", index=False)

# medaicl news

In [9]:
url = "https://www.medicalnewstoday.com/news"

In [10]:
resp = get(url)

tree = HTMLParser(resp.text)

headlines = [h.text() for h in tree.css("h2")]
summaries = [s.text() for s in tree.css("p a")]
dates = [d.text() for d in tree.css("figure + div > div")]


In [11]:
df2 = pd.DataFrame(
        {
            "Headline": headlines,
            "Summary": summaries,
            "Date": dates
        }
    )

df2['Date'] = pd.to_datetime(df2['Date'])
df2.head()

Unnamed: 0,Headline,Summary,Date
0,Alcohol use could contribute to dementia by da...,Heavy and former heavy drinking is associated ...,2025-04-14
1,Can weight training protect your brain from de...,Weight training may help protect the brains of...,2025-04-10
2,Diabetes drugs like Ozempic or Jardiance may s...,Two common types of anti-diabetes medications ...,2025-04-09
3,Certain combos of common food additives may ra...,"Two certain mixtures of common food additives,...",2025-04-09
4,Expert Q&A: Should you get a measles vaccine b...,A board-certified family medicine physician ex...,2025-04-08


In [12]:
df2.to_csv("news.csv", index=False)

# africa

In [13]:
urlafrica = "https://www.afro.who.int/news/feature_stories"

driver = webdriver.Chrome()
driver.get(urlafrica)

In [14]:
data2 = []

try:
   while True:

    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".teaser-full__content"))
    )
    
    
    for art in driver.find_elements(By.CSS_SELECTOR, ".col-xs-12.col-sm-9.teaser-full__content"):

        try:
            title = art.find_element(By.CSS_SELECTOR, ".teaser-full__title").text.strip()
        except:
            title = ""

        try:
            date = art.find_element(By.CSS_SELECTOR, ".field--name-field-date").text.strip()
        except:
            date = ""
)
        try:
            summary = art.find_element(By.CSS_SELECTOR, ".teaser-full__content p").text.strip()
        except:
            summary = ""
        
        
        data2.append({
            "Region":  "Africa",
            "Date":    date,
            "Title":   title,
            "Summary": summary
        })
    

    try:
        nxt = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "li.pager__item.pager__item--next a"))
        )
        nxt.click()
        time.sleep(1)
    except:
        break  

finally:
    driver.quit()

df3= pd.DataFrame(data2)
print(df3)

     Region             Date  \
0    Africa    14 April 2025   
1    Africa    11 April 2025   
2    Africa    10 April 2025   
3    Africa    07 April 2025   
4    Africa    07 April 2025   
..      ...              ...   
443  Africa   30 August 2019   
444  Africa   29 August 2019   
445  Africa   28 August 2019   
446  Africa   03 August 2019   
447  Africa  11 January 2019   

                                                 Title  \
0    Congo: Effectively raising community awareness...   
1    Angola takes measures to improve access to saf...   
2    Congo: Innovative tools to reduce maternal mor...   
3    Niger: improving access to services to reduce ...   
4    Community approach contributing to maternal he...   
..                                                 ...   
443  Workshop to develop a Regional Score Card to M...   
444  Protecting the devoted from cholera in pilgrim...   
445  Searching for polio in unusual places in Tanzania   
446  South Africa’s sugar tax: Succ

# america

In [15]:
urlamerica = "https://www.paho.org/en/news/news-releases"

driver = webdriver.Chrome()
driver.get(urlamerica)

In [16]:
try:
    while True:

        WebDriverWait(driver, 10).until(EC.presence_of_element_located((
            By.CSS_SELECTOR,
            "#views-bootstrap-taxonomy-term-page-1 .row"
        )))
        

        cards = driver.find_elements(
            By.CSS_SELECTOR,
            "#views-bootstrap-taxonomy-term-page-1 .row div.col-xs-12.col-sm-12.col-md-4.col-lg-4"
        )
        
        for card in cards:

            try:
                date = card.find_element(
                    By.CSS_SELECTOR,
                    ".views-field-created .field-content time"
                ).text.strip()
            except:
                date = ""

            try:
                link_el = card.find_element(
                    By.CSS_SELECTOR,
                    ".views-field-title .field-content a"
                )
                title = link_el.text.strip()
            except:
                title = ""
  

            try:
                summary = card.find_element(
                    By.CSS_SELECTOR,
                    ".views-field-body .field-content"
                ).text.strip()
            except:
                summary = ""
            
            data2.append({
                "Region":"Americas",
                "Date":    date,
                "Title":   title,
                "Summary": summary
            })
        
        
        try:
            nxt = driver.find_element(
                By.CSS_SELECTOR,
                "li.pager__item.pager__item--next a"
            )
            nxt.click()
                
            time.sleep(1)  
        except:

            break

finally:
    driver.quit()

df3 = pd.DataFrame(data2)   
df3

Unnamed: 0,Region,Date,Title,Summary
0,Africa,14 April 2025,Congo: Effectively raising community awareness...,Brazzaville – The Republic of the Congo is one...
1,Africa,11 April 2025,Angola takes measures to improve access to saf...,"Luanda ‒ Manuel Domingos, community leader of ..."
2,Africa,10 April 2025,Congo: Innovative tools to reduce maternal mor...,Brazzaville – Congo is intensifying efforts to...
3,Africa,07 April 2025,Niger: improving access to services to reduce ...,"Niamey – When 19-year-old Aichatou, from Birni..."
4,Africa,07 April 2025,Community approach contributing to maternal he...,"Dakar – Ndèye, 29, mother of three children an..."
...,...,...,...,...
3090,Americas,2 Jun 2008,World No Tobacco Day 2008 Observed with Awards...,"Washington, D.C., May 30, 2008 (PAHO)—World No..."
3091,Americas,29 May 2008,PAHO Releases New Wind Hazard Maps for the Car...,"Washington, D.C., May 20, 2008 (PAHO)—The Pan ..."
3092,Americas,17 Jun 2007,International Health Regulations Enter into Force,"Washington, D.C., June 14, 2007 (PAHO) - The r..."
3093,Americas,23 Apr 2005,Vth Meeting of the Surveillance Network for Em...,Meeting sponsored by PAHO and three US governm...


# Europe

In [20]:
urlseasia = "https://www.who.int/europe/news"

driver = webdriver.Chrome()
driver.get(urlseasia)

In [21]:
try:
    while True:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((
                By.CSS_SELECTOR,
                "div.list-view--item.vertical-list-item"
            ))
        )

        cards = driver.find_elements(
            By.CSS_SELECTOR,
            "div.list-view--item.vertical-list-item"
        )
        for card in cards:

            try:
                date = card.find_element(
                    By.CSS_SELECTOR,
                    ".table-cell.info .timestamp"
                ).text.strip()
            except:
                date = ""
            

            try:
                link = card.find_element(
                    By.CSS_SELECTOR,
                    ".table-cell.info .heading.text-underline"
                )
                title = link.text.strip()

            except:
                title = ""

            
            data2.append({
                "Region": "Europe",
                "Date":   date,
                "Title":  title,
                "Summary": ""
                
            })
        

        

        try:
            nxt = driver.find_element(
                By.CSS_SELECTOR,
                "li.pager__item.pager__item--next a"
            )
            nxt.click()
            time.sleep(1)
        except:
            break

finally:
    driver.quit()
    
df3 = pd.DataFrame(data2)   
df3

Unnamed: 0,Region,Date,Title,Summary
0,Africa,14 April 2025,Congo: Effectively raising community awareness...,Brazzaville – The Republic of the Congo is one...
1,Africa,11 April 2025,Angola takes measures to improve access to saf...,"Luanda ‒ Manuel Domingos, community leader of ..."
2,Africa,10 April 2025,Congo: Innovative tools to reduce maternal mor...,Brazzaville – Congo is intensifying efforts to...
3,Africa,07 April 2025,Niger: improving access to services to reduce ...,"Niamey – When 19-year-old Aichatou, from Birni..."
4,Africa,07 April 2025,Community approach contributing to maternal he...,"Dakar – Ndèye, 29, mother of three children an..."
...,...,...,...,...
3120,Europe,10 March 2025,WHO announces new collaborating centre on AI f...,
3121,Europe,7 March 2025,“I feel so much healthier in my body and my mi...,
3122,Europe,6 March 2025,Digital automation tools improve access and af...,
3123,Europe,4 March 2025,New data dashboard visualizes progress towards...,


# pacafic

In [22]:
arab = "https://www.who.int/westernpacific/news"

driver = webdriver.Chrome()
driver.get(arab)

In [23]:

try:
    while True:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((
                By.CSS_SELECTOR,
                "div.list-view--item.vertical-list-item"
            ))
        )
        

        cards = driver.find_elements(
            By.CSS_SELECTOR,
            "div.list-view--item.vertical-list-item"
        )
        for card in cards:

            try:
                date = card.find_element(
                    By.CSS_SELECTOR,
                    ".table-cell.info .timestamp"
                ).text.strip()
            except:
                date = ""
            

            try:
                link = card.find_element(
                    By.CSS_SELECTOR,
                    ".table-cell.info .heading.text-underline"
                )
                title = link.text.strip()

            except:
                title = ""

            
            data2.append({
                "Region": "Western Pacific",
                "Date":   date,
                "Title":  title,
                "Summary": ""
                
            })
        

        

        try:
            nxt = driver.find_element(
                By.CSS_SELECTOR,
                "li.pager__item.pager__item--next a"
            )
            nxt.click()
            time.sleep(1)
        except:
            break

finally:
    driver.quit()
    
df3 = pd.DataFrame(data2)   
df3

Unnamed: 0,Region,Date,Title,Summary
0,Africa,14 April 2025,Congo: Effectively raising community awareness...,Brazzaville – The Republic of the Congo is one...
1,Africa,11 April 2025,Angola takes measures to improve access to saf...,"Luanda ‒ Manuel Domingos, community leader of ..."
2,Africa,10 April 2025,Congo: Innovative tools to reduce maternal mor...,Brazzaville – Congo is intensifying efforts to...
3,Africa,07 April 2025,Niger: improving access to services to reduce ...,"Niamey – When 19-year-old Aichatou, from Birni..."
4,Africa,07 April 2025,Community approach contributing to maternal he...,"Dakar – Ndèye, 29, mother of three children an..."
...,...,...,...,...
3170,Western Pacific,25 April 2022,WHO announces winners of inaugural Western Pac...,
3171,Western Pacific,7 April 2022,Countries in the Western Pacific on the frontl...,
3172,Western Pacific,5 November 2021,East Asia and Pacific countries commit to endi...,
3173,Western Pacific,29 October 2021,Health leaders endorse action plan to end TB i...,


# Break out news

In [26]:
df3.to_csv("region_data.csv", index=False)

In [50]:
arab = "https://www.who.int/emergencies/disease-outbreak-news"

driver = webdriver.Chrome()
driver.get(arab)

In [51]:
records = []
pages_to_scrape = 3
current_page = 1

try:
    while current_page <= pages_to_scrape:

        WebDriverWait(driver, 10).until(EC.presence_of_element_located((
            By.CSS_SELECTOR,
            "a.sf-list-vertical__item"
        )))
        

        cards = driver.find_elements(By.CSS_SELECTOR, "a.sf-list-vertical__item")
        for card in cards:

            try:
                date = card.find_element(
                    By.CSS_SELECTOR,
                    "h4.sf-list-vertical__title > span:nth-of-type(2)"
                ).text.strip()
            except:
                date = ""
            
  
            try:
                full_title = card.find_element(
                    By.CSS_SELECTOR,
                    "h4.sf-list-vertical__title span.trimmed"
                ).text.strip()
            except:
                full_title = ""
            

            if " - " in full_title:
                news, country = full_title.rsplit(" - ", 1)
            else:
                news, country = full_title, ""
            
            records.append({
                "Date":    date,
                "News":    news,
                "Country": country
            })

        if current_page == pages_to_scrape:
            break
        

        next_btn = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((
            By.CSS_SELECTOR,
            f"a.k-link.k-pager-nav[data-page='{current_page+1}']"
        )))
        next_btn.click()
        current_page += 1
        time.sleep(1)  

finally:
    driver.quit()


df4 = pd.DataFrame(records)
df4

Unnamed: 0,Date,News,Country
0,17 April 2025 |,Avian Influenza A(H5N1),Mexico
1,11 April 2025 |,Invasive meningococcal disease,Kingdom of Saudi Arabia
2,28 March 2025 |,Cholera,Angola
3,27 March 2025 |,Measles,United States of America
4,13 March 2025 |,Marburg virus disease– United Republic of Tanz...,
...,...,...,...
79,11 August 2023 |,Influenza A (H1N2) variant virus-United States...,
80,11 August 2023 |,Dengue,Bangladesh
81,28 July 2023 |,Circulating vaccine-derived poliovirus type 2 ...,Kenya
82,28 July 2023 |,Circulating vaccine-derived poliovirus type 2 ...,United Republic of Tanzania


In [52]:
df4.to_csv("break_out.csv", index=False)