In [1]:
!pip install requests
!pip install bs4
!pip install pandas

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [2]:
import spacy
import pandas as pd
from collections import Counter

In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import concurrent.futures

# --- Search setup ----------------------------------------------------------
# Ask for the query, fetch the first results page, and read how many result
# pages PubMed reports so the scraper knows which page range to walk.
search_term_input = input("Enter the search term: ")

# Hand the term to requests via `params` so it is percent-encoded; pasting it
# raw into the URL corrupts the query for terms containing `&`, `#`, `+`, ...
r = requests.get(
    "https://pubmed.ncbi.nlm.nih.gov/",
    params={"term": search_term_input},
    timeout=30,  # seconds; without this requests can wait indefinitely
)
r.raise_for_status()  # fail loudly instead of parsing an error page
url = r.url  # the fully encoded search URL that was actually requested
soup = BeautifulSoup(r.text, "html.parser")

# The page-count label is reduced to a bare integer by dropping its "of "
# prefix and thousands separators. When the label is missing we fall back to
# a single page instead of crashing on `None.text`.
# NOTE(review): presumably the label is absent for single-page or empty
# result sets - confirm against the live markup.
total_pages_label = soup.find("label", class_="of-total-pages")
if total_pages_label is None:
    max_page = 1
else:
    max_page = int(
        total_pages_label.text.strip().replace("of ", "").replace(",", "")
    )

# Column-oriented accumulator filled once all pages have been scraped.
data = {"Name": [], "Authors": [], "Citation": [], "PMID": [], "Abstract": []}

def _field_text(container, tag, css_class):
    """Return the stripped text of the first matching tag in `container`, or "N/A"."""
    node = container.find(tag, class_=css_class)
    return node.text.strip() if node else "N/A"


def _fetch_abstract(article_url, timeout=30):
    """Download one article page and return its abstract text, or "N/A" if none is found."""
    abstract_response = requests.get(article_url, timeout=timeout)
    abstract_soup = BeautifulSoup(abstract_response.text, "html.parser")
    abstract_text = abstract_soup.find("div", class_="abstract-content selected")
    return abstract_text.text.strip() if abstract_text else "N/A"


def scrape_page(page_num):
    """Scrape one PubMed search-results page plus the abstract of every hit.

    The query is read from the notebook-level ``search_term_input``.

    Args:
        page_num: Results page number, sent as PubMed's ``page`` query
            parameter.

    Returns:
        dict with the keys "Name", "Authors", "Citation", "PMID" and
        "Abstract". Each value is a list holding one entry per article title
        found on the page, in page order, so all five lists always have the
        same length. A field that cannot be found is recorded as "N/A".

    Raises:
        requests.HTTPError: If the results page answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    base_url = "https://pubmed.ncbi.nlm.nih.gov"

    # `params` percent-encodes the term; raw interpolation into the URL breaks
    # on characters such as `&` or `#`.
    page_response = requests.get(
        base_url + "/",
        params={"term": search_term_input, "page": page_num},
        timeout=30,  # seconds; avoids a worker thread hanging forever
    )
    page_response.raise_for_status()
    page_soup = BeautifulSoup(page_response.text, "html.parser")

    page_data = {"Name": [], "Authors": [], "Citation": [], "PMID": [], "Abstract": []}

    # Extract every field relative to its own article instead of building
    # independent page-wide lists: if one hit lacks a field, page-wide lists
    # end up with different lengths and the rows silently shift.
    for title_link in page_soup.find_all("a", class_="docsum-title"):
        # NOTE(review): assumes each hit is wrapped in an <article> element;
        # falls back to the link's direct parent otherwise - verify markup.
        container = title_link.find_parent("article") or title_link.parent

        page_data["Name"].append(title_link.text.strip())
        page_data["Authors"].append(
            _field_text(container, "span", "docsum-authors full-authors")
        )
        page_data["Citation"].append(
            _field_text(container, "span", "docsum-journal-citation full-journal-citation")
        )
        page_data["PMID"].append(_field_text(container, "span", "docsum-pmid"))
        # The title link's href is site-relative, so prefix the host.
        page_data["Abstract"].append(_fetch_abstract(base_url + title_link["href"]))

    return page_data

# Fan the page numbers out over a thread pool. map() hands results back in
# the order of its input, and leaving the `with` block waits for the workers.
page_numbers = range(1, max_page + 1)
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(scrape_page, page_numbers)

# Fold each page's lists into the shared accumulator, one column at a time.
for result in results:
    for column in ("Name", "Authors", "Citation", "PMID", "Abstract"):
        data[column].extend(result[column])

# One row per scraped article.
df = pd.DataFrame(data)
print(f"Below is the data scraped from PubMed, SEARCH = {search_term_input} : ")
print(df)

Enter the search term: Naturopathy
Below is the data scraped from PubMed, SEARCH = Naturopathy : 
                                                   Name  \
0                                          Naturopathy.   
1     [Naturopathy and complementary medicine in sma...   
2                    Naturopathy: a critical appraisal.   
3                               [Water in naturopathy].   
4     The Relevance of Naturopathy as a Therapeutic ...   
...                                                 ...   
1764  Impact of complementary oral enzyme applicatio...   
1765  Effect of tele-yoga on burnout, mental health ...   
1766  Patients with Advanced or Metastasised Non-Sma...   
1767  Zinc for the prevention or treatment of acute ...   
1768  Clinical effectiveness of patient-targeted fee...   

                                                Authors  \
0                                   Smith MJ, Logan AC.   
1                               Stanossek I, Wehrend A.   
2               

In [5]:
# Remove layout characters (newline, tab, carriage return, non-breaking space)
# from every abstract. The .str accessor already works on the whole column,
# so no row loop is needed - looping re-ran the same four full-column passes
# once per row. regex=False makes each pattern a literal on every pandas
# version.
df['Abstract'] = (
    df['Abstract']
    .str.replace('\n', '', regex=False)
    .str.replace('\t', '', regex=False)
    .str.replace('\r', '', regex=False)
    .str.replace('\xa0', '', regex=False)
)

In [6]:
# Bare last expression: the notebook renders the DataFrame as a rich table.
df

Unnamed: 0,Name,Authors,Citation,PMID,Abstract
0,Naturopathy.,"Smith MJ, Logan AC.",Med Clin North Am. 2002 Jan;86(1):173-84. doi:...,11795088,Naturopathic medicine is an eclectic form of p...
1,[Naturopathy and complementary medicine in sma...,"Stanossek I, Wehrend A.",Tierarztl Prax Ausg K Kleintiere Heimtiere. 20...,34157762,Naturopathic and complementary procedures atta...
2,Naturopathy: a critical appraisal.,Atwood KC 4th.,MedGenMed. 2003 Dec 30;5(4):39.,14745386,"""Naturopathic medicine"" is a recent manifestat..."
3,[Water in naturopathy].,Marktl W.,Forsch Komplementarmed Klass Naturheilkd. 2003...,12853717,
4,The Relevance of Naturopathy as a Therapeutic ...,"Dewangan S, Kumar BB.",Altern Ther Health Med. 2024 Apr;30(4):38-41.,38702164,Background: There are numer...
...,...,...,...,...,...
1764,Impact of complementary oral enzyme applicatio...,"Beuth J, Ost B, Pakdaman A, Rethfeldt E, Bock ...",Cancer Chemother Pharmacol. 2001 Jul;47 Suppl:...,11561873,
1765,"Effect of tele-yoga on burnout, mental health ...","Naveen KH, Singh D, Srinivasan S, Bhardwaj P, ...",Complement Ther Med. 2024 Dec;87:103109. doi: ...,39521190,
1766,Patients with Advanced or Metastasised Non-Sma...,"Schad F, Thronicke A, Hofheinz RD, Matthes H, ...",Cancers (Basel). 2024 Apr 22;16(8):1609. doi: ...,38672690,
1767,Zinc for the prevention or treatment of acute ...,"Hunter J, Arentz S, Goldenberg J, Yang G, Bear...",BMJ Open. 2021 Nov 2;11(11):e047474. doi: 10.1...,34728441,


In [7]:
# Write the table to pubmed_data.csv (path is relative to the working
# directory); index=False leaves the row index out of the file.
df.to_csv('pubmed_data.csv', index=False)