In [65]:
import os
import re
import requests
from bs4 import BeautifulSoup

In [3]:
def get_url(keyword, page=1, start_year='', end_year='', page_size=''):
    """Build a PubMed search URL by joining the search conditions with ``&``.

    Args:
        keyword: Search term; interpolated into ``term=`` exactly as given
            (no URL-encoding is applied here).
        page: Result page number. ``&page=`` is appended only when it is not 1.
        start_year: First year of the publication-year filter.
        end_year: Last year of the publication-year filter. The filter is
            appended only when both ``start_year`` and ``end_year`` are non-empty.
        page_size: Results per page (the original note lists 10, 20, 50, 100,
            200). Omitted from the URL when left as ``''``.

    Returns:
        The search URL as a string, always requesting ``format=abstract``.
    """
    url = f"https://pubmed.ncbi.nlm.nih.gov/?term={keyword}&format=abstract"
    if page != 1:
        # Use the requested page number; this used to be hard-coded to 2.
        url += f"&page={page}"
    # Logical `and` (not bitwise `&`) for combining the two conditions.
    if start_year != '' and end_year != '':
        url += f"&filter=years.{start_year}-{end_year}"
    if page_size != '':
        url += f"&size={page_size}"
    return url

In [4]:
# Build the search URL for the term "IVF" with 50 results per page.
url = get_url(keyword='IVF', page_size=50)
print(url)

https://pubmed.ncbi.nlm.nih.gov/?term=IVF&format=abstract&size=50


In [5]:
# Fetch the search-result page at `url` and parse it into `soup`.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
# requests has no default timeout; bound the wait so a stalled connection
# cannot hang the kernel indefinitely.
respond = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page.
respond.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser" warning).
soup = BeautifulSoup(respond.text, 'html.parser')

In [8]:
# Read the total number of search results from the "results-amount" element.
results_amount = soup.find("div", class_="results-amount")
try:
    total_articles = results_amount.span.text
    print(f'Results amount {total_articles}')
except AttributeError:
    # find() returned None, or the <div> has no <span>: attribute access on
    # None raises AttributeError. Only that case is handled here, so unrelated
    # errors (e.g. KeyboardInterrupt, NameError) are no longer swallowed.
    # Reset the name so a re-run cannot leave a stale value behind.
    total_articles = None
    print('Cannot find results amount')

results amount 29,997


In [9]:
# Collect the article containers on this page (the main objects to scrape);
# the cells that follow extract fields from each entry of `article_ls`.
article_ls = soup.find_all(class_="article-overview")
print('article in page', len(article_ls))

article in page 50


In [67]:
# Read the total number of result pages from the pagination label ("of N").
total_page_label = soup.find("label", class_="of-total-pages")
try:
    # Raw string: '\d' inside a plain string literal is an invalid escape sequence.
    total_page = re.match(r'of (\d+)', total_page_label.text)[1]
    print(f'Total page: {total_page}')
except (AttributeError, TypeError):
    # AttributeError: the label was not found (None has no .text).
    # TypeError: the text did not match, so re.match returned None and
    # cannot be subscripted.
    total_page = None
    print('Cannot find total page')

Total page: 600


In [17]:
# Title of each article (first 5 shown).
# Guard against a missing heading, as the other field cells do, so that one
# article without a title yields '' instead of aborting with AttributeError.
[heading.text.strip() if heading is not None else ''
    for heading in (article.find("h1", class_="heading-title") for article in article_ls)][:5]

['The role of Natural Cycle IVF in assisted reproduction',
 'Emotions and ethical considerations of women undergoing IVF-treatments',
 'Physicochemical properties of follicular fluid and their relation to in vitro fertilization (IVF) outcome',
 'Genomic imprinting: a gene regulatory phenomenon with important implications for micromanipulation-assisted in vitro fertilization (IVF)',
 'Lesbian shared biological motherhood: the ethics of IVF with reception of oocytes from partner']

In [20]:
# Citation block of each article (first 5 shown).
# Guard against a missing citation element, as the other field cells do, so
# that one article without it yields '' instead of raising AttributeError.
[citation.text.strip() if citation is not None else ''
    for citation in (article.find("div", class_="article-citation") for article in article_ls)][:5]

['Review\n\n\n      Best Pract Res Clin Endocrinol Metab\n    Actions\n              Search in PubMed\n            \n              Search in NLM Catalog\n            \n              Add to Search\n            . 2019 Feb;33(1):35-45.\n\n\n        doi: 10.1016/j.beem.2018.10.005.\n      \n\n        Epub 2018 Nov 9.',
 'HEC Forum\n    Actions\n              Search in PubMed\n            \n              Search in NLM Catalog\n            \n              Add to Search\n            . 2011 Dec;23(4):281-93.\n\n\n        doi: 10.1007/s10730-011-9159-4.',
 'Review\n\n\n      J In Vitro Fert Embryo Transf\n    Actions\n              Search in PubMed\n            \n              Search in NLM Catalog\n            \n              Add to Search\n            . 1990 Apr;7(2):67-73.\n\n\n        doi: 10.1007/BF01135577.',
 'Review\n\n\n      J In Vitro Fert Embryo Transf\n    Actions\n              Search in PubMed\n            \n              Search in NLM Catalog\n            \n              Add to 

In [16]:
# Publication type of each article, '' when the element is absent (first 5 shown).
# The inner generator looks each element up once per article.
[type_div.text if type_div is not None else ''
    for type_div in (article.find("div", class_="publication-type") for article in article_ls)][:5]

['Review', '', 'Review', 'Review', '']

In [26]:
# Authors: text of the first "authors-list-item" span in each article,
# '' when the element is absent (first 5 shown).
[author.text.strip() if author is not None else ''
    for author in (article.find("span", class_="authors-list-item") for article in article_ls)][:5]

['Michael von Wolff\xa0\n                1',
 'Sofia Kaliarnta\xa0\n                1\n              ,',
 'B Fisch\xa0\n                1\n              ,',
 'J W Gordon\xa0\n                1\n              ,',
 'Kristin Zeiler\xa0\n                1\n              ,']

In [27]:
# Contact information: text of the "affiliations" block of each article,
# '' when the element is absent (first 5 shown).
[affiliation.text.strip() if affiliation is not None else ''
    for affiliation in (article.find("div", class_="affiliations") for article in article_ls)][:5]

["Affiliation\n          \n\n1 University Women's Hospital, Division of Gynaecological Endocrinology and Reproductive Medicine, Inselspital, University Hospital, Berne, Switzerland. Electronic address: Michael.vonWolff@insel.ch.",
 'Affiliation\n          \n\n1 Philosophy Department, Faculty of Technology, Policy and Management, Delft University of Technology, The Netherlands. S.Kaliarnta@tudelft.nl',
 'Affiliation\n          \n\n1 Department of Obstetrics and Gynecology, Beilinson Medical Center, Sackler School of Medicine, Tel-Aviv University, Petah-Tikvah, Israel.',
 'Affiliation\n          \n\n1 Molecular Biology, Mt. Sinai Medical Center, New York, New York 10029.',
 'Affiliation\n          \n\n1 Department of Medical and Health Sciences, Linköping University, Linköping, Sweden, Kristin.Zeiler@liu.se.']

In [41]:
# Identifiers: the <li> entries of each article's "identifiers" list joined
# with '||', '' when the list is absent (first 5 shown).
['||'.join(item.text.strip() for item in id_list.find_all("li")) if id_list is not None else ''
    for id_list in (article.find("ul", class_="identifiers") for article in article_ls)][:5]

['PMID:\n    \n  \n30473207||DOI:\n    \n  \n\n      10.1016/j.beem.2018.10.005',
 'PMID:\n    \n  \n21822635||PMCID:\n    \n  \n\n      PMC3258403||DOI:\n    \n  \n\n      10.1007/s10730-011-9159-4',
 'PMID:\n    \n  \n2193072||DOI:\n    \n  \n\n      10.1007/BF01135577',
 'PMID:\n    \n  \n2016564||DOI:\n    \n  \n\n      10.1007/BF01131585',
 'PMID:\n    \n  \n24395218||DOI:\n    \n  \n\n      10.1007/s11019-013-9538-5']

In [42]:
# Abstract text of each article, '' when the element is absent (first 5 shown).
[abstract.text.strip() if abstract is not None else ''
    for abstract in (article.find("div", class_="abstract-content") for article in article_ls)][:5]

['Natural Cycle IVF (NC-IVF) with and without modifications is being increasingly performed. NC-IVF and conventional gonadotropin-stimulated IVF (cIVF) should not be understood as competing treatments, but as complementary treatments with different target groups and to some extent other indications. NC-IVF is particularly interesting for couples who wish to save money, wish a treatment with as few risks as possible and for women who would like to avoid selection and cryopreservation of embryos. NC-IVF therefore contributes to the concept of individualized and patient-oriented therapy. The time to pregnancy is slightly longer than with conventional IVF. NC-IVF is particularly suitable for younger women and for women with a very low ovarian reserve. In this article, the principles of NC-IVF, i.e. monofollicular IVF without gonadotropin stimulation, are described and the technical differences to cIVF, advantages and disadvantages, perinatal outcome and indications for NC-IVF are highlight

In [46]:
# Keywords: text of the parent of the first "sub-title" <strong> in each
# article, '' when the element is absent (first 5 shown).
[sub_title.parent.text.strip() if sub_title is not None else ''
    for sub_title in (article.find("strong", class_="sub-title") for article in article_ls)][:5]

['Keywords:\n        \n      \n      IVF; Natural Cycle IVF; delivery rate; pregnancy rate.',
 '',
 '',
 '',
 '']

In [47]:
# Citation status: the <li> entries of each article's "stats" block joined
# with '||', '' when the block is absent (first 5 shown).
['||'.join(item.text.strip() for item in stats.find_all("li")) if stats is not None else ''
    for stats in (article.find("div", class_="stats") for article in article_ls)][:5]

['Cited by 7\n              articles',
 'Cited by 5\n              articles||25\n              references',
 'Cited by 7\n              articles||45\n              references',
 '48\n              references',
 'Cited by 2\n              articles||24\n              references']

In [57]:
# Full-text links: every <a> in each article's "full-text-links-list" rendered
# as "label(href)" and joined with '||', '' when the block is absent (first 5 shown).
['||'.join('{}({})'.format(anchor.text.strip(), anchor["href"]) for anchor in links.find_all("a"))
    if links is not None else ''
    for links in (article.find("div", class_="full-text-links-list") for article in article_ls)][:5]

['Elsevier Science(https://linkinghub.elsevier.com/retrieve/pii/S1521-690X(18)30119-2)',
 'Springer(https://dx.doi.org/10.1007/s10730-011-9159-4)||Free PMC article(https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/21822635/)',
 '',
 '',
 'Springer(https://doi.org/10.1007/s11019-013-9538-5)||Linkoping University Electronic Press(http://liu.diva-portal.org/smash/get/diva2:691759/FULLTEXT01.pdf)']

In [61]:
# Article page URL: the site root plus the href of each article's
# "details-link" anchor, '' when the anchor is absent (first 5 shown).
['https://pubmed.ncbi.nlm.nih.gov' + detail["href"].strip() if detail is not None else ''
    for detail in (article.find("a", class_="details-link") for article in article_ls)][:5]

['https://pubmed.ncbi.nlm.nih.gov/30473207/',
 'https://pubmed.ncbi.nlm.nih.gov/21822635/',
 'https://pubmed.ncbi.nlm.nih.gov/2193072/',
 'https://pubmed.ncbi.nlm.nih.gov/2016564/',
 'https://pubmed.ncbi.nlm.nih.gov/24395218/']