In [1]:
import json
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [2]:
# Medscape listings to crawl. Every entry is a dict with three string fields:
#   "index" - listing id that get_article_page appends to the Medscape URL prefix
#   "type"  - "Journal" or "Article"; decides which content scraper is called
#   "topic" - stored in each JSON record and used as the output sub-directory
# NOTE(review): "Psychiatirc" looks like a misspelling of "Psychiatric"; it is
# left untouched here because the topic string ends up in output paths.
journals = [
    {"index": listing_id, "type": kind, "topic": subject}
    for kind, rows in (
        ("Journal", (
            ("7008", "Critical Care Nursing"),
            ("7611", "Geriatrics"),
            ("6138", "Health Diversity"),
            ("7810", "Mental Health & Psychiatirc Nursing"),
            ("10204", "Nurse Career & Education"),
            ("7657", "Nurse Practitioners"),
            ("7807", "OB GYN & Women's Health Nursing"),
            ("6132", "Pain Management"),
            ("7624", "Palliative Care"),
            ("7666", "Patient Safety"),
            ("7808", "Pediatrics Neonatal Care Nursing"),
            ("7809", "Perioperative Nursing"),
            ("10645", "Zika Virus"),
        )),
        ("Article", (
            ("3719", "Critical Care Nursing"),
            ("3746", "Geriatrics"),
            ("3787", "Mental Health & Psychiatirc Nursing"),
            ("6153", "Health Diversity"),
            ("10202", "Nurse Career & Education"),
            ("3793", "Nurse Practitioners"),
            ("3797", "OB GYN & Women's Health Nursing"),
            ("6147", "Pain Management"),
            ("3546", "Palliative Care"),
            ("3807", "Patient Safety"),
            ("3813", "Pediatrics Neonatal Care Nursing"),
            ("3818", "Perioperative Nursing"),
            ("10643", "Zika Virus"),
        )),
    )
    for listing_id, subject in rows
]

# Care-protocol pages to download. Every entry is a dict with two string fields:
#   "type"  - protocol category; URL path segment and output sub-directory
#   "title" - protocol slug; last URL path segment and source of the file name
# The groups below are listed in crawl order, which is why the preventive
# category appears twice.
protocols = [
    {"type": category, "title": slug}
    for category, slugs in (
        ("preventive-health-care-protocols", (
            "adult-vaccination",
            "body-mass-index-control",
            "cancer-screening",
            "cardiovascular-risk-assessment",
        )),
        ("administrative-care-protocols", (
            "gpfirst",
            "health-plan",
        )),
        ("preventive-health-care-protocols", (
            "smoking-cessation",
        )),
        ("chronic-care-protocols", (
            "pre-diabetes-mellitus",
            "diabetes-mellitus",
            "hypertension",
            "lipid-disorders",
            "multimorbidity-diabetes-hypertension-and-hyperlipidaemia",
        )),
    )
    for slug in slugs
]

# Number of listing pages to scrape for every entry in `journals`; pages are
# requested with the suffixes "_0" .. "_<num_pg - 1>". Each page has 20-22 articles.
num_pg = 2

# Medscape listing URL prefix; a journal "index" and "_<page>" are appended to it.
medscape_url = "https://www.medscape.com/index/list_"

# Primarycare.sg care-protocols base URL; "<type>/<title>" of a protocol is appended to it.
primarycare_url = "https://www.primarycarepages.sg/healthier-sg/care-protocols/"

In [3]:
def find_page(main_url, timeout=30):
    """Return the URLs of the continuation pages of a multi-page article.

    The page at ``main_url`` is downloaded and every
    ``<div class="sections-nav">`` in it is inspected. For a navigation block
    with ``n`` ``<li>`` descendants, the URLs ``main_url + "_2"`` up to
    ``main_url + "_<n - 1>"`` are produced.
    NOTE(review): the last ``<li>`` is not counted as a page - presumably it is
    a "next" control; confirm against the live markup.

    Args:
        main_url: URL of the first page of the article.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: Continuation URLs in page order, each listed once. Empty if
        the page cannot be retrieved or contains no section navigation.
    """
    try:
        response = requests.get(main_url, timeout=timeout)
    except requests.RequestException as exc:
        # A network failure is reported like a bad status instead of aborting the crawl.
        print("Failed to retrieve page:", exc)
        return []

    if response.status_code != 200:
        print("Failed to retrieve page:", response.status_code)
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    pages = []
    for nav in soup.find_all('div', {'class': 'sections-nav'}):
        item_count = len(nav.find_all('li'))
        for page_no in range(2, item_count):
            page_url = f"{main_url}_{page_no}"
            # The same navigation may be rendered more than once on a page;
            # listing a URL twice would make callers download it twice.
            if page_url not in pages:
                pages.append(page_url)
    return pages

In [4]:
def get_journal_content(main_url, title, topic, path, timeout=30):
    """Download all pages of a journal article and save their text as JSON.

    The first page plus every continuation page reported by ``find_page`` is
    fetched, and the text of each ``<div id="article-content">`` is
    concatenated in page order. Pages that cannot be retrieved are reported
    and skipped, so the saved content may be partial or empty.

    The record ``{"topic", "title", "content"}`` is written to
    ``<path>/<name>.json``, where ``<name>`` is ``title`` reduced to its
    alphanumeric characters ("untitled" if none are left). ``path`` is created
    when missing; an existing file of the same name is overwritten.

    Args:
        main_url: URL of the first page of the article.
        title: Article title; stored in the record and used for the file name.
        topic: Topic label stored in the record.
        path: Output directory.
        timeout: Seconds to wait for each page before giving up.
    """
    text_parts = []
    for url in [main_url] + find_page(main_url):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable page must not abort the remaining pages.
            print("Failed to retrieve page:", exc)
            continue

        if response.status_code != 200:
            print("Failed to retrieve page:", response.status_code)
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        text_parts.extend(
            block.text for block in soup.find_all('div', {'id': 'article-content'})
        )

    record = {
        'topic': topic,
        'title': title,
        'content': "".join(text_parts),
    }

    # A title without any alphanumeric character would otherwise produce ".json".
    file_stem = ''.join(ch for ch in title if ch.isalnum()) or "untitled"

    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, f"{file_stem}.json"), "w") as outfile:
        json.dump(record, outfile, indent=4)

In [5]:
def _find_article_body(soup):
    """Return the element holding the article text, or None if no known layout matches."""
    main = soup.find('div', {'class': 'article__main-content'})
    if main is not None:
        inner = main.find('div', {'check-ads-type': 'true'})
        if inner is not None:
            return inner
    # Some pages use a different tag for the body; try it whenever the first
    # layout is incomplete, not only when its outer container is missing.
    return soup.find('div', {'id': 'article-content'})


def get_article_content(url, title, topic, path, timeout=30):
    """Download a single-page article and save its text as JSON.

    The article text is taken from the ``check-ads-type="true"`` div inside
    ``<div class="article__main-content">``; when that is not present, from
    ``<div id="article-content">``. If the page cannot be retrieved or neither
    element exists, the problem is printed and the content is left empty.

    The record ``{"topic", "title", "content"}`` is always written to
    ``<path>/<name>.json``, where ``<name>`` is ``title`` reduced to its
    alphanumeric characters ("untitled" if none are left). ``path`` is created
    when missing; an existing file of the same name is overwritten.

    Args:
        url: URL of the article.
        title: Article title; stored in the record and used for the file name.
        topic: Topic label stored in the record.
        path: Output directory.
        timeout: Seconds to wait for the server before giving up.
    """
    body = ""
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to retrieve page:", exc)
        response = None

    if response is not None:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            contents = _find_article_body(soup)
            if contents is None:
                print("Article content not found:", url)
            else:
                body = contents.text
        else:
            print("Failed to retrieve page:", response.status_code)

    record = {
        'topic': topic,
        'title': title,
        'content': body,
    }

    # A title without any alphanumeric character would otherwise produce ".json".
    file_stem = ''.join(ch for ch in title if ch.isalnum()) or "untitled"

    # Create the directory here as well so the function works when called directly.
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, f"{file_stem}.json"), "w") as outfile:
        json.dump(record, outfile, indent=4)

In [6]:
def get_article_page(url, timeout=30):
    """Crawl the Medscape listings in ``journals`` and save every linked article.

    For each entry of the module-level ``journals`` list, the listing pages
    ``url + index + "_0"`` .. ``"_<num_pg - 1>"`` are fetched. Every ``<a>``
    inside a ``<div id="archives">`` is resolved against the listing URL,
    printed, and handed to ``get_journal_content`` (type "Journal") or
    ``get_article_content`` (type "Article"), together with the link text as
    title. Output goes to ``edu_output/<type>/<topic>``.

    When a listing page cannot be retrieved, the error is printed and the
    remaining pages of that journal are skipped; the next journal is still
    processed.

    Args:
        url: Listing URL prefix to which the journal index is appended.
        timeout: Seconds to wait for each listing page before giving up.
    """
    for journal in journals:
        part_url = url + journal['index']
        path = f"edu_output/{journal['type']}/{journal['topic']}"
        os.makedirs(path, exist_ok=True)

        for page_no in range(num_pg):
            full_url = f"{part_url}_{page_no}"
            try:
                response = requests.get(full_url, timeout=timeout)
            except requests.RequestException as exc:
                print(f"Error: {exc}")
                break

            if response.status_code != 200:
                print(f"Error: {response.status_code}")
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            for archive in soup.find_all('div', {'id': 'archives'}):
                for anchor in archive.find_all('a'):
                    href = anchor.get('href')
                    if not href:
                        # <a> without a target: nothing to download.
                        continue

                    # urljoin handles protocol-relative ("//host/..."), relative
                    # and already absolute links; a plain text replace of "//"
                    # would corrupt the last two kinds.
                    pg_url = urljoin(full_url, href)
                    if not pg_url.startswith(("http://", "https://")):
                        # e.g. "javascript:" or "mailto:" links.
                        continue

                    print(pg_url)
                    if journal['type'] == "Journal":
                        get_journal_content(pg_url, anchor.text, journal['topic'], path)
                    elif journal['type'] == "Article":
                        get_article_content(pg_url, anchor.text, journal['topic'], path)


In [7]:
# KIV: not called by the cells shown in this notebook.
def find_article_pg(main_url, timeout=30):
    """Collect the link targets of the "Nurses Topics" section of a page.

    The page at ``main_url`` is downloaded and its
    ``<div id="view_specialty">`` blocks are scanned for one containing an
    ``<h2>`` whose text is exactly "Nurses Topics". If several blocks match,
    the last one is used.

    Args:
        main_url: URL of the page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: ``href`` values of the ``<a>`` elements in the matching
        block (anchors without ``href`` are left out). Empty if the page
        cannot be retrieved or no block matches.
    """
    topic = "Nurses Topics"

    try:
        response = requests.get(main_url, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to retrieve page:", exc)
        return []

    if response.status_code != 200:
        print("Failed to retrieve page:", response.status_code)
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    topic_section = None
    for section in soup.find_all('div', {'id': 'view_specialty'}):
        if any(heading.text == topic for heading in section.find_all('h2')):
            topic_section = section

    if topic_section is None:
        # No heading matched; there is nothing to collect links from.
        return []

    hrefs = (anchor.get('href') for anchor in topic_section.find_all('a'))
    return [href for href in hrefs if href]

In [8]:
def get_protocols(url, timeout=30):
    """Download every care protocol in ``protocols`` and save its text as JSON.

    For each entry of the module-level ``protocols`` list, the page
    ``url + "<type>/<title>"`` is fetched and the text of all
    ``<div class="expand-collapse">`` elements is concatenated. The record
    ``{"topic": <type>, "title": <title>, "content": <text>}`` is written to
    ``edu_output/primarycare/<type>/<name>.json``, where ``<name>`` is the
    title reduced to its alphanumeric characters.

    A protocol whose page cannot be retrieved is reported and skipped (no
    file is written for it); the remaining protocols are still processed.

    Args:
        url: Base URL to which ``<type>/<title>`` is appended.
        timeout: Seconds to wait for each page before giving up.
    """
    for protocol in protocols:
        path = f"edu_output/primarycare/{protocol['type']}"
        os.makedirs(path, exist_ok=True)

        full_url = url + f"{protocol['type']}/{protocol['title']}"
        print(full_url)

        try:
            response = requests.get(full_url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Error: {exc}")
            continue

        if response.status_code != 200:
            # One missing page should not prevent the other protocols from
            # being saved, so move on instead of returning.
            print(f"Error: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        body = "".join(
            section.text for section in soup.find_all('div', {'class': 'expand-collapse'})
        )

        record = {
            'topic': protocol['type'],
            'title': protocol['title'],
            'content': body,
        }

        file_stem = ''.join(ch for ch in protocol['title'] if ch.isalnum())
        with open(f"{path}/{file_stem}.json", "w") as outfile:
            json.dump(record, outfile, indent=4)

In [9]:
# Medscape: crawl the listings in `journals`, starting from the Medscape URL prefix
get_article_page(medscape_url)
# Primarycare: download the pages in `protocols`, starting from the care-protocols base URL
get_protocols(primarycare_url)

https://www.medscape.com/s/viewarticle/919856
https://www.medscape.com/s/viewarticle/887264


KeyboardInterrupt: 