# Import Library

In [1]:
from bs4 import BeautifulSoup
import requests 
import json
import os
from dotenv import load_dotenv

# Scraping Functions

In [5]:
def parseHTML(url):
    '''
    Fetch a page over HTTP and parse it with BeautifulSoup (lxml parser).

    The request carries a browser-like User-Agent. When the
    SCRAPING_USER_EMAIL environment variable is set (it is loaded from the
    .env file), its value is appended to the User-Agent.

    Args: 
        url (str): URL to be parsed

    Returns:
        soup (BeautifulSoup): Parsed HTML object
    '''

    # override=True lets values from .env replace variables already present
    # in the process environment
    load_dotenv(override=True)
    user_email = os.getenv('SCRAPING_USER_EMAIL')

    # The email must be interpolated; a plain string literal would send the
    # text "{user_email}" verbatim. Skip the suffix when the variable is unset
    # so the header never contains "None".
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64)'
    if user_email:
        user_agent = f'{user_agent}; {user_email}'
    headers = {'user-agent': user_agent}

    # Send the GET request; the timeout keeps one stalled connection from
    # blocking the whole scrape indefinitely
    response = requests.get(url, headers=headers, timeout=30)

    # Parse the response body as HTML
    soup = BeautifulSoup(response.text, "lxml")

    return soup

def get_url(href):
    '''
    Build an absolute Alodokter URL from a link target.

    Args:
        href (str): Path on alodokter.com, with or without a leading slash.
            An href that is already an absolute http(s) URL is returned
            unchanged. An empty string yields the site root.
    
    Returns:
        str: Alodokter link to certain page
    '''

    # Already absolute: prefixing the site root would produce a broken URL
    if href.startswith(('http://', 'https://')):
        return href

    # startswith() is safe on an empty string, unlike indexing href[0]
    if not href.startswith('/'):
        href = '/' + href
    return 'https://www.alodokter.com' + href

def get_cvd_page():
    '''
    Fetch the Alodokter heart-disease index page and return its topic menu.

    Args:
        None
    
    Returns:
        The first <ul class="menu-children"> element found on
        https://www.alodokter.com/jantung, or None when the page contains
        no such element.
    ''' 

    # Landing page that collects the heart-disease articles
    index_url = get_url('jantung')
    index_page = parseHTML(index_url)

    # The list that holds the heart-disease topics
    return index_page.find('ul', class_='menu-children')

# Heart-disease articles on Alodokter come in two page layouts:
# 1. Pages whose subtopics are each placed on a separate page
# 2. Pages that have no disease container (disease-container) and no separate subtopic pages

def get_topic_page_with_disease_container(url):
    '''
    Scrape a topic whose subtopics are spread over separate pages
    (example: https://www.alodokter.com/hipertensi).

    Args:
        url (str): URL to be parsed
    
    Returns:
        topic (dict): Dictionary with the keys "topic", "url" and "subtopics".
                        "subtopics" is a list of dictionaries with the keys
                        "subtopic", "subtopic_url" and "content".
    '''

    page = parseHTML(url)
    topic_title = page.find('div', class_='title-tag-container').find('h2').text
    tab_links = page.find('div', class_='disease-tag-container').find_all('a')

    collected = []
    for link in tab_links:
        label = link.div.text.strip()

        if label != 'Pengertian':
            # Any other tab points to its own page, which may yield
            # several subtopic entries
            collected.extend(get_subtopic_content(get_url(link['href'])))
            continue

        # The "Pengertian" tab is read from the page that is already loaded
        heading = page.find('div', class_='title-tag-container').h1.text.strip()
        body = page.find('div', class_='post-content')

        lines = []
        for node in body.find_all():
            if node.name == 'h3':
                # Collection stops at the first h3
                break
            if node.name == 'p':
                lines.append(node.text.strip() + '\n')
            elif node.name == 'ul':
                lines.extend(
                    '- ' + item.text.lstrip('\n') + '\n'
                    for item in node.find_all('li')
                )

        collected.append({
            "subtopic": heading,
            "subtopic_url": url,
            "content": ''.join(lines),
        })

    return {
        "topic": topic_title,
        "url": url,
        "subtopics": collected,
    }

def get_subtopic_content(url):
    '''
    Scrape one subtopic page into a list of entries: the lead section first,
    then one entry per h3/h4/p element that contains a <strong> tag.

    Args:
        url (str): URL to be parsed
    
    Returns:
        subtopics (list): List of dictionaries with the keys "subtopic",
                          "subtopic_url" and "content".
    '''

    page = parseHTML(url)
    page_title = page.find('div', class_='title-tag-container').h1.text.strip()

    # Lead section: everything in the post body up to the first h3 or the
    # first element that contains a <strong>
    lead_lines = []
    for node in page.find('div', class_='post-content').find_all():
        if node.name == 'h3' or node.find('strong') is not None:
            break
        if node.name == 'p':
            lead_lines.append(node.text.strip() + '\n')
        elif node.name == 'ul':
            for item in node.find_all('li'):
                lead_lines.append('- ' + item.text.lstrip('\n') + '\n')

    results = [{
        "subtopic": page_title,
        "subtopic_url": url,
        "content": ''.join(lead_lines),
    }]

    def is_section_heading(tag):
        # h3/h4/p elements that contain a <strong> start a new section
        return tag.name in ['h3', 'h4', 'p'] and tag.find('strong')

    # Additional sections; the search covers the whole parsed document
    for heading in page.find_all(is_section_heading):
        section_lines = []

        sibling = heading.find_next_sibling()
        while sibling is not None:
            # A non-list sibling containing <strong> ends the section;
            # lists are always consumed
            if sibling.name != 'ul' and sibling.find('strong') is not None:
                break
            if sibling.name == 'p':
                section_lines.append(sibling.text.strip() + '\n')
            elif sibling.name == 'ul':
                for item in sibling.find_all('li'):
                    section_lines.append('- ' + item.text.lstrip('\n') + '\n')
            sibling = sibling.find_next_sibling()

        results.append({
            "subtopic": page_title + ': ' + heading.text.strip(),
            "subtopic_url": url,
            "content": ''.join(section_lines),
        })

    return results

def get_topic_page(url):
    '''
    Scrape a topic whose subtopics all live on a single page
    (example: https://www.alodokter.com/lemah-jantung).

    Args:
        url (str): URL to be parsed

    Returns:
        topic (dict): Dictionary containing topic title, url, and subtopics. 
                        Subtopics is a list of dictionaries containing subtopic title, url, and content.
    '''

    # This function extracts content from article pages where every subtopic is on the same page
    # example: https://www.alodokter.com/lemah-jantung
    
    html_parsed = parseHTML(url)
    subtopics = []

    # Collect the lead section of the page; its subtopic title is
    # "Pengertian " followed by the page's h1 text
    topic_title = html_parsed.find('div', class_='title-tag-container').h1.text.strip()
    subtopic_title = "Pengertian " + topic_title

    content_parse = html_parsed.find('div', class_='post-content')
    subtopic_content = ''
    for parse in content_parse.find_all():
        # The lead section ends at the first h3 or h4
        if parse.name == 'h3' or parse.name == 'h4':
            break
        if parse.name == 'p':
            # Append the paragraph text followed by a newline
            subtopic_content += parse.text.strip() + '\n'
        elif parse.name == 'ul':
            # Render each list item as a "- " bullet line
            for li in parse.find_all('li'):
                subtopic_content += '- ' + li.text.lstrip('\n') + '\n'

    subtopic = {
        "subtopic": subtopic_title,
        "subtopic_url": url,
        "content": subtopic_content
    }

    subtopics.append(subtopic)
    
    # Collect the additional subtopics on the same page: every h3/h4/p in the
    # whole document that contains a <strong> or <b> starts a new entry
    parent_elements = html_parsed.find_all(
        lambda tag: tag.name in ['h3', 'h4', 'p'] and (tag.find('strong') or tag.find('b'))
    )
    additional_title = ''

    for parse in parent_elements:
        if parse.name == 'h3':
            # An h3 starts a new group; remember its text as the prefix for later h4 titles
            subtopic_title = parse.text.strip()
            additional_title = subtopic_title.strip()
        elif parse.name == 'h4':
            # An h4 is titled "<last h3 text>: <h4 text>"
            subtopic_title = additional_title + ': ' + parse.text.strip()
        # NOTE(review): for a matching <p> neither branch runs, so subtopic_title
        # keeps the value left by the previous iteration (or the "Pengertian ..."
        # title on the first one) — confirm this is intended
        subtopic_content = ''

        # Walk the following siblings. A <ul> is consumed unless it is followed
        # by a stop; any other element stops the walk when it contains
        # <strong>/<b>, and an h3 always stops it.
        content_parse = parse.find_next_sibling()
        while content_parse is not None and (content_parse.name == 'ul' or (content_parse.find('strong') is None and content_parse.find('b') is None)) and content_parse.name != 'h3':
            if content_parse.name == 'p':
                subtopic_content += content_parse.text.strip() + '\n'
            elif content_parse.name == 'ul':
                for li in content_parse.find_all('li'):
                    subtopic_content += '- ' + li.text.lstrip('\n') + '\n'
            content_parse = content_parse.find_next_sibling()

        subtopic = {
            "subtopic": subtopic_title,
            "subtopic_url": url,
            "content": subtopic_content,
        }

        subtopics.append(subtopic)
    
    topic = {
        "topic": topic_title,
        "url" : url,
        "subtopics" : subtopics
    }

    return topic

def get_topic_content(url):
    ''' 
    Scrape one topic, choosing the scraper that matches the page layout.

    The page is fetched once here only to detect the layout; the chosen
    scraper receives the URL, not the parsed page.

    Args:
        url (str): URL to be parsed
    Returns:
        topic (dict): Dictionary containing topic title, url, and subtopics. 
                        Subtopics is a list of dictionaries containing subtopic title, url, and content.
    '''

    page = parseHTML(url)

    # A <div class="disease-container"> selects the disease-container scraper
    has_container = page.find('div', class_='disease-container') is not None
    scraper = get_topic_page_with_disease_container if has_container else get_topic_page

    return scraper(url)

def get_all_content():
    '''
    Scrape every topic listed in the heart-disease menu, printing a
    "[current/total] url" progress line before each topic is fetched.

    Args:
        None
    Returns:
        topic_data (list): List of dictionaries containing topic title, url, and subtopics. 
                            Subtopics is a list of dictionaries containing subtopic title, url, and content.
    '''

    # Topic menu of the heart-disease index page (https://www.alodokter.com/jantung)
    menu = get_cvd_page()

    # Look the items up once; the total is loop-invariant
    items = menu.find_all('li', class_='index-item')
    total = len(items)

    topic_data = []

    # Scrape each topic together with the content of its subtopics
    for num, item in enumerate(items, start=1):
        topic_url = get_url(item.find('a')['href'])
        print(f'[{num}/{total}] {topic_url}')
        topic_data.append(get_topic_content(topic_url))

    print('Scraping done!')
    return topic_data

# Code Execution

In [6]:
# Run the scraping function and keep the result in `data`
data = get_all_content()

[1/45] https://www.alodokter.com/hipertensi
[2/45] https://www.alodokter.com/penyakit-jantung
[3/45] https://www.alodokter.com/gagal-jantung
[4/45] https://www.alodokter.com/serangan-jantung
[5/45] https://www.alodokter.com/hipotensi
[6/45] https://www.alodokter.com/angin-duduk
[7/45] https://www.alodokter.com/emboli
[8/45] https://www.alodokter.com/aritmia
[9/45] https://www.alodokter.com/syok-kardiogenik
[10/45] https://www.alodokter.com/lemah-jantung
[11/45] https://www.alodokter.com/penyakit-katup-jantung
[12/45] https://www.alodokter.com/jantung-berdebar
[13/45] https://www.alodokter.com/kardiomegali
[14/45] https://www.alodokter.com/tetralogy-of-fallot
[15/45] https://www.alodokter.com/takikardia
[16/45] https://www.alodokter.com/iskemia
[17/45] https://www.alodokter.com/bradikardia
[18/45] https://www.alodokter.com/miokarditis
[19/45] https://www.alodokter.com/hipertensi-pulmonal
[20/45] https://www.alodokter.com/endokarditis
[21/45] https://www.alodokter.com/patent-foramen-oval

In [7]:
# Save the data to a JSON file.
# An explicit UTF-8 encoding makes the output independent of the platform
# default, and ensure_ascii=False writes non-ASCII characters as-is instead
# of \uXXXX escapes (json.load reads both forms identically).
with open('../data/cvd.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)