# Import Libraries

In [2]:
from bs4 import BeautifulSoup
import requests 
import json
import os
from dotenv import load_dotenv

# Scraping Functions

In [3]:
def parseHTML(url):
    """Download ``url`` and return its HTML parsed with BeautifulSoup (lxml).

    The request carries a ``user-agent`` header that ends with the contact
    email read from the ``SCRAPING_USER_EMAIL`` environment variable (the
    ``.env`` file is reloaded on every call).

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the response body.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the server does not answer within the timeout.
    """
    # override=True lets values from .env replace already-set variables.
    load_dotenv(override=True)
    user_email = os.getenv('SCRAPING_USER_EMAIL')

    user_agent = 'Mozilla/5.0 (X11; Linux x86_64)'
    if user_email:
        # The original literal lacked the f-prefix, so the text "{user_email}"
        # itself was sent; interpolate the real address instead.
        user_agent = f'{user_agent}; {user_email}'
    headers = {'user-agent': user_agent}

    # Without a timeout requests may block forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=30)

    # Parse HTML with BeautifulSoup
    return BeautifulSoup(response.text, 'lxml')

def get_url(href):
    """Turn a site-relative ``href`` into an absolute alodokter.com URL.

    Args:
        href: Path with or without a leading ``/``. An empty string maps to
            the site root, and an ``href`` that is already an absolute
            ``http(s)`` URL is returned unchanged.

    Returns:
        The absolute URL as a string.
    """
    # Already absolute: prepending the base would produce a broken URL.
    if href.startswith(('http://', 'https://')):
        return href
    # startswith() is safe on '' where href[0] would raise IndexError.
    if not href.startswith('/'):
        href = '/' + href
    return 'https://www.alodokter.com' + href

def get_cvd_page():
    """Fetch the 'jantung' page and return its ``ul.menu-children`` element.

    Returns:
        The first ``<ul class="menu-children">`` found on the page, or
        ``None`` when the page has no such element.
    """
    soup = parseHTML(get_url('jantung'))
    return soup.find('ul', class_='menu-children')

def get_topic_page_with_disease_container(url):
    """Scrape a topic page whose sections are listed in a disease tag menu.

    Every ``<a>`` inside ``div.disease-tag-container`` is one section:

    * the section labelled 'Pengertian' is read from the page at ``url``
      itself (``<p>``/``<ul>`` text of ``div.post-content`` up to the first
      ``<h3>``);
    * every other section is fetched from its own link through
      ``get_subtopic_content`` and all returned subtopics are added.

    Args:
        url: Absolute URL of the topic page.

    Returns:
        dict with keys ``"topic"`` (text of the ``<h2>`` in
        ``div.title-tag-container``), ``"url"`` and ``"subtopics"`` (list of
        dicts with ``"subtopic"``, ``"subtopic_url"`` and ``"content"``).
    """
    soup = parseHTML(url)
    title_box = soup.find('div', class_='title-tag-container')
    topic_title = title_box.find('h2').text
    section_links = soup.find('div', class_='disease-tag-container').find_all('a')

    subtopics = []
    for link in section_links:
        if link.div.text.strip() != 'Pengertian':
            # Section lives on its own page; it may yield several subtopics.
            subtopics.extend(get_subtopic_content(get_url(link['href'])))
            continue

        # 'Pengertian' is the current page: its title is the page <h1>.
        intro_title = title_box.h1.text.strip()
        body = soup.find('div', class_='post-content')

        pieces = []
        for node in body.find_all():
            if node.name == 'h3':
                # The introduction ends at the first sub-heading.
                break
            if node.name == 'p':
                pieces.append(node.text.strip() + '\n')
            elif node.name == 'ul':
                pieces.extend(
                    '- ' + item.text.lstrip('\n') + '\n'
                    for item in node.find_all('li')
                )

        subtopics.append({
            "subtopic": intro_title,
            "subtopic_url": url,
            "content": ''.join(pieces)
        })

    return {
        "topic": topic_title,
        "url" : url,
        "subtopics" : subtopics
    }

def get_subtopic_content(url):
    """Scrape one section page and split it into a list of subtopics.

    The first subtopic is titled with the page ``<h1>`` and holds the
    ``<p>``/``<ul>`` text of ``div.post-content`` up to the first ``<h3>`` or
    the first element that contains a ``<strong>`` tag.

    Each ``<h3>``, ``<h4>`` or ``<p>`` on the page that contains a
    ``<strong>`` tag then starts a further subtopic titled
    ``"<page title>: <heading text>"``. Its content is the text of the
    following sibling ``<p>``/``<ul>`` elements, ending at the next non-``ul``
    sibling that contains ``<strong>``.

    Args:
        url: Absolute URL of the section page.

    Returns:
        list of dicts with keys ``"subtopic"``, ``"subtopic_url"`` and
        ``"content"``.
    """
    soup = parseHTML(url)

    # Title of the page; also the prefix of every later subtopic title.
    page_title = soup.find('div', class_='title-tag-container').h1.text.strip()

    post_body = soup.find('div', class_='post-content')
    intro_pieces = []
    for node in post_body.find_all():
        reached_heading = node.name == 'h3' or node.find('strong') is not None
        if reached_heading:
            break
        if node.name == 'p':
            intro_pieces.append(node.text.strip() + '\n')
        elif node.name == 'ul':
            intro_pieces.extend(
                '- ' + item.text.lstrip('\n') + '\n'
                for item in node.find_all('li')
            )

    results = [{
        "subtopic": page_title,
        "subtopic_url": url,
        "content": ''.join(intro_pieces)
    }]

    # Elements acting as headings: h3/h4/p that wrap a <strong> tag.
    headings = soup.find_all(
        lambda tag: tag.name in ['h3', 'h4', 'p'] and tag.find('strong')
    )

    for heading in headings:
        section_title = page_title + ': ' + heading.text.strip()
        section_pieces = []

        sibling = heading.find_next_sibling()
        while sibling is not None:
            # Lists never end a section; anything else holding <strong>
            # is treated as the next heading.
            if sibling.name != 'ul' and sibling.find('strong') is not None:
                break
            if sibling.name == 'p':
                section_pieces.append(sibling.text.strip() + '\n')
            elif sibling.name == 'ul':
                # Render each list item as a '- ' bullet line.
                section_pieces.extend(
                    '- ' + item.text.lstrip('\n') + '\n'
                    for item in sibling.find_all('li')
                )
            sibling = sibling.find_next_sibling()

        results.append({
            "subtopic": section_title,
            "subtopic_url": url,
            "content": ''.join(section_pieces),
        })

    return results

def get_topic_page(url):
    """Scrape a single-page topic and split it into subtopics.

    The page is divided as follows:

    * an introductory subtopic titled ``"Pengertian <page title>"`` built from
      the ``<p>``/``<ul>`` text of ``div.post-content`` that precedes the
      first ``<h3>`` or ``<h4>``;
    * one subtopic for every ``<h3>``, ``<h4>`` or ``<p>`` on the page that
      contains a ``<strong>`` or ``<b>`` tag. Its content is the text of the
      following sibling ``<p>``/``<ul>`` elements, ending at the next
      ``<h3>`` or the next non-``ul`` sibling containing ``<strong>``/``<b>``.

    Titles: an ``<h3>`` uses its own text and becomes the prefix for later
    headings; an ``<h4>`` or bold ``<p>`` is titled
    ``"<last h3 text>: <own text>"``, or just its own text when no ``<h3>``
    has been seen yet.

    Args:
        url: Absolute URL of the topic page.

    Returns:
        dict with keys ``"topic"`` (page ``<h1>`` text), ``"url"`` and
        ``"subtopics"`` (list of dicts with ``"subtopic"``,
        ``"subtopic_url"`` and ``"content"``).

    Raises:
        AttributeError: if ``div.title-tag-container``, its ``<h1>`` or
            ``div.post-content`` is missing from the page.
    """
    html_parsed = parseHTML(url)
    subtopics = []

    # First Subtopic Title
    topic_title = html_parsed.find('div', class_='title-tag-container').h1.text.strip()

    content_parse = html_parsed.find('div', class_='post-content')
    intro_parts = []
    for parse in content_parse.find_all():
        if parse.name in ('h3', 'h4'):
            # The introduction stops at the first sub-heading.
            break
        if parse.name == 'p':
            intro_parts.append(parse.text.strip() + '\n')
        elif parse.name == 'ul':
            intro_parts.extend(
                '- ' + li.text.lstrip('\n') + '\n' for li in parse.find_all('li')
            )

    subtopics.append({
        "subtopic": "Pengertian " + topic_title,
        "subtopic_url": url,
        "content": ''.join(intro_parts)
    })

    # Heading-like elements are searched in the whole document, not only
    # inside div.post-content.
    parent_elements = html_parsed.find_all(
        lambda tag: tag.name in ['h3', 'h4', 'p'] and (tag.find('strong') or tag.find('b'))
    )
    additional_title = ''

    # Next Subtopic Title
    for parse in parent_elements:
        heading_text = parse.text.strip()
        if parse.name == 'h3':
            subtopic_title = heading_text
            additional_title = heading_text
        elif additional_title:
            # h4 and bold <p> headings are nested under the last h3.
            # Previously a bold <p> kept the stale title of the preceding
            # subtopic and its own text was lost.
            subtopic_title = additional_title + ': ' + heading_text
        else:
            # No h3 seen yet: avoid a title that starts with ': '.
            subtopic_title = heading_text

        content_parts = []
        content_parse = parse.find_next_sibling()
        while (
            content_parse is not None
            and content_parse.name != 'h3'
            and (
                content_parse.name == 'ul'
                or (content_parse.find('strong') is None and content_parse.find('b') is None)
            )
        ):
            if content_parse.name == 'p':
                content_parts.append(content_parse.text.strip() + '\n')
            elif content_parse.name == 'ul':
                content_parts.extend(
                    '- ' + li.text.lstrip('\n') + '\n'
                    for li in content_parse.find_all('li')
                )
            content_parse = content_parse.find_next_sibling()

        subtopics.append({
            "subtopic": subtopic_title,
            "subtopic_url": url,
            "content": ''.join(content_parts),
        })

    return {
        "topic": topic_title,
        "url" : url,
        "subtopics" : subtopics
    }

def get_topic_content(url):
    """Scrape the topic at ``url`` with the scraper matching its layout.

    Pages containing a ``div.disease-container`` are handled by
    ``get_topic_page_with_disease_container``; all others by
    ``get_topic_page``. The chosen scraper receives ``url`` (not the parsed
    page), so it fetches the page again itself.

    Args:
        url: Absolute URL of the topic page.

    Returns:
        The topic dict produced by the selected scraper.
    """
    soup = parseHTML(url)
    has_disease_container = soup.find('div', class_='disease-container') is not None
    scraper = (
        get_topic_page_with_disease_container
        if has_disease_container
        else get_topic_page
    )
    return scraper(url)

def get_all_content():
    """Scrape every topic linked from the menu returned by ``get_cvd_page``.

    Each ``<li class="index-item">`` contributes the ``href`` of its first
    ``<a>``, which is resolved with ``get_url`` and scraped with
    ``get_topic_content``.

    Returns:
        list of topic dicts, in menu order.
    """
    menu = get_cvd_page()
    return [
        get_topic_content(get_url(entry.find('a')['href']))
        for entry in menu.find_all('li', class_='index-item')
    ]

In [3]:
# Scrape all topics; `data` holds the list returned by get_all_content()
data = get_all_content()

In [6]:
# Serialize the scraped topics as 4-space indented JSON into ../data/cvd.json
with open('../data/cvd.json', 'w') as out_file:
    out_file.write(json.dumps(data, indent=4))