In [1]:
!pip install -q html2text beautifulsoup4

In [2]:
import urllib.request as urllib2
import urllib.parse as parseurl
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import html2text as h2t
import csv

import os
import re

In [3]:
import requests
from xml.etree import ElementTree as ET

# Download the sitemap; later cells parse `response.content`.
url = "https://www.dbs.com.sg/sitemap-personal-2023.xml"
response = requests.get(url, timeout=30)  # do not hang the kernel on a stalled connection
response.raise_for_status()  # stop here instead of parsing an HTTP error page as XML
# Show a short preview only; dumping the whole document would flood the cell output.
print(response.content[:500])



In [4]:
# Parse the downloaded sitemap bytes into an ElementTree element.
root = ET.fromstring(response.content)
# The printed tag is namespace-qualified ("{...}urlset"), so tag names cannot
# be matched by their bare local name.
print(root)

<Element '{http://www.sitemaps.org/schemas/sitemap/0.9}urlset' at 0x7b5dc7802bb0>


In [5]:
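# Abandoned first attempt, kept for reference: an unqualified './/url/loc'
# search matches nothing because every sitemap element carries the
# '{http://www.sitemaps.org/schemas/sitemap/0.9}' namespace prefix.
#urls = []
#for url_tag in root.findall('.//url/loc'):
#  urls.append(url_tag.text)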

In [6]:
urls = []  # rebuilt from scratch each time this cell runs


def dig_hierarchy(element):
  """Recursively collect support-page URLs from a sitemap element tree.

  Every element whose tag ends with ``loc`` is inspected. Its text is kept
  when it starts with ``https://www.dbs.com.sg/personal/support/``, ends with
  ``.html`` and does not contain ``/home.html``, ``/header`` or
  ``/footer.html``. Matching URLs are appended to the module-level ``urls``
  list and printed.

  Args:
    element: The element to inspect; all of its descendants are visited too.
  """
  if element.tag.endswith('loc'):
    # `text` is None for an empty <loc/>, and pretty-printed sitemaps pad the
    # URL with whitespace; normalise both before testing the value.
    url = (element.text or '').strip()
    if (url.startswith('https://www.dbs.com.sg/personal/support/')
        and url.endswith('.html')
        and '/home.html' not in url
        and '/header' not in url
        and '/footer.html' not in url):
      urls.append(url)
      print(url)
  for child in element:
    dig_hierarchy(child)

# Walk the parsed sitemap from its root; matching URLs are appended to `urls`.
dig_hierarchy(root)
# Report how many URLs were collected.
print(len(urls))

https://www.dbs.com.sg/personal/support/bank-ibanking-digital-token-benefits.html
https://www.dbs.com.sg/personal/support/bank-ibanking-digital-token-requirements.html
https://www.dbs.com.sg/personal/support/bank-ibanking-replace-secure-device.html
https://www.dbs.com.sg/personal/support/bank-local-funds-transfer-future-transfer.html
https://www.dbs.com.sg/personal/support/bank-ssb-paylah-bill-payment.html
https://www.dbs.com.sg/personal/support/card-application-cpf-web-linkup-service.html
https://www.dbs.com.sg/personal/support/card-payment-cut-off-times.html
https://www.dbs.com.sg/personal/support/card-payment-giro-application.html
https://www.dbs.com.sg/personal/support/digi.html
https://www.dbs.com.sg/personal/support/general-card-security-transaction-alerts.html
https://www.dbs.com.sg/personal/support/guide-homeloan-repricing-documents.html
https://www.dbs.com.sg/personal/support/guide-unexpected-moments.html
https://www.dbs.com.sg/personal/support/investment-vickers-link-cdp-to-v

In [7]:
def fetch_html(url):
  """Download a page and return its body as text.

  Args:
    url: Address of the page to fetch.

  Returns:
    The response body, decoded with the charset declared in the response
    headers, or as UTF-8 when the headers declare none.

  Raises:
    urllib.error.URLError: Propagated from ``urlopen`` when the request fails.
    UnicodeDecodeError: If the body is not valid in the selected charset.
  """
  # NOTE(review): presumably the site rejects urllib's default User-Agent,
  # hence the browser-like header; confirm before removing it.
  request = urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0"})
  # The context manager closes the connection even if read/decode raises.
  with urllib2.urlopen(request) as response:
    charset = response.headers.get_content_charset() or "utf-8"
    return response.read().decode(charset)

In [8]:
def extract_page_name(url):
    """Extracts the page name from a URL.

    The page name is the last non-empty segment of the URL *path* with
    everything from its first "." onwards removed. The query string and the
    fragment are ignored, so slashes inside them cannot be mistaken for path
    separators, and a trailing "/" does not produce an empty name.

    Args:
        url: The URL to extract the page name from.

    Returns:
        The page name as a string. A value containing no "/" at all is
        returned percent-quoted and otherwise unchanged. A URL without a
        path falls back to the host name, cut at its first ".".
    """
    if "/" not in url:
        return parseurl.quote(url)

    parts = parseurl.urlsplit(url)
    # Last path segment, ignoring a trailing slash; host name if there is no path.
    segment = parts.path.rstrip("/").rsplit("/", 1)[-1] or parts.netloc

    # Remove any file extension (everything from the first dot).
    return segment.partition(".")[0]

In [9]:
def clean_markdown(url):
  """Fetch a page and convert it to cleaned-up Markdown.

  The first ``<nav>`` element is removed and the texts of its ``<li>`` items,
  joined with ``' -> '``, become the title. When that yields nothing, the
  page name taken from the URL is used instead. The title is inserted as an
  ``<h1>`` at the top of the document. Images, anchors without ``href``,
  CSS ``<link>`` tags and the feedback-form elements are removed, and every
  remaining link is resolved against ``url``.

  Args:
    url: Address of the page to fetch and convert.

  Returns:
    A ``(markdown, title)`` tuple. If anything fails, the error is printed
    and ``('', parseurl.quote(url))`` is returned instead of raising.
  """
  try:
    html = fetch_html(url)
    soup = BeautifulSoup(html, 'html.parser')

    # replace top nav bar list of links with a section text
    title = ''
    nav = soup.find("nav")
    if nav:
      nav_texts = [li.text for li in nav.find_all("li")]
      nav.extract()
      title = ' -> '.join(nav_texts)
    if not title.strip():
      # no nav, or a nav without list items: fall back to the URL's page name
      title = extract_page_name(url)

    # add header from nav or from the URL's page name
    header = soup.new_tag("h1")
    header.string = title
    # html.parser does not add a missing <body>; for such documents put the
    # header at the top of the document instead of failing on None.
    (soup.body or soup).insert(0, header)

    # remove all images
    for img in soup.find_all("img"):
      img.extract()

    # remove anchors without href and make every link absolute
    for a in soup.find_all("a"):
      if not a.has_attr("href"):
        a.extract()
      else:
        # urljoin leaves absolute URLs untouched, so no substring test on
        # "http" is needed (it misfired on paths that merely contain "http").
        a["href"] = urljoin(url, a["href"])

    # remove CSS links
    for link in soup.find_all("link"):
      if link.get("type") == "text/css":
        link.extract()

    # remove feedback form
    for tag in soup.find_all(class_=["survey-wrapper", "support-form-fields", "feedback-msg"]):
      tag.extract()
    markdown = h2t.html2text(str(soup), bodywidth=0)
  except Exception as e:
    # deliberate best-effort: one bad page must not abort the whole crawl
    print(f"Error occurred for URL: {url}")
    print(e)
    return '', parseurl.quote(url)
  return markdown, title

In [10]:
# Mount Google Drive at /content/drive (Google Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Convert every collected URL to a Markdown file and record which file came
# from which URL in file_map.csv.
filemap = []
folder = '/content/drive/MyDrive/work/DBS/support'
os.makedirs(folder, exist_ok=True)  # the first write fails if the folder is missing
used_filenames = set()
for url in urls:
  markdown, title = clean_markdown(url)
  if not markdown:
    # clean_markdown returns '' after reporting a failure; do not save or
    # map an empty file for that page
    continue

  # create md file with title as file name
  stem = re.sub(r"[^\w\s_\-\.]", "", title).strip() # remove special characters
  stem = re.sub(" - ", "-", stem) # remove spaces around hyphens
  stem = re.sub(r"[\s]+", "_", stem) # replace spaces with underscores
  stem = stem.lower() or "untitled" # a title made only of special characters leaves nothing
  filename = stem + ".md"
  # two pages can share a title; number later ones instead of overwriting
  suffix = 2
  while filename in used_filenames:
    filename = f"{stem}-{suffix}.md"
    suffix += 1
  used_filenames.add(filename)
  filemap.append((filename, url))

  filepath = os.path.join(folder, filename)
  # explicit encoding: the platform default may not be able to encode the page text
  with open(filepath, "w", encoding="utf-8") as f:
    f.write(markdown)
  print("File saved to:", filepath)

mapfile = os.path.join(folder, "file_map.csv")
with open(mapfile, "w", newline="", encoding="utf-8") as file:
  writer = csv.writer(file)
  header = ['FileName', 'URL']
  writer.writerow(header)
  writer.writerows(filemap)

File saved to: /content/drive/MyDrive/work/DBS/support/bank-ibanking-digital-token-benefits.md
File saved to: /content/drive/MyDrive/work/DBS/support/bank-ibanking-digital-token-requirements.md
File saved to: /content/drive/MyDrive/work/DBS/support/bank-ibanking-replace-secure-device.md
File saved to: /content/drive/MyDrive/work/DBS/support/help_support-banking-funds_transfer_in_advance.md
File saved to: /content/drive/MyDrive/work/DBS/support/help_support-banking-pay_bills_using_dbs_paylah.md
File saved to: /content/drive/MyDrive/work/DBS/support/card-application-cpf-web-linkup-service.md
File saved to: /content/drive/MyDrive/work/DBS/support/help_support-credit_cards-credit_card_payment_cut-off_times.md
File saved to: /content/drive/MyDrive/work/DBS/support/help_support-credit_cards-set_up_giro_payment_for_dbsposb_credit_card_bill.md
File saved to: /content/drive/MyDrive/work/DBS/support/digi.md
File saved to: /content/drive/MyDrive/work/DBS/support/general-card-security-transaction-