In [None]:
import os
import requests

from bs4 import BeautifulSoup

In [None]:
# FAQ pages to scrape, as a two-level mapping:
#   top-level category -> {sub-category label -> FAQ page URL}
# The top-level key is used as the output file name and each sub-category label
# becomes a "###" heading inside that file (see dump_qa_to_file).
# Commented-out entries are pages that were deliberately excluded from scraping.
category_faq_url_dict = {
    "我想要进行国内外转账及汇款": {
        "国内外转账及汇款": "https://www.hsbc.com.cn/help/faqs/transfer-and-payment/",
        "转账及汇款": "https://www.hsbc.com.cn/transfer-payments/faq/",
    },
    "我在使用个人网上银行、手机银行和微信服务号中碰到了疑难杂症，怎么办？": {
        "手机/微信银行": "https://www.hsbc.com.cn/help/faqs/digital-banking/",
        "个人网上银行": "https://www.hsbc.com.cn/ways-to-bank/online/faq/",
    },
    "我想要轻松管理账户、查看账单、了解服务费率及使用我的积分": {
        "账户服务": "https://www.hsbc.com.cn/help/faqs/accounts/",
        # "账户及账单通知": "https://www.hsbc.com.cn/ways-to-bank/online/faq/#notice",
        "汇丰卓越理财": "https://www.hsbc.com.cn/premier/faq/",
    },
    "我有外汇、投资及保险的问题": {
        "外汇、投资及保险": "https://www.hsbc.com.cn/help/faqs/investments-and-wealth-management/",
        "投资": "https://www.hsbc.com.cn/investments/faq/",
        # NOTE(review): the label below looks truncated (possibly meant to end in "服务");
        # it is written verbatim into the output file, so confirm before changing it.
        "保险产品及服": "https://www.hsbc.com.cn/insurance/faq/",
    },
    "我想了解银行存款、个人住房贷款等问题": {
        "银行存款、个人住房贷款": "https://www.hsbc.com.cn/help/faqs/deposit-and-mortgage/",
        "存款服务": "https://www.hsbc.com.cn/deposits/faq/",
        "个人住房贷款": "https://www.hsbc.com.cn/mortgages/faq/",
        # "个人住房贷款商品介绍": "https://www.hsbc.com.cn/mortgages/products/home/", # not qa
    },
    "我想了解信用卡使用、额度管理、年费收取及信用卡积分等问题": {
        "信用卡": "https://www.hsbc.com.cn/help/faqs/credit-cards/",
        "卡片申请": "https://www.hsbc.com.cn/credit-cards/faq/apply/",
        "办卡进度": "https://www.hsbc.com.cn/credit-cards/faq/progress/",
        "卡片激活": "https://www.hsbc.com.cn/credit-cards/faq/activation/",
        "卡片管理": "https://www.hsbc.com.cn/credit-cards/faq/how-to-use/",
        "查账还款": "https://www.hsbc.com.cn/credit-cards/faq/pay-statement/",
        "额度管理": "https://www.hsbc.com.cn/credit-cards/faq/limit/",
        "我的积分": "https://www.hsbc.com.cn/credit-cards/faq/my-points/",
        "安全设置": "https://www.hsbc.com.cn/credit-cards/faq/safety/",
        "账户管理": "https://www.hsbc.com.cn/credit-cards/faq/account-management/",
        "分期管理": "https://www.hsbc.com.cn/credit-cards/faq/instalment-management/",
        "账单分期": "https://www.hsbc.com.cn/credit-cards/faq/bill-instalment/",
        "交易分期": "https://www.hsbc.com.cn/credit-cards/faq/transaction-instalment/",
        "现金分期": "https://www.hsbc.com.cn/credit-cards/faq/cash-instalment/",
        "自动交易分期": "https://www.hsbc.com.cn/credit-cards/faq/auto-instalment/",
        "汇享分": "https://www.hsbc.com.cn/credit-cards/faq/additional-instalment-payment/",
        "荟享贷": "https://www.hsbc.com.cn/credit-cards/faq/aloc/",
        # "商户分期": "https://www.hsbc.com.cn/credit-cards/", # issue
    },
}

In [None]:
def parse_element(element):
    """Extract one question/answer pair from a single FAQ element.

    The question is the stripped text of the first ``h2`` carrying the class
    ``dropdown-text``; if there is no such ``h2`` the first matching ``h3`` is
    used instead. The answer is the stripped text of the first ``div`` whose
    class is ``M-CONTMAST-RW-RBWM O-SMARTSPCGEN-DEV rich-text``.

    Missing tags are detected with explicit ``None`` checks rather than a bare
    ``except``, so genuine failures (and ``KeyboardInterrupt``) are no longer
    silently swallowed.

    Args:
        element: Object exposing a BeautifulSoup-style
            ``find(name, class_=...)`` that returns either ``None`` or a node
            with a ``.text`` string attribute.

    Returns:
        tuple: ``(question, answer)``. Either item is ``None`` when the
        corresponding tag is not present in ``element``.
    """
    question = None
    # Prefer the h2 heading; only fall back to h3 when no h2 is present.
    for heading_tag in ("h2", "h3"):
        heading = element.find(heading_tag, class_="dropdown-text")
        if heading is not None:
            question = heading.text.strip()
            break

    answer_node = element.find("div", class_="M-CONTMAST-RW-RBWM O-SMARTSPCGEN-DEV rich-text")
    answer = answer_node.text.strip() if answer_node is not None else None
    return question, answer

def parse_url(url, timeout=30):
    """Download one FAQ page and return the question/answer pairs found on it.

    Every ``div`` whose class is ``O-SMARTSPCGEN-DEV O-ADVEXP-RW-RBWM row`` is
    handed to ``parse_element``; pairs with a missing or empty question or
    answer are dropped.

    Args:
        url: Address of the FAQ page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            a timeout a stalled connection would block the scrape forever.

    Returns:
        list: ``(question, answer)`` string tuples in page order.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out. The HTTP status code is not checked, so an error page simply
            yields whatever (usually zero) pairs can be parsed from it.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    elements = soup.find_all("div", class_="O-SMARTSPCGEN-DEV O-ADVEXP-RW-RBWM row")
    parsed_pairs = (parse_element(element) for element in elements)
    # Keep only complete pairs: both question and answer must be non-empty.
    return [(q, a) for q, a in parsed_pairs if q and a]

In [None]:
def scrape_qas(category_faq_url_dict):
    """Scrape every FAQ URL in the two-level category mapping.

    Args:
        category_faq_url_dict: Mapping of category -> {sub_category -> url}.

    Returns:
        dict: Same two-level shape as the input, with each URL replaced by the
        list of (question, answer) tuples returned by ``parse_url`` for it.

    One progress line is printed per URL, followed by the overall pair count.
    """
    results = {}
    for category in category_faq_url_dict:
        per_category = results[category] = {}
        for sub_category, url in category_faq_url_dict[category].items():
            qas = parse_url(url)
            per_category[sub_category] = qas
            print(f"category: {category}, sub_category: {sub_category}, num: {len(qas)}, url: {category_faq_url_dict[category][sub_category]}")

    # Count all collected pairs once everything has been scraped.
    total = sum(
        len(pairs)
        for scraped in results.values()
        for pairs in scraped.values()
    )
    print(f"total: {total}")
    return results

In [None]:
def dump_qa_to_file(results, output_dir):
    """Write the scraped question/answer pairs to one text file per category.

    ``output_dir`` is created if needed. For each category a file named
    ``<category>.txt`` is (over)written; inside it every sub-category starts
    with a ``### <sub_category>`` heading followed by its
    ``Question:``/``Answer:`` entries.

    Args:
        results: Mapping of category -> {sub_category -> list of
            (question, answer) tuples}.
        output_dir: Directory that receives the ``.txt`` files.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)
    for category, qa_dict in results.items():
        file = f"{output_dir}/{category}.txt"
        # Pin UTF-8: the platform default encoding may be unable to represent
        # the Chinese FAQ text (or would write it in a locale-specific codec).
        with open(file, "w", encoding="utf-8") as f:
            for sub_category, qas in qa_dict.items():
                f.write(f"### {sub_category}\n\n")
                for q, a in qas:
                    f.write(f"Question: {q}\n")
                    f.write(f"Answer: {a}\n\n")

In [None]:
# Scrape every FAQ page listed in category_faq_url_dict, then write one text
# file per top-level category into the relative directory "hsbc_faq".
results = scrape_qas(category_faq_url_dict)
dump_qa_to_file(results, "hsbc_faq")

category: 我想要进行国内外转账及汇款, sub_category: 国内外转账及汇款, num: 6, url: https://www.hsbc.com.cn/help/faqs/transfer-and-payment/
category: 我想要进行国内外转账及汇款, sub_category: 转账及汇款, num: 21, url: https://www.hsbc.com.cn/transfer-payments/faq/
category: 我在使用个人网上银行、手机银行和微信服务号中碰到了疑难杂症，怎么办？, sub_category: 手机/微信银行, num: 10, url: https://www.hsbc.com.cn/help/faqs/digital-banking/
category: 我在使用个人网上银行、手机银行和微信服务号中碰到了疑难杂症，怎么办？, sub_category: 个人网上银行, num: 42, url: https://www.hsbc.com.cn/ways-to-bank/online/faq/
category: 我想要轻松管理账户、查看账单、了解服务费率及使用我的积分, sub_category: 账户服务, num: 8, url: https://www.hsbc.com.cn/help/faqs/accounts/
category: 我想要轻松管理账户、查看账单、了解服务费率及使用我的积分, sub_category: 汇丰卓越理财, num: 14, url: https://www.hsbc.com.cn/premier/faq/
category: 我有外汇、投资及保险的问题, sub_category: 外汇、投资及保险, num: 7, url: https://www.hsbc.com.cn/help/faqs/investments-and-wealth-management/
category: 我有外汇、投资及保险的问题, sub_category: 投资, num: 36, url: https://www.hsbc.com.cn/investments/faq/
category: 我有外汇、投资及保险的问题, sub_category: 保险产品及服, num:

In [None]:
!mkdir hsbc_annual_report
!wget https://www.about.hsbc.com.cn/-/media/china/zh-cn/financial-information/210813-hsbc-bank-china-company-limited-2019-annual-report.pdf -P hsbc_annual_report/
!wget https://www.about.hsbc.com.cn/-/media/china/zh-cn/financial-information/hsbc-china-2020-annual-accounts.pdf -P hsbc_annual_report/
!wget https://www.about.hsbc.com.cn/-/media/china/zh-cn/financial-information/hsbc-china-2021-annual-accounts.pdf -P hsbc_annual_report/
!wget https://www.hsbc.com.cn/content/dam/hsbc/cn/docs/rural-bank/about-us/hsbc-china-2022-annual-accounts.pdf -P hsbc_annual_report/

In [None]:
!mkdir hsbc_product_document
!wget https://www.hsbc.com.cn/content/dam/hsbc/cn/docs/document-download/%E5%8D%93%E8%B6%8A%E7%90%86%E8%B4%A2%E6%AC%A2%E8%BF%8E%E6%89%8B%E5%86%8C.pdf -P hsbc_product_document
!wget https://www.hsbc.com.cn/content/dam/hsbc/cn/docs/premierelite/premierelite-welcome-cn.pdf -P hsbc_product_document
!wget https://www.hsbc.com.cn/content/dam/hsbc/cn/docs/document-download/tariff-of-accouts-and-services.pdf -P hsbc_product_document
!wget https://www.hsbc.com.cn/content/dam/hsbc/cn/docs/document-download/advance-welcome-guide-cn.pdf -P hsbc_product_document