In [1]:
# インポート
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import clear_output
import os
from time import sleep
import re


In [140]:
# Method-chain definitions (https://codereview.stackexchange.com/questions/249610/chaining-list-operations-in-python)
def chain(Accumulant, *Functions_list):
    """Pipe a value through a sequence of single-argument callables.

    Each callable receives the result of the previous one; the first
    receives ``Accumulant``. With no callables, ``Accumulant`` is returned
    unchanged.
    """
    result = Accumulant
    for step in Functions_list:
        result = step(result)
    return result


def reduce_c(Function):
    """Build a chain step that left-folds a sequence with ``Function``.

    The returned callable computes ``Function(Function(a, b), c) ...`` over
    the items of its argument and returns the final value. A single-item
    input is returned as is.

    The input is only iterated, never modified, so the caller's list is left
    intact and any iterable (tuple, generator, ...) is accepted.

    Raises:
        IndexError: if the input is empty.
    """
    def reductor(List):
        items = iter(List)
        try:
            accumulated = next(items)
        except StopIteration:
            # Same exception type an empty list produced before.
            raise IndexError("reduce_c: cannot reduce an empty sequence") from None
        for item in items:
            accumulated = Function(accumulated, item)
        return accumulated
    return reductor

# explode a tuple and pass it as arguments to f


def splat_c(f):
    """Return a callable that unpacks its single argument into ``f``'s positional arguments."""
    def unpack_and_call(packed):
        return f(*packed)
    return unpack_and_call
def map_c(f):
    """Return a callable that applies ``f`` to every item and collects the results in a list."""
    def apply_to_each(items):
        mapped = map(f, items)
        return list(mapped)
    return apply_to_each
def filter_c(f):
    """Return a callable that keeps the items accepted by ``f`` and returns them as a list."""
    def keep_matching(items):
        kept = filter(f, items)
        return list(kept)
    return keep_matching
def sorted_c(f):
    """Return a callable that produces a new list sorted with ``f`` as the sort key."""
    def sort_by_key(items):
        ordered = list(items)
        ordered.sort(key=f)
        return ordered
    return sort_by_key
def slice_c(start, stop, step=1):
    """Return a callable that slices its argument with ``[start:stop:step]``."""
    def take_slice(sequence):
        return sequence[start:stop:step]
    return take_slice

In [142]:
# Download the EPC top page (skipped when a local copy already exists).
# Create the cache directory first: open(..., "w") does not create missing
# parent directories.
os.makedirs("./epc_html", exist_ok=True)
if not os.path.isfile("./epc_html/index.html"):
    # The context manager closes the HTTP response even if read()/decode() raises.
    with urllib.request.urlopen(
            "https://www.epo.org//law-practice/legal-texts/html/epc/2020/e/ma1.html") as resource:
        content = resource.read().decode("UTF-8")
    with open("./epc_html/index.html", "w", encoding='UTF-8') as f:
        f.write(content)

# Scrape the top page and build article_list.
# Each entry is [text of the first link, text of the second link, href of the
# first link]; only blocks that contain exactly two <a> elements are kept.
# The file is read inside a `with` block so the handle is released even if
# parsing fails.
# NOTE(review): no parser is named, so Beautiful Soup picks whichever one is
# installed; pin one explicitly if results must match across machines.
with open("./epc_html/index.html", "r", encoding="UTF-8") as f:
    soup = BeautifulSoup(f.read())
article_list = chain(
    # The first two paraBlocks are skipped — presumably non-article entries; TODO confirm.
    soup.select_one("div.DOC4NET2_Section_l0_0em_r0_0em").select(
        "div.paraBlock")[2:],
    map_c(lambda x: x.select("a")),
    filter_c(lambda x: len(x) == 2),
    map_c(lambda x: [x[0].get_text(), x[1].get_text(), x[0].get("href")])
)


In [145]:
# Download every article page listed in article_list.
base_url = "https://www.epo.org//law-practice/legal-texts/html/epc/2020/e/"
# open(..., "w") does not create missing parent directories.
os.makedirs("./epc_html", exist_ok=True)
for art in article_list:
    # Show only the article currently being processed.
    clear_output()
    print(art[0])
    # Skip pages that have already been downloaded.
    if not os.path.isfile(f"./epc_html/{art[2]}"):
        # The context manager closes the HTTP response even if read()/decode() raises.
        with urllib.request.urlopen(f"{base_url}{art[2]}") as resource:
            content = resource.read().decode("UTF-8")
        with open(f"./epc_html/{art[2]}", "w", encoding='UTF-8') as f:
            f.write(content)
        # Pause between requests so the server is not hit in a tight loop.
        sleep(0.5)


Art. 178


In [148]:
# Extract the needed markup from each article page.
def extract_article_html(art):
    """Build the HTML fragment for one article from its downloaded page.

    Args:
        art: indexable entry where ``art[0]`` and ``art[1]`` are the two
            heading texts and ``art[2]`` is the file name of the page under
            ``./epc_html/``.

    Returns:
        ``<h3>{art[0]} - {art[1]}</h3>`` followed by the serialized
        ``div#pagebody`` element of the page. If the page has no such
        element, ``select_one`` yields ``None`` and the literal text
        ``None`` is embedded instead.
    """
    file_path = f"./epc_html/{art[2]}"
    with open(file_path, "r", encoding="UTF-8") as f:
        # The file is already decoded to str here. ``from_encoding`` applies
        # only to bytes input and is ignored with a warning for str, so it is
        # not passed.
        soup = BeautifulSoup(f.read())
    art_body = str(soup.select_one("div#pagebody"))
    return f"<h3>{art[0]} - {art[1]}</h3>{art_body}"


# One HTML fragment per entry of article_list, in the same order.
html_list = [extract_article_html(entry) for entry in article_list]



In [149]:
# Concatenate all article fragments into one HTML document and save it.
whole_html = "".join([
    "<html><head><meta http-equiv='Content-Type' content='text/html' charset='UTF-8'/></head><body>",
    *html_list,
    "</body></html>",
])
with open("./epc_html/whole.html", mode="w", encoding='UTF-8') as f:
    f.write(whole_html)


In [364]:
# Scrape whole.html into `info`: one dict per article.
def _first_group(pattern, text):
    """Return capture group 1 of the first match of ``pattern`` in ``text``, or "" if there is no match."""
    mt = re.search(pattern, text)
    return mt.group(1) if mt else ""


# Number of trailing characters cut from every captured paragraph body.
# NOTE(review): 12 == len("</div></div>"), so this presumably removes two
# closing tags picked up by the greedy capture — confirm against the saved pages.
_TAIL_LENGTH = 12

# Read inside a `with` block so the handle is released even if parsing fails.
with open("./epc_html/whole.html", "r", encoding="UTF-8") as f:
    soup = BeautifulSoup(f.read())

pagebodies = soup.select("div#pagebody")
info = []
for body in pagebodies:
    # The heading markup is split at <br/>: part 0 holds the article number,
    # part 1 is used as the title.
    title_info = body.select_one(".LMArtReg").decode_contents().split("<br/>")
    # Article number: the text after the first non-breaking space, up to the next tag.
    art_num = title_info[0].split("\xa0")[1].split("<")[0]
    art_html = title_info[0]
    title = title_info[1]
    if body.select_one(".DOC4NET2-references"):
        # Turn line breaks into ", " and then strip every remaining tag.
        ref_markup = body.select_one(
            ".DOC4NET2-references p.Margin").decode_contents()
        ref_num = re.sub("<[^>]+>", "", ref_markup.replace("<br/>", ", "))
    else:
        ref_num = ""

    portlet = body.select_one(".wpsPortletBody")
    # Keep only the first .LMNormal element; elements from index 1 onward are
    # dropped to cut the final sentence of Art. 178.
    LMNormal = [tag.decode_contents() for tag in portlet.select(".LMNormal")[:1]]
    paraBlock = portlet.select(".paraBlock")
    paraBlock_contents = []
    paraBlock_footnotes = []
    for j in paraBlock:
        html = j.decode_contents()
        # Stored under the "id" key; a separate local name keeps the builtin
        # id() usable in later cells.
        anchor_id = _first_group(r'<a name="([^"]+)">', html)
        if j.select_one(".DOC4NET2_pos_FootnoteText"):
            # Footnote block: reference marker plus footnote text.
            ref_html = _first_group(
                r'<span class="FootnoteReference">(.*)</span>', html).replace("\xa0", "")
            content = _first_group(
                r'<div class="DOC4NET2_pos_FootnoteText_1">(.*)', html)[:-_TAIL_LENGTH]
            paraBlock_footnotes.append(
                {"id": anchor_id, "ref_html": ref_html, "content": content})
        else:
            # Regular paragraph: the "(n)" item label after the anchor plus the paragraph text.
            item_string = _first_group(r'<a name="[^"]+"></a>\(([^)]+)\)', html)
            content = _first_group(
                r'<div class="DOC4NET2_pos_LMNormal_1">(.*)', html)[:-_TAIL_LENGTH]
            paraBlock_contents.append(
                {"id": anchor_id, "item_string": item_string, "content": content})

    info.append({"art_num": art_num, "art_html": art_html,
                 "title": title, "ref_num": ref_num, "LMNormal": LMNormal,
                 "paraBlock_contents": paraBlock_contents, "paraBlock_footnotes": paraBlock_footnotes})


In [365]:
# Convert the scraped results to HTML.
def makeArticleHTML(info):
    """Render one scraped article as an HTML fragment.

    Args:
        info: dict with the keys ``art_num``, ``art_html``, ``title``,
            ``LMNormal`` (list of HTML strings; only the first is used),
            ``paraBlock_contents`` (dicts with ``id``, ``item_string``,
            ``content``) and ``paraBlock_footnotes`` (dicts with ``id``,
            ``ref_html``, ``content``).

    Returns:
        A ``<div id="ar{art_num}">`` block holding the heading, the lead
        text, the paragraphs and the footnotes. Values are inserted as-is
        (they are already HTML), without escaping.
    """
    lead_html = info["LMNormal"][0] if info["LMNormal"] else ""

    def render_paragraph(entry):
        # Only emit the "(n) " label when one was found; otherwise the
        # paragraph would start with a stray "() ".
        label = f"({entry['item_string']}) " if entry["item_string"] else ""
        return f"<div id='{entry['id']}'>{label}{entry['content']}</div>"

    paragraphs_html = "".join(
        render_paragraph(entry) for entry in info["paraBlock_contents"])
    footnotes_html = "".join(
        f"<div id='{entry['id']}'>{entry['ref_html']} {entry['content']}</div>"
        for entry in info["paraBlock_footnotes"])

    html = f"""
        <div id="ar{info["art_num"]}">
            <h3>{info["art_html"]} - {info["title"]}</h3>
            <div class="LMNormal">{lead_html}</div>
            <div class="paraBlock_contents">{paragraphs_html}</div>
            <div class="paraBlock_footnotes">{footnotes_html}</div>
        </div>
    """
    return html

# Render every article and concatenate the fragments in order.
body = "".join([makeArticleHTML(entry) for entry in info])

# Wrap the fragments in a complete page that links ./output.css.
html = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <link rel="stylesheet" href="./output.css">
        <title>EPC output</title>
    </head>
    <body>
        {body}
    </body>
    </html>
"""

# Save the finished page.
with open("./epc_html/output.html", mode="w", encoding="UTF-8") as f:
    f.write(html)
    
