# Crawler -- texts

Date: 2024/05/06

In [1]:
%run ./lib.py
import sqlite3
import requests

In [2]:
import traceback
import fitz

# Reference: https://stackoverflow.com/questions/67558627/problem-while-joining-two-url-components-with-urllib
# Better not to use urljoin: its result differs depending on whether the slashes are present.
def joinurl(baseurl, path):
    """Join ``baseurl`` and ``path`` with exactly one slash between them.

    Every trailing slash on ``baseurl`` and every leading slash on ``path``
    is stripped before the two pieces are glued together, so the result does
    not depend on how either piece was written at the seam.
    """
    head = baseurl.rstrip('/')
    tail = path.lstrip('/')
    return f'{head}/{tail}'

def extract_text(page):
    """Return ``(page.number, text)`` for a single document page.

    The text comes from ``page.get_text("text")`` with every newline
    character removed, so the whole page becomes one continuous string.

    Args:
        page: An object exposing ``number`` and ``get_text(kind)``; in this
            notebook it is a page of a document opened with ``fitz``.

    Returns:
        A 2-tuple of the page's ``number`` attribute and the flattened text.
    """
    return page.number, page.get_text("text").replace('\n', '')

# Rebuild the `texts` table from scratch: download every document listed in
# `links` and store the extracted text of each page as one row.
# NOTE: sqlite3's connection context manager only commits (or rolls back on an
# exception) when the block exits; it does not close the connection.
with sqlite3.connect(DB_PATH) as conn:
    cur = conn.cursor()
    cur.execute('DROP TABLE IF EXISTS texts')
    cur.execute('CREATE TABLE texts (link_id INTEGER, page INTEGER, text TEXT, UNIQUE(link_id, page, text), FOREIGN KEY(link_id) REFERENCES links(link_id))')

    links = cur.execute('SELECT * FROM links').fetchall()

    total = len(links)

    for cnt, link in enumerate(links, start=1):
        # Progress indicator; the URL is printed on the same line below.
        print(f'{cnt}/{total}', end=' ')
        # Positional access into `SELECT *`: the column order is fixed by the
        # `links` table definition, which lives outside this cell.
        link_id = link[0]
        path = link[1]
        source_id = link[3]
        # Bound parameter instead of string interpolation into the SQL text.
        base_url = cur.execute('SELECT base_url FROM sources WHERE source_id = ?', (source_id,)).fetchone()[0]
        url = joinurl(base_url, path)
        print(url)
        try:
            # The download is inside the try so that a single network failure
            # is reported and skipped instead of aborting the whole crawl and
            # rolling back the rows inserted so far. The timeout keeps a
            # stalled server from blocking the loop forever.
            resp = requests.get(url, timeout=30)
            # Fail early on HTTP errors rather than parsing an error page.
            resp.raise_for_status()
            # Context manager releases the document once its pages are read.
            with fitz.open(stream=resp.content) as doc:
                for page in doc:
                    page_num, text = extract_text(page)
                    cur.execute('INSERT INTO texts (link_id, page, text) VALUES (?, ?, ?)', (link_id, page_num, text))
        except Exception:
            # Best effort: log the failing URL with its traceback and move on.
            # `Exception` (not a bare except) lets Ctrl-C interrupt the crawl.
            print(url)
            traceback.print_exc()

1/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/1-1-1.pdf
2/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/1-1-2.pdf
3/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/1-1-3.pdf
4/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/1-1-4.pdf
5/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/1-1-5.pdf
6/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/1-2-1.pdf
7/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/1-2-2.pdf
8/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/1-2-3.pdf
9/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/1-3-1.pdf
10/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/1-3-2.pdf
11/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/1-3-3.pdf
12/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/1-3-4.pdf
13/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/2-1-1.pdf
14/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/2-1-2.pdf
15/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/2-2-1.pdf
16/169 https://www.meti.go.jp/report/tsuhaku2023/pdf/2-2-2.pdf
1