# Crawler -- links

Date: 2024/05/02-2024/05/06

In [1]:
# Execute lib.py and load the names it defines into this notebook's namespace.
# NOTE(review): presumably this is where DB_PATH (used by every database cell) comes from — confirm.
%run ./lib.py

In [2]:
import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import re
import sqlite3

In [3]:
# (Re)create the `links` table from scratch so a full re-run starts from an empty table.
# The connection is closed explicitly: `with conn` only commits/rolls back the
# transaction, it does not close the connection.
# NOTE(review): SQLite enforces FOREIGN KEY constraints only when
# `PRAGMA foreign_keys = ON` is set on the connection — confirm whether that is wanted.
conn = sqlite3.connect(DB_PATH)
try:
    with conn:
        cur = conn.cursor()
        cur.execute('DROP TABLE IF EXISTS links')
        cur.execute(
            'CREATE TABLE links ('
            'link_id INTEGER PRIMARY KEY AUTOINCREMENT, '
            'path TEXT, '
            'title TEXT, '
            'source_id INTEGER, '
            'UNIQUE(path, title, source_id), '
            'FOREIGN KEY(source_id) REFERENCES sources(source_id)'
            ')'
        )
finally:
    conn.close()

## 経済産業省　通商白書

In [4]:
url_meti = "https://www.meti.go.jp/report/tsuhaku2023/whitepaper_2023.html"

# (org, doc) pair identifying this document's row in the `sources` table.
source_key = ('経済産業省', '通商白書 2023')

# Bind the values as parameters: double-quoted "..." tokens are identifier syntax in SQL
# and SQLite only reads them as strings through a legacy fallback.
# The connection is closed explicitly because `with sqlite3.connect(...)` only scopes
# the transaction and leaves the connection open.
conn = sqlite3.connect(DB_PATH)
try:
    row = conn.execute(
        'SELECT source_id FROM sources WHERE org=? AND doc=?', source_key
    ).fetchone()
finally:
    conn.close()

# Fail with a readable message instead of "'NoneType' object is not subscriptable".
if row is None:
    raise LookupError(f'no row in sources for {source_key!r}')
source_id = row[0]

source_id

1

In [5]:
# Bound the request so a stalled server cannot hang the notebook, and stop on an HTTP
# error instead of decoding and parsing an error page.
resp = requests.get(url_meti, timeout=30)
resp.raise_for_status()
# This page is decoded as UTF-8 directly (no declared-encoding detection here).
html_doc = resp.content.decode('utf-8')

In [6]:
# Build the parse tree with Python's built-in HTML parser.
soup = BeautifulSoup(
    html_doc,
    'html.parser',
)

In [7]:
# Text nodes whose content is exactly the Part I heading; the first one is used
# as the starting point for link collection.
chapter1 = list(soup.find_all(string='第Ⅰ部　岐路に立たされる世界経済'))

In [8]:
# Anchors located after the Part I heading whose href ends in "<d>-<d>-<d>.pdf".
# NOTE(review): each `\d?` matches at most one digit (or none), so a multi-digit
# section number would not match — confirm this is intended.
section_pdf = re.compile(r'^.*\d?-\d?-\d?\.pdf$')
part1_heading = chapter1[0]
all_a = part1_heading.find_all_next("a", href=section_pdf)

# [[url, title], ...]
links = [[anchor['href'], anchor.text] for anchor in all_a]

In [9]:
# Store every (path, title) pair under the current `source_id`.
# `with conn` commits on success and rolls back on error but does not close the
# connection, so it is closed explicitly in `finally`.
conn = sqlite3.connect(DB_PATH)
try:
    with conn:
        conn.executemany(
            'INSERT INTO links (path, title, source_id) VALUES(?, ?, ?)',
            [(path, title, source_id) for path, title in links],
        )
finally:
    conn.close()

## 総務省　情報通信白書

In [10]:
url_soumu = "https://www.soumu.go.jp/johotsusintokei/whitepaper/ja/r05/pdf/index.html"

# (org, doc) pair identifying this document's row in the `sources` table.
source_key = ('総務省', '情報通信白書 2023')

# Bind the values as parameters: double-quoted "..." tokens are identifier syntax in SQL
# and SQLite only reads them as strings through a legacy fallback.
# The connection is closed explicitly because `with sqlite3.connect(...)` only scopes
# the transaction and leaves the connection open.
conn = sqlite3.connect(DB_PATH)
try:
    row = conn.execute(
        'SELECT source_id FROM sources WHERE org=? AND doc=?', source_key
    ).fetchone()
finally:
    conn.close()

# Fail with a readable message instead of "'NoneType' object is not subscriptable".
if row is None:
    raise LookupError(f'no row in sources for {source_key!r}')
source_id = row[0]

source_id

2

In [11]:
# Bound the request so a stalled server cannot hang the notebook, and stop on an HTTP
# error instead of parsing an error page into zero links.
resp = requests.get(url_soumu, timeout=30)
resp.raise_for_status()

In [12]:
# Reference: https://stackoverflow.com/questions/7219361/python-and-beautifulsoup-encoding-issues
from bs4.dammit import EncodingDetector

# Ask for the encoding declared inside the document itself; the parse cell uses it
# to decode the raw response bytes.
html_encoding = EncodingDetector.find_declared_encoding(
    resp.content,
    is_html=True,
)
html_encoding

'shift_jis'

In [13]:
# Name the parser explicitly (same one as the METI cell): without it BeautifulSoup
# picks whichever parser is installed and emits a warning, so results can vary by machine.
soup = BeautifulSoup(resp.content.decode(html_encoding), 'html.parser')
# Keep anchors whose href is exactly "n" + two digits + "00000.pdf".
all_a = soup.find_all('a', href=re.compile(r'^n\d\d00000\.pdf$'))
# [[href, title], ...]
links = [[a['href'], a.text] for a in all_a]
sorted(links)



[['n1100000.pdf', 'データ流通を支える通信インフラの高度化'],
 ['n1200000.pdf', 'データ流通とデジタルサービスの進展'],
 ['n2100000.pdf', '加速するデータ流通とデータ利活用'],
 ['n2200000.pdf', 'プラットフォーマーへのデータの集中'],
 ['n2300000.pdf', 'インターネット上での偽・誤情報の拡散等'],
 ['n3100000.pdf', 'データ流通・活用の新たな潮流'],
 ['n3200000.pdf', '豊かなデータ流通社会の実現に向けて'],
 ['n4100000.pdf', 'ICT産業の動向'],
 ['n4200000.pdf', '電気通信分野の動向'],
 ['n4300000.pdf', '放送・コンテンツ分野の動向'],
 ['n4400000.pdf', '我が国の電波の利用状況'],
 ['n4500000.pdf', '国内外におけるICT機器・端末関連の動向'],
 ['n4600000.pdf', 'プラットフォームの動向'],
 ['n4700000.pdf', 'ICTサービス及びコンテンツ・アプリケーションサービス市場の動向'],
 ['n4800000.pdf', 'データセンター市場及びクラウドサービス市場の動向'],
 ['n4900000.pdf', 'AIの動向'],
 ['n5100000.pdf', '総合的なICT政策の推進'],
 ['n5200000.pdf', '電気通信事業政策の動向'],
 ['n5300000.pdf', '電波政策の動向'],
 ['n5400000.pdf', '放送政策の動向'],
 ['n5500000.pdf', 'サイバーセキュリティ政策の動向'],
 ['n5600000.pdf', 'ICT利活用の推進'],
 ['n5700000.pdf', 'ICT技術政策の動向'],
 ['n5800000.pdf', 'ICT国際戦略の推進'],
 ['n5900000.pdf', '郵政行政の推進']]

In [14]:
# Store every (path, title) pair under the current `source_id`.
# `with conn` commits on success and rolls back on error but does not close the
# connection, so it is closed explicitly in `finally`.
conn = sqlite3.connect(DB_PATH)
try:
    with conn:
        conn.executemany(
            'INSERT INTO links (path, title, source_id) VALUES(?, ?, ?)',
            [(path, title, source_id) for path, title in links],
        )
finally:
    conn.close()

## 防衛省　防衛白書

In [15]:
# NOTE(review): this is a plain-http URL — confirm whether the host also serves https.
url_mod = 'http://www.clearing.mod.go.jp/hakusho_data/2023/pdf/index.html'

# (org, doc) pair identifying this document's row in the `sources` table.
source_key = ('防衛省', '防衛白書 2023')

# Bind the values as parameters: double-quoted "..." tokens are identifier syntax in SQL
# and SQLite only reads them as strings through a legacy fallback.
# The connection is closed explicitly because `with sqlite3.connect(...)` only scopes
# the transaction and leaves the connection open.
conn = sqlite3.connect(DB_PATH)
try:
    row = conn.execute(
        'SELECT source_id FROM sources WHERE org=? AND doc=?', source_key
    ).fetchone()
finally:
    conn.close()

# Fail with a readable message instead of "'NoneType' object is not subscriptable".
if row is None:
    raise LookupError(f'no row in sources for {source_key!r}')
source_id = row[0]

source_id

3

In [16]:
# Bound the request so a stalled server cannot hang the notebook, and stop on an HTTP
# error instead of parsing an error page into zero links.
resp = requests.get(url_mod, timeout=30)
resp.raise_for_status()

In [17]:
# Ask for the encoding declared inside the document itself; the parse cell uses it
# to decode the raw response bytes.
html_encoding = EncodingDetector.find_declared_encoding(
    resp.content,
    is_html=True,
)
html_encoding

'utf-8'

In [18]:
# Name the parser explicitly (same one as the METI cell): without it BeautifulSoup
# picks whichever parser is installed and emits a warning, so results can vary by machine.
soup = BeautifulSoup(resp.content.decode(html_encoding), 'html.parser')
# Keep anchors whose href is exactly "R05" + six digits + ".pdf".
all_a = soup.find_all('a', href=re.compile(r'^R05\d{6}\.pdf$'))
# [[href, title], ...]
links = [[a['href'], a.text] for a in all_a]
sorted(links)

[['R05000010.pdf', '刊行によせて'],
 ['R05000021.pdf', '激変する時代～10年の変化～'],
 ['R05000022.pdf', '国家防衛戦略'],
 ['R05000031.pdf', 'わが国を取り巻く安全保障環境'],
 ['R05000032.pdf', 'わが国の安全保障・防衛政策'],
 ['R05000033.pdf', '防衛目標を実現するための3つのアプローチ'],
 ['R05000034.pdf', '共通基盤などの強化'],
 ['R05010100.pdf', '概観'],
 ['R05010200.pdf', 'ロシアによるウクライナ侵略とウクライナによる防衛'],
 ['R05010301.pdf', '米国'],
 ['R05010302.pdf', '中国'],
 ['R05010303.pdf', '米国と中国の関係など'],
 ['R05010304.pdf', '朝鮮半島'],
 ['R05010305.pdf', 'ロシア'],
 ['R05010306.pdf', '大洋州'],
 ['R05010307.pdf', '東南アジア'],
 ['R05010308.pdf', '南アジア'],
 ['R05010309.pdf', '欧州・カナダ'],
 ['R05010310.pdf', 'その他の地域など（中東・アフリカを中心に）'],
 ['R05010401.pdf', '情報戦などにも広がりをみせる科学技術をめぐる動向'],
 ['R05010402.pdf', '宇宙領域をめぐる動向'],
 ['R05010403.pdf', 'サイバー領域をめぐる動向'],
 ['R05010404.pdf', '電磁波領域をめぐる動向'],
 ['R05010405.pdf', '海洋をめぐる動向'],
 ['R05010406.pdf', '大量破壊兵器の移転・拡散'],
 ['R05010407.pdf', '気候変動が安全保障環境や軍に与える影響'],
 ['R05020101.pdf', 'わが国の安全保障を確保する方策'],
 ['R05020102.pdf', '憲法と防衛政策の基本'],
 ['R05020103.pdf', 'わが国の安全保障政策の体系'],
 [

In [19]:
# Store every (path, title) pair under the current `source_id`.
# `with conn` commits on success and rolls back on error but does not close the
# connection, so it is closed explicitly in `finally`.
conn = sqlite3.connect(DB_PATH)
try:
    with conn:
        conn.executemany(
            'INSERT INTO links (path, title, source_id) VALUES(?, ?, ?)',
            [(path, title, source_id) for path, title in links],
        )
finally:
    conn.close()

## 横浜市　中期計画

In [20]:
url_yokohama = "https://www.city.yokohama.lg.jp/city-info/seisaku/hoshin/4kanen/2022-2025/chuki2022-.html"

# (org, doc) pair identifying this document's row in the `sources` table.
source_key = ('横浜市', '横浜市中期計画 2022～2025')

# Bind the values as parameters: double-quoted "..." tokens are identifier syntax in SQL
# and SQLite only reads them as strings through a legacy fallback.
# The connection is closed explicitly because `with sqlite3.connect(...)` only scopes
# the transaction and leaves the connection open.
conn = sqlite3.connect(DB_PATH)
try:
    row = conn.execute(
        'SELECT source_id FROM sources WHERE org=? AND doc=?', source_key
    ).fetchone()
finally:
    conn.close()

# Fail with a readable message instead of "'NoneType' object is not subscriptable".
if row is None:
    raise LookupError(f'no row in sources for {source_key!r}')
source_id = row[0]

source_id

4

In [21]:
# Bound the request so a stalled server cannot hang the notebook, and stop on an HTTP
# error instead of parsing an error page into zero links.
resp = requests.get(url_yokohama, timeout=30)
resp.raise_for_status()

In [22]:
# Ask for the encoding declared inside the document itself; the parse cell uses it
# to decode the raw response bytes.
html_encoding = EncodingDetector.find_declared_encoding(
    resp.content,
    is_html=True,
)
html_encoding

'utf-8'

In [23]:
# Name the parser explicitly (same one as the METI cell): without it BeautifulSoup
# picks whichever parser is installed and emits a warning, so results can vary by machine.
soup = BeautifulSoup(resp.content.decode(html_encoding), 'html.parser')

In [24]:
# Text nodes whose content is exactly the "booklet / split download" heading;
# the first one anchors the link walk in the next cell.
head = list(soup.find_all(string='冊子・分割ダウンロード'))
head

['冊子・分割ダウンロード']

In [25]:
# Every anchor with class "pdf" that follows the download heading, in document order.
# This replaces a manual find_next() walk that took the first anchor without checking
# its class and raised AttributeError when no anchor followed the heading;
# find_all_next() simply returns an empty result in that case.
all_a = list(head[0].find_all_next('a', class_='pdf'))

all_a

[<a class="pdf" href="chuki2022-.files/0070_20230213.pdf">表紙・裏表紙（PDF：371KB）</a>,
 <a class="pdf" href="chuki2022-.files/0071_20230213.pdf">はじめに（PDF：955KB）</a>,
 <a class="pdf" href="chuki2022-.files/0023_20230118.pdf">目次（PDF：878KB）</a>,
 <a class="pdf" href="chuki2022-.files/0024_20230118.pdf">Ⅰ　中期計画の特徴（PDF：2,137KB）</a>,
 <a class="pdf" href="chuki2022-.files/0072_20230213.pdf">Ⅱ　共にめざす都市像（PDF：5,920KB）</a>,
 <a class="pdf" href="chuki2022-.files/0026_20230118.pdf">Ⅲ　基本戦略（PDF：6,326KB）</a>,
 <a class="pdf" href="chuki2022-.files/0027_20230118.pdf">Ⅳ　基本姿勢（PDF：2,039KB）</a>,
 <a class="pdf" href="chuki2022-.files/0073_20230213.pdf">Ⅴ　９つの戦略及び３８の政策（PDF：13,539KB）</a>,
 <a class="pdf" href="chuki2022-.files/0029_20230118.pdf">Ⅵ　行財政運営（PDF：3,513KB）</a>,
 <a class="pdf" href="chuki2022-.files/0030_20230118.pdf">Ⅶ　大都市制度（PDF：2,500KB）</a>,
 <a class="pdf" href="chuki2022-.files/0031_20230118.pdf">Ⅷ　DXの推進（PDF：1,705KB）</a>,
 <a class="pdf" href="chuki2022-.files/0074_20230213.pdf">Ⅸ　計画の前提（PDF：7,225KB）</

In [26]:
# Reduce each anchor to an [href, title] pair, keeping document order.
links = []
for anchor in all_a:
    links.append([anchor['href'], anchor.text])
links

[['chuki2022-.files/0070_20230213.pdf', '表紙・裏表紙（PDF：371KB）'],
 ['chuki2022-.files/0071_20230213.pdf', 'はじめに（PDF：955KB）'],
 ['chuki2022-.files/0023_20230118.pdf', '目次（PDF：878KB）'],
 ['chuki2022-.files/0024_20230118.pdf', 'Ⅰ\u3000中期計画の特徴（PDF：2,137KB）'],
 ['chuki2022-.files/0072_20230213.pdf', 'Ⅱ\u3000共にめざす都市像（PDF：5,920KB）'],
 ['chuki2022-.files/0026_20230118.pdf', 'Ⅲ\u3000基本戦略（PDF：6,326KB）'],
 ['chuki2022-.files/0027_20230118.pdf', 'Ⅳ\u3000基本姿勢（PDF：2,039KB）'],
 ['chuki2022-.files/0073_20230213.pdf', 'Ⅴ\u3000９つの戦略及び３８の政策（PDF：13,539KB）'],
 ['chuki2022-.files/0029_20230118.pdf', 'Ⅵ\u3000行財政運営（PDF：3,513KB）'],
 ['chuki2022-.files/0030_20230118.pdf', 'Ⅶ\u3000大都市制度（PDF：2,500KB）'],
 ['chuki2022-.files/0031_20230118.pdf', 'Ⅷ\u3000DXの推進（PDF：1,705KB）'],
 ['chuki2022-.files/0074_20230213.pdf', 'Ⅸ\u3000計画の前提（PDF：7,225KB）'],
 ['chuki2022-.files/0033_20230118.pdf', 'コラム（抜粋）（PDF：1,800KB）'],
 ['chuki2022-.files/0034_20230118.pdf', '計画の策定経過（PDF：2,287KB）'],
 ['chuki2022-.files/0067_20230125.pdf', 'リーフレット（タ

In [27]:
# Store every (path, title) pair under the current `source_id`.
# `with conn` commits on success and rolls back on error but does not close the
# connection, so it is closed explicitly in `finally`.
conn = sqlite3.connect(DB_PATH)
try:
    with conn:
        conn.executemany(
            'INSERT INTO links (path, title, source_id) VALUES(?, ?, ?)',
            [(path, title, source_id) for path, title in links],
        )
finally:
    conn.close()

## 川崎市　総合計画

In [28]:
url_kawasaki = "https://www.city.kawasaki.jp/170/page/0000138364.html"

# (org, doc) pair identifying this document's row in the `sources` table.
source_key = ('川崎市', '川崎市総合計画 2022〜2025')

# Bind the values as parameters: double-quoted "..." tokens are identifier syntax in SQL
# and SQLite only reads them as strings through a legacy fallback.
# The connection is closed explicitly because `with sqlite3.connect(...)` only scopes
# the transaction and leaves the connection open.
conn = sqlite3.connect(DB_PATH)
try:
    row = conn.execute(
        'SELECT source_id FROM sources WHERE org=? AND doc=?', source_key
    ).fetchone()
finally:
    conn.close()

# Fail with a readable message instead of "'NoneType' object is not subscriptable".
if row is None:
    raise LookupError(f'no row in sources for {source_key!r}')
source_id = row[0]

source_id

5

In [29]:
# Bound the request so a stalled server cannot hang the notebook, and stop on an HTTP
# error instead of parsing an error page into zero links.
resp = requests.get(url_kawasaki, timeout=30)
resp.raise_for_status()

In [30]:
# Ask for the encoding declared inside the document itself; the parse cell uses it
# to decode the raw response bytes.
html_encoding = EncodingDetector.find_declared_encoding(
    resp.content,
    is_html=True,
)
html_encoding

'utf-8'

In [31]:
# Name the parser explicitly (same one as the METI cell): without it BeautifulSoup
# picks whichever parser is installed and emits a warning, so results can vary by machine.
soup = BeautifulSoup(resp.content.decode(html_encoding), 'html.parser')

In [32]:
# Select every anchor on the page whose href ends in ".pdf".
pdf_href = re.compile(r'\.pdf$')
all_a = soup.find_all('a', href=pdf_href)
all_a

[<a href="../cmsfiles/contents/0000138/138364/1_souron1.pdf" target="_blank"><img alt="" class="icon" height="24" src="../images/pdf.gif" width="22"/>川崎市総合計画　第3期実施計画（目次・総論(1)）(PDF形式, 8.78MB)<span class="window">別ウィンドウで開く</span></a>,
 <a href="../cmsfiles/contents/0000138/138364/2_souron2.pdf" target="_blank"><img alt="" class="icon" height="24" src="../images/pdf.gif" width="22"/>川崎市総合計画　第3期実施計画（総論(2)）(PDF形式, 9.46MB)<span class="window">別ウィンドウで開く</span></a>,
 <a href="../cmsfiles/contents/0000138/138364/3_souron3.pdf" target="_blank"><img alt="" class="icon" height="24" src="../images/pdf.gif" width="22"/>川崎市総合計画　第3期実施計画（総論(3)）(PDF形式, 6.56MB)<span class="window">別ウィンドウで開く</span></a>,
 <a href="../cmsfiles/contents/0000138/138364/4_10nensenryaku.pdf" target="_blank"><img alt="" class="icon" height="24" src="../images/pdf.gif" width="22"/>川崎市総合計画　第3期実施計画（かわさき10年戦略）(PDF形式, 5.67MB)<span class="window">別ウィンドウで開く</span></a>,
 <a href="../cmsfiles/contents/0000138/138364/5_seisaku1.pdf" targe

In [33]:
# Reduce each anchor to an [href, title] pair. The anchor text also contains the
# "opens in a new window" helper label from the nested <span>, so strip it from the title.
links = []
for anchor in all_a:
    href = anchor['href']
    label = anchor.text.replace('別ウィンドウで開く', '')
    links.append([href, label])
links

[['../cmsfiles/contents/0000138/138364/1_souron1.pdf',
  '川崎市総合計画\u3000第3期実施計画（目次・総論(1)）(PDF形式, 8.78MB)'],
 ['../cmsfiles/contents/0000138/138364/2_souron2.pdf',
  '川崎市総合計画\u3000第3期実施計画（総論(2)）(PDF形式, 9.46MB)'],
 ['../cmsfiles/contents/0000138/138364/3_souron3.pdf',
  '川崎市総合計画\u3000第3期実施計画（総論(3)）(PDF形式, 6.56MB)'],
 ['../cmsfiles/contents/0000138/138364/4_10nensenryaku.pdf',
  '川崎市総合計画\u3000第3期実施計画（かわさき10年戦略）(PDF形式, 5.67MB)'],
 ['../cmsfiles/contents/0000138/138364/5_seisaku1.pdf',
  '川崎市総合計画\u3000第3期実施計画（基本政策1）(PDF形式, 6.66MB)'],
 ['../cmsfiles/contents/0000138/138364/6_seisaku2.pdf',
  '川崎市総合計画\u3000第3期実施計画（基本政策2）(PDF形式, 5.45MB)'],
 ['../cmsfiles/contents/0000138/138364/7_seisaku3.pdf',
  '川崎市総合計画\u3000第3期実施計画（基本政策3）(PDF形式, 7.18MB)'],
 ['../cmsfiles/contents/0000138/138364/8_seisaku4.pdf',
  '川崎市総合計画\u3000第3期実施計画（基本政策4）(PDF形式, 9.61MB)'],
 ['../cmsfiles/contents/0000138/138364/9_seisaku5.pdf',
  '川崎市総合計画\u3000第3期実施計画（基本政策5）(PDF形式, 3.74MB)'],
 ['../cmsfiles/contents/0000138/138364/10_kuke

In [34]:
# Store every (path, title) pair under the current `source_id`.
# `with conn` commits on success and rolls back on error but does not close the
# connection, so it is closed explicitly in `finally`.
conn = sqlite3.connect(DB_PATH)
try:
    with conn:
        conn.executemany(
            'INSERT INTO links (path, title, source_id) VALUES(?, ?, ?)',
            [(path, title, source_id) for path, title in links],
        )
finally:
    conn.close()