In [None]:
import pandas as pd
import re
import pprint
# Location of the gzip-compressed JSON file that the loading cell below reads.
path = 'data/chap3/jawiki-country.json.gz'

### 20. JSONデータの読み込み

In [None]:
# Read the dump as JSON Lines (one JSON record per line).
df = pd.read_json(path, lines=True)
# Pick the `text` field of the first record whose title is "イギリス".
is_uk = df['title'] == 'イギリス'
uk = df.loc[is_uk, 'text'].iloc[0]
# Preview only the first 1000 characters.
print(uk[:1000])

### 21. カテゴリ名を含む行を抽出

In [None]:
# Split the article into lines and keep those that mention a category.
ukList = uk.split('\n')
ans = [line for line in ukList if 'Category:' in line]
print(ans)

### 22. カテゴリ名の抽出

In [None]:
# Lines that mention a category declaration.
category = [line for line in ukList if 'Category:' in line]
# Capture just the category name from [[Category:NAME]] or [[Category:NAME|sortkey]].
# A regex is used instead of chained str.replace so that any sort key (not only "*")
# and any text surrounding the link are dropped.
category_name = re.compile(r'\[\[Category:([^|\]]+)(?:\|[^\]]*)?\]\]')
ans = [name for line in category for name in category_name.findall(line)]
print(ans)

### 23. セクション構造

In [None]:
# A heading is a title wrapped in the same run of "=" on both sides, e.g. "==Title==".
# The backreference \1 forces the closing run to mirror the opening one, so the level
# is derived from the opening run only and "=" characters inside the title are kept.
heading = re.compile(r'^(=+)\s*(.+?)\s*\1\s*$')
for line in ukList:
    found = heading.match(line)
    if found:
        # "==" is level 1, "===" is level 2, and so on.
        level = len(found.group(1)) - 1
        print(found.group(2), level)

### 24. ファイル参照の抽出

In [None]:
# Matches a media reference written with either the "File:" or the "ファイル:" prefix and
# captures the file name up to the next "|".
# The prefixes are grouped with (?:...) so the alternation applies to the prefix only;
# without the group the pattern means "File" OR "ファイル:(name)|", which yields an empty
# capture for every "File" occurrence.
# "+" (not "*") is used so that an empty name is never accepted.
file_ref = re.compile(r'(?:File|ファイル):(.+?)\|')
for line in ukList:
    # A single line may reference several files; report all of them.
    for name in file_ref.findall(line):
        print(name)

### 25. テンプレートの抽出

In [None]:
# Collect the field lines of the {{基礎情報 ...}} template and parse them into a
# {field name: value} dictionary.
p1 = re.compile(r'\{\{基礎情報')  # line that opens the template
p2 = re.compile(r'\|')            # a field line starts with "|"
p3 = re.compile(r'\}\}')          # line that closes the template
fg = False  # True once the opening line has been seen
lst = []
dct = {}
for line in ukList:
    if fg:
        if p3.match(line):
            break
        if p2.match(line):
            # Only lines starting with "|" are kept; continuation lines of a
            # multi-line value are not collected.
            lst.append(line)
    if p1.match(line):
        fg = True
for l in lst:
    # "|name = value": whitespace around the name and around "=" is not part of
    # either capture, and the value may be empty.
    a = re.search(r'\|\s*(.+?)\s*=\s*(.*)', l)
    if a is None:
        # A "|" line without "=" is not a field definition; skip it instead of failing.
        continue
    dct[a[1]] = a[2]
print(dct)

### 26. 強調マークアップの除去

In [None]:
def remove_stress(v):
    """Remove MediaWiki emphasis markup (runs of two or more apostrophes) from v.

    A lone apostrophe is ordinary text (e.g. "God's") and is left untouched.

    Args:
        v: Text that may contain emphasis quotes.

    Returns:
        The text with every run of two or more apostrophes removed.
    """
    return re.sub(r"'{2,}", '', v)

In [None]:
# Same template extraction as before, with emphasis markup removed from every value.
p1 = re.compile(r'\{\{基礎情報')  # line that opens the template
p2 = re.compile(r'\|')            # a field line starts with "|"
p3 = re.compile(r'\}\}')          # line that closes the template
fg = False  # True once the opening line has been seen
lst = []
dct = {}
for line in ukList:
    if fg:
        if p3.match(line):
            break
        if p2.match(line):
            lst.append(line)
    if p1.match(line):
        fg = True
for l in lst:
    # "|name = value": whitespace around the name and around "=" is not captured,
    # and the value may be empty.
    a = re.search(r'\|\s*(.+?)\s*=\s*(.*)', l)
    if a is None:
        # A "|" line without "=" is not a field definition; skip it instead of failing.
        continue
    dct[a[1]] = remove_stress(a[2])  # the step added in this exercise
print(dct)

### 27. 内部リンクの除去

In [None]:
def remove_link(v):
    """Replace internal links with their visible text.

    "[[target]]" becomes "target" and "[[target|label]]" becomes "label".
    The optional "target|" part is not allowed to contain brackets, so a link
    without a label can never be merged with a later link on the same line.

    Args:
        v: Text that may contain [[...]] links.

    Returns:
        The text with every link replaced by what follows its first "|"
        (or by the whole link body when there is no "|").
    """
    link = re.compile(r'\[\[(?:[^\[\]|]*\|)?(.+?)\]\]')
    return link.sub(r'\1', v)

In [None]:
# Same template extraction as before, with emphasis markup and internal links
# removed from every value.
p1 = re.compile(r'\{\{基礎情報')  # line that opens the template
p2 = re.compile(r'\|')            # a field line starts with "|"
p3 = re.compile(r'\}\}')          # line that closes the template
fg = False  # True once the opening line has been seen
lst = []
dct = {}
for line in ukList:
    if fg:
        if p3.match(line):
            break
        if p2.match(line):
            lst.append(line)
    if p1.match(line):
        fg = True
for l in lst:
    # "|name = value": whitespace around the name and around "=" is not captured,
    # and the value may be empty.
    a = re.search(r'\|\s*(.+?)\s*=\s*(.*)', l)
    if a is None:
        # A "|" line without "=" is not a field definition; skip it instead of failing.
        continue
    dct[a[1]] = remove_link(remove_stress(a[2]))  # the step added in this exercise
pprint.pprint(dct)

### 28. MediaWikiマークアップの除去

In [None]:
def remove_mk(v):
    """Strip MediaWiki markup from one line of text.

    Steps, in order:
      1. drop <br> tags and self-closing <ref .../> tags (the text after them is kept),
      2. drop <ref>...</ref> footnotes; a footnote with no closing tag in v is
         dropped up to the end of the string,
      3. reduce each {{name|...|text}} template to its last argument,
      4. remove emphasis quotes and internal links via remove_stress / remove_link.

    Args:
        v: Text that may contain markup.

    Returns:
        The text with the markup above removed.
    """
    # Self-closing tags carry no content, so only the tag itself is removed.
    v = re.sub(r'<(?:ref|br)\b[^>]*/>', '', v)
    v = re.sub(r'<br\s*>', '', v)
    # Footnote body: up to the matching </ref>, or to the end when unterminated.
    v = re.sub(r'<ref\b[^>]*>.*?(?:</ref>|$)', '', v)

    # The pattern cannot cross "{" or "}", so two templates on one line are
    # handled separately. Repeating until nothing changes unwraps nested
    # templates from the inside out; every successful pass shortens the text.
    template = re.compile(r'\{\{(?:[^{}]*\|)?([^{}|]*)\}\}')
    while True:
        unwrapped = template.sub(r'\1', v)
        if unwrapped == v:
            break
        v = unwrapped

    return remove_link(remove_stress(v))  # steps from exercises 26 and 27

In [None]:
# Same template extraction as before, with all supported MediaWiki markup removed
# from every value.
p1 = re.compile(r'\{\{基礎情報')  # line that opens the template
p2 = re.compile(r'\|')            # a field line starts with "|"
p3 = re.compile(r'\}\}')          # line that closes the template
fg = False  # True once the opening line has been seen
lst = []
dct = {}
for line in ukList:
    if fg:
        if p3.match(line):
            break
        if p2.match(line):
            lst.append(line)
    if p1.match(line):
        fg = True
for l in lst:
    # "|name = value": whitespace around the name and around "=" is not captured,
    # and the value may be empty.
    a = re.search(r'\|\s*(.+?)\s*=\s*(.*)', l)
    if a is None:
        # A "|" line without "=" is not a field definition; skip it instead of failing.
        continue
    dct[a[1]] = remove_mk(a[2])  # the step added in this exercise
pprint.pprint(dct)

### 29. 国旗画像のURLを取得する

In [None]:
import requests

# Ask the MediaWiki API for the URL of the flag image named in the infobox.
S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"
PARAMS = {
    "action": "query",
    "format": "json",
    "titles": "File:" + dct['国旗画像'],
    # imageinfo with iiprop=url returns the file's URL; prop=images would only
    # list the titles of images embedded in the page.
    "prop": "imageinfo",
    "iiprop": "url",
}
# Fail fast on a hung connection or an HTTP error instead of parsing a bad response.
R = S.get(url=URL, params=PARAMS, timeout=10)
R.raise_for_status()
DATA = R.json()

PAGES = DATA['query']['pages']

for k, v in PAGES.items():
    # A page entry without image information is skipped rather than raising KeyError.
    for info in v.get('imageinfo', []):
        print(info['url'])