## インターネット上のデータを扱う

- [urllib.parse --- URL を構成要素に解析する](https://docs.python.org/ja/3.13/library/urllib.parse.html)


### URL をパースする - urllib.parse


In [1]:
from urllib import parse

result = parse.urlparse("https://akagikouzanh.tech/")
result

ParseResult(scheme='https', netloc='akagikouzanh.tech', path='/', params='', query='', fragment='')

In [2]:
result.geturl()

'https://akagikouzanh.tech/'

In [3]:
result.scheme

'https'

In [4]:
result[0]

'https'

In [5]:
result.hostname

'akagikouzanh.tech'

In [6]:
result.path

'/'

In [8]:
# 「#」以降が返ってくる(今回はない)
result.fragment

''

In [10]:
# 「;」以降の文字列(今回はない)
result.params

''

In [16]:
# クエリ文字をパースする
result = parse.urlparse(
    "https://www.google.co.jp/search?q=python&oq=python&sourceid=chrome&ie=UTF-8"
)

In [17]:
result.query

'q=python&oq=python&sourceid=chrome&ie=UTF-8'

In [18]:
parse.parse_qs(result.query)

{'q': ['python'], 'oq': ['python'], 'sourceid': ['chrome'], 'ie': ['UTF-8']}

In [19]:
parse.parse_qs("key=1&key=2")

{'key': ['1', '2']}

In [20]:
parse.parse_qsl(result.query)

[('q', 'python'), ('oq', 'python'), ('sourceid', 'chrome'), ('ie', 'UTF-8')]

In [21]:
parse.parse_qsl("key=1&key=2")

[('key', '1'), ('key', '2')]

In [24]:
parse.parse_qs("key1=&key2=2")

{'key2': ['2']}

In [25]:
parse.parse_qs("key1=&key2=2", keep_blank_values=True)

{'key1': [''], 'key2': ['2']}

### クエリ文字を組み立てる - urlencode()


In [26]:
parse.urlencode({"key1": 1, "key2": 2, "key3": "ぱいそん"})

'key1=1&key2=2&key3=%E3%81%B1%E3%81%84%E3%81%9D%E3%82%93'

In [27]:
parse.urlencode([("key1", 1), ("key2", 2), ("key3", "ぱいそん")])

'key1=1&key2=2&key3=%E3%81%B1%E3%81%84%E3%81%9D%E3%82%93'

In [30]:
# doseq を指定しないと、リスト ["foo", "bar"] 全体が 1 つの文字列として扱われる
query = {"key1": 1, "key2": ["foo", "bar"]}
parse.urlencode(query)

'key1=1&key2=%5B%27foo%27%2C+%27bar%27%5D'

In [31]:
# 1つのキーに複数の値が存在すると解釈される
parse.urlencode(query, doseq=True)

'key1=1&key2=foo&key2=bar'

In [32]:
query = {"key1": " "}
parse.urlencode(query)

'key1=+'

In [33]:
parse.urlencode(query, quote_via=parse.quote)

'key1=%20'

### URL として使用できる文字列に変換 - quote(), quote_plus()


In [34]:
url = "https://ja.wikipedia.org/wiki/パイソン"
parse.quote(url)

'https%3A//ja.wikipedia.org/wiki/%E3%83%91%E3%82%A4%E3%82%BD%E3%83%B3'

In [35]:
parse.quote_plus(url)

'https%3A%2F%2Fja.wikipedia.org%2Fwiki%2F%E3%83%91%E3%82%A4%E3%82%BD%E3%83%B3'

In [36]:
parse.quote("_.-~")

'_.-~'

In [39]:
# 3.6 以前でチルダがエンコードされないようにするには safe を指定する
parse.quote_plus("_.-~", safe="/~")

'_.-~'

### URL を結合する - urljoin()


In [41]:
parse.urljoin("https://ja.wikipedia.org", "/wiki/Python")

'https://ja.wikipedia.org/wiki/Python'

In [44]:
parse.urljoin("https://ja.wikipedia.org/wiki/Python", "#ライブラリ")

'https://ja.wikipedia.org/wiki/Python#ライブラリ'

In [45]:
# 相対パスを渡すと、ベース URL を基準に解決された URL が返る
parse.urljoin("https://ja.wikipedia.org/test/path", "../../wiki/Python")

'https://ja.wikipedia.org/wiki/Python'

In [46]:
parse.urljoin("https://www.python.org/", "https://www.example.com")

'https://www.example.com'

In [47]:
qs = "q=python&oq=python;sourceid=chrome&ie=UTF-8"
parse.parse_qs(qs)

{'q': ['python'], 'oq': ['python;sourceid=chrome'], 'ie': ['UTF-8']}

In [50]:
# separator 引数は Python 3.10 で追加された(3.9.2 などのセキュリティ修正版にもバックポート)。それより古いバージョンでは使えないため注意が必要
parse.parse_qs(qs, separator=";")

{'q': ['python&oq=python'], 'sourceid': ['chrome&ie=UTF-8']}

In [49]:
# The separator is matched as the literal string "&;", not as a set of
# characters; qs contains no "&;" sequence, so nothing is split here.
parse.parse_qs(qs, separator="&;")

{'q': ['python&oq=python;sourceid=chrome&ie=UTF-8']}

## URL を開く - urllib.request


In [55]:
from urllib import request

with request.urlopen("https://httpbin.org/get") as f:
    res = f.read()[:92]

res

b'{\n  "args": {}, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Host": "httpbin.org'

In [57]:
# Download the image inside a context manager so the HTTP response (and its
# underlying connection) is closed as soon as the body has been read.
with request.urlopen("https://httpbin.org/image/jpeg") as response:
    file_data = response.read()

# Binary mode: the payload is raw JPEG bytes. The "tmpfiles" directory must
# already exist; open() does not create missing directories.
with open("tmpfiles/test.jpg", "wb") as f:
    f.write(file_data)

In [58]:
data = "key1=value1&key2=value2"
# Supplying `data` makes urlopen() issue a POST; the body must be bytes, hence
# the encode(). The context manager closes the response when the block exits.
with request.urlopen("https://httpbin.org/post", data=data.encode()) as res:
    status = res.status
status

200

### GET, POST 以外の HTTP メソッドを扱う

- [class urllib.request.Request](https://docs.python.org/ja/3.13/library/urllib.request.html#urllib.request.Request)


In [73]:
# In real code, wrap this in try/except (urllib.error.HTTPError / URLError)
# so that failures are handled properly.
# Request(..., method="DELETE") overrides the default verb chosen by urllib.
req = request.Request("https://httpbin.org/delete", data=data.encode(), method="DELETE")
with request.urlopen(req) as f:
    res_body = f.read()[:110]
    # Take the status from this DELETE response, not from a previous response.
    res_status = f.status

print(res_status)
print(res_body)

200
b'{\n  "args": {}, \n  "data": "", \n  "files": {}, \n  "form": {\n    "key1": "value1", \n    "key2": "value2"\n  }, \n'


In [106]:
# Build a Request carrying a custom Accept header and inspect its attributes.
# Nothing is sent here: the cell only constructs the object.
# NOTE(review): despite its name, `res` holds a Request, not a response.
headers = {"Accept": "application/json"}
res = request.Request("https://httpbin.org/get", headers=headers)
res.__dict__

{'_full_url': 'https://httpbin.org/get',
 'fragment': None,
 'type': 'https',
 'host': 'httpbin.org',
 'selector': '/get',
 'headers': {'Accept': 'application/json'},
 'unredirected_hdrs': {},
 '_data': None,
 '_tunnel_host': None,
 'origin_req_host': 'httpbin.org',
 'unverifiable': False}

In [78]:
print(res.headers)

{'Accept': 'application/json'}


## Base16, Base64 などへエンコードする - base64


In [114]:
import base64

s = "Pythonは簡単に学習でき、それでいて強力な言語の一つです。"
# Intentional failure: b64encode() needs a bytes-like object, so passing a
# str raises TypeError. Encode the string to bytes first.
base64.b64encode(s)

TypeError: a bytes-like object is required, not 'str'

In [122]:
bs = base64.b64encode(s.encode())
bs

b'UHl0aG9u44Gv57Ch5Y2Y44Gr5a2m57+S44Gn44GN44CB44Gd44KM44Gn44GE44Gm5by35Yqb44Gq6KiA6Kqe44Gu5LiA44Gk44Gn44GZ44CC'

In [None]:
base64.b64encode(s.encode(), altchars=b"@*")

b'UHl0aG9u44Gv57Ch5Y2Y44Gr5a2m57@S44Gn44GN44CB44Gd44KM44Gn44GE44Gm5by35Yqb44Gq6KiA6Kqe44Gu5LiA44Gk44Gn44GZ44CC'

In [123]:
base64.b64decode(bs).decode()

'Pythonは簡単に学習でき、それでいて強力な言語の一つです。'

In [130]:
data = "key1=value1&key2=value2"

# POST the form-encoded body; the context manager closes the response on exit.
# NOTE(review): `req` is the response object returned by urlopen(), not a Request.
with request.urlopen("https://httpbin.org/post", data=data.encode()) as req:
    print(req.status)
    print(req.read()[:100])  # only the first 100 bytes of the body

200
b'{\n  "args": {}, \n  "data": "", \n  "files": {}, \n  "form": {\n    "key1": "value1", \n    "key2": "valu'
