api_crawler.py
import json
import sys
from pathlib import Path
from typing import Dict, List, Optional, Set

import requests

import common  # local helper module (provides is_namespace_title; not shown here)


class WikipediaLinkCrawler:
    """Fetches the outgoing wiki links of a page via the MediaWiki API."""

    url = "https://en.wikipedia.org/w/api.php"
    # Base query parameters: request the links on a page, as many per
    # response as the API allows ("max"); pagination is handled below.
    params = {
        "action": "query",
        "format": "json",
        "prop": "links",
        "pllimit": "max",
    }
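
    # For reference, an abridged response to this query looks roughly like the
    # following (shape per the public MediaWiki API; values are illustrative):
    #
    #   {
    #     "continue": {"plcontinue": "736|0|Algebra", "continue": "||"},
    #     "query": {
    #       "pages": {
    #         "736": {
    #           "pageid": 736, "ns": 0, "title": "Albert Einstein",
    #           "links": [{"ns": 0, "title": "20th century"}, ...]
    #         }
    #       }
    #     }
    #   }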

    def __init__(self, cache_file: Optional[str] = None):
        self.session = requests.Session()
        self.cache_file = cache_file
        if cache_file:
            self._read_cache(Path(cache_file))
        else:
            self.cache: Dict[str, List[str]] = {}

    def close(self) -> None:
        # Persist the in-memory cache back to disk, if one was configured.
        if self.cache_file:
            with open(self.cache_file, "w", encoding="utf-8") as file:
                json.dump(self.cache, file)

    def _read_cache(self, cache_file: Path):
        if not cache_file.exists():
            cache_file.write_text("{}", encoding="utf-8")
        with open(cache_file, encoding="utf-8") as file:
            self.cache = json.load(file)
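
    # The cache file is a flat JSON object mapping page titles to their
    # already-filtered link lists, e.g. (titles illustrative):
    #
    #     {"Logic": ["Abstraction", "Ambiguity", ...], "Philosophy": [...]}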

    def get_cached_links(self, title: str) -> List[str]:
        return self.cache.get(title, [])

    def write_cached_links(self, title: str, links: List[str]):
        self.cache[title] = links

    def should_include_title(self, title: str):
        # Skip non-article pages, i.e. titles in a namespace such as
        # "Category:..." or "Help:...".
        return not common.is_namespace_title(title)
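
    # `common` is not included in this file. A minimal sketch of what
    # `is_namespace_title` might look like (an assumption, not the actual
    # implementation):
    #
    #     NAMESPACES = {"Wikipedia", "Category", "Help", "Template", "Portal",
    #                   "File", "Talk"}
    #
    #     def is_namespace_title(title: str) -> bool:
    #         # Namespaced titles carry a colon-separated prefix,
    #         # e.g. "Category:Logic".
    #         prefix, _, rest = title.partition(":")
    #         return bool(rest) and prefix in NAMESPACES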

    # adapted from https://stackoverflow.com/a/57983365
    def get_linked_titles(self, title: str) -> List[str]:
        def add_links(links: List[str], pages: Dict[str, Dict]):
            for page in pages.values():
                if "missing" in page:
                    continue
                # A page entry may lack "links" entirely (e.g. a page with no
                # outgoing links, or a continuation batch carrying none).
                for link in page.get("links", []):
                    if self.should_include_title(link["title"]):
                        links.append(link["title"])

        # Note: a cached empty list is falsy and thus treated as a miss.
        links = self.get_cached_links(title)
        if links:
            return links
        params = {**self.params, "titles": title}
        response = self.session.get(url=self.url, params=params)
        data = response.json()
        pages = data["query"]["pages"]
        add_links(links, pages)
        # The API caps results per request; follow the "continue" token until
        # every batch of links has been fetched.
        while "continue" in data:
            params["plcontinue"] = data["continue"]["plcontinue"]
            response = self.session.get(url=self.url, params=params)
            data = response.json()
            pages = data["query"]["pages"]
            add_links(links, pages)
        self.write_cached_links(title, links)
        return links
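

# Usage sketch for WikipediaLinkCrawler on its own (assumes network access;
# output depends on live Wikipedia content):
#
#     crawler = WikipediaLinkCrawler("cached_links.json")
#     try:
#         print(crawler.get_linked_titles("Albert Einstein")[:3])
#     finally:
#         crawler.close()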


class PathToCrawler(WikipediaLinkCrawler):
    """Depth-first search for a chain of links from one article to another."""

    def _crawl(
        self,
        path: List[str],
        visited: Set[str],
        origin_title: str,
        target_title: str,
        limit: int,
        depth: int = 0,
    ) -> bool:
        print(f'crawling "{origin_title}"', file=sys.stderr)
        if origin_title == target_title:
            return True
        elif origin_title in visited:
            return False
        visited.add(origin_title)
        if depth + 1 > limit:
            return False
        titles = self.get_linked_titles(origin_title)
        for linked_title in titles:
            if self._crawl(
                path, visited, linked_title, target_title, limit, depth=depth + 1
            ):
                # Build the path while unwinding the recursion; it is in
                # reverse order until path_to() flips it.
                path.append(linked_title)
                return True
        return False

    def path_to(self, origin_title: str, target_title: str, limit=12) -> List[str]:
        path: List[str] = []
        visited: Set[str] = set()
        success = self._crawl(path, visited, origin_title, target_title, limit)
        if success:
            path.append(origin_title)
        path.reverse()
        return path
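

# For example (hedged; the actual intermediate titles depend on live Wikipedia
# content and on the order in which links are returned):
#
#     PathToCrawler().path_to("Logic", "Philosophy")
#     # -> ["Logic", <zero or more intermediate titles>, "Philosophy"]
#
# An empty list means no path was found within `limit` hops.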


class PhilosophyCrawler(PathToCrawler):
    """Specialization targeting the "Philosophy" article."""

    def path_to_philosophy(self, origin_title: str, limit=12) -> List[str]:
        return self.path_to(origin_title, "Philosophy", limit)


if __name__ == "__main__":
    titles = [
        "Wikipedia:Getting_to_Philosophy",
        "Foobar",
        "C (programming language)",
        # "Albert Einstein",
        # "Logic",
        # "Philosophy",
        # "Wikipedia",
    ]
    # Construct before the try block so that `finally` cannot raise a
    # NameError if the constructor itself fails.
    philosophy_crawler = PhilosophyCrawler("cached_links.json")
    try:
        for title in titles:
            path = philosophy_crawler.path_to_philosophy(title)
            print(path)
    finally:
        philosophy_crawler.close()
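
# Running the script directly (assumes `common.py` is importable and network
# access is available):
#
#     $ python api_crawler.py
#
# Progress lines go to stderr; each path (or an empty list, when no chain to
# "Philosophy" is found within the depth limit) is printed to stdout.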