-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_texts_tegner.py
48 lines (38 loc) · 1.28 KB
/
get_texts_tegner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import urllib.request
import re
import json
from bs4 import BeautifulSoup
def clean_text(text):
    """Extract the poem text from the HTML source of a poem page.

    The page is parsed with BeautifulSoup and the text of its first ``<p>``
    element is taken as the poem. Every run of two or more consecutive
    whitespace characters (line breaks included) is then removed entirely;
    isolated single whitespace characters are kept.

    Args:
        text: HTML source of the page, as a string.

    Returns:
        The text of the first paragraph with whitespace runs removed.

    Raises:
        IndexError: If the page contains no ``<p>`` element.
    """
    soup = BeautifulSoup(text, 'html.parser')
    paragraphs = soup.find_all('p')
    # So far the first paragraph usually contains the poem text, but this
    # might change in the future.
    poem = paragraphs[0].get_text()
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    # No re.DOTALL flag: it only changes the meaning of '.', which this
    # pattern does not use.
    return re.sub(r'\s{2,}', '', poem)
def get_names():
    """Read poem titles from ``tegner_saga.txt`` and turn them into URL names.

    The file is read as UTF-8 with one title per line. The words of each
    title are joined with underscores (``'Kung Bele'`` -> ``'Kung_Bele'``).
    Blank or whitespace-only lines, such as the one produced by a trailing
    newline, are skipped so that no empty name is returned.

    Returns:
        A list of underscore-joined titles, in file order.

    Raises:
        OSError: If ``tegner_saga.txt`` cannot be opened.
    """
    with open('tegner_saga.txt', 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    # str.split() without arguments splits on any whitespace run, so extra
    # spaces and stray '\r' characters never end up in a name.
    joined = ('_'.join(line.split()) for line in lines)
    # An empty name would make the downloader request the site root.
    return [name for name in joined if name]
def download_poems(poem_list):
    """Download and clean the text of every poem in ``poem_list``.

    Each name is appended to ``http://svenskadikter.com/`` and the page is
    fetched, decoded as UTF-8 and passed through ``clean_text``. Downloading
    is best-effort: a poem whose page cannot be fetched or parsed is
    reported on stdout and skipped, and the remaining poems are still
    processed.

    Args:
        poem_list: Iterable of underscore-joined poem names.

    Returns:
        A dict mapping each poem title (underscores replaced by spaces) to
        its cleaned text. Poems that failed are absent from the dict.
    """
    # Imported locally so the block does not depend on new file-level imports.
    import string
    from urllib.parse import quote

    source = 'http://svenskadikter.com/'
    poems = {}
    for poem in poem_list:
        try:
            # urlopen cannot send non-ASCII characters (e.g. the Swedish
            # letters in many titles) or spaces, so percent-encode them.
            # All ASCII punctuation is left untouched so names that worked
            # before produce exactly the same URL.
            url = source + quote(poem, safe=string.punctuation)
            # The context manager closes the HTTP response even on error.
            with urllib.request.urlopen(url) as page:
                html = page.read().decode('utf-8')
            poems[poem.replace('_', ' ')] = clean_text(html)
        except Exception as err:
            # Exception rather than a bare except, so Ctrl-C and SystemExit
            # still stop the run; report which poem failed and why.
            print('reading page failed: {} ({})'.format(poem, err))
    return poems
def main():
    """Download the poems listed by ``get_names`` and save them as JSON.

    The result of ``download_poems`` is written to
    ``tegner_frithiofs_saga_source_text.json`` in the current directory,
    encoded as UTF-8.
    """
    frithiofs_saga_poems = get_names()
    poems = download_poems(frithiofs_saga_poems)
    with open('tegner_frithiofs_saga_source_text.json', 'w', encoding='utf-8') as file:
        # The file is UTF-8, so write non-ASCII letters as themselves rather
        # than as \uXXXX escapes; the parsed content is the same either way.
        json.dump(poems, file, ensure_ascii=False)


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()