In [1]:
# this notebook will use beautiful soup to extract text data from html files
# each file will have the title as the first line and each paragraph as new lines

In [26]:
from bs4 import BeautifulSoup
import glob

In [27]:
def get_soup_from_html(file_path):
    with open(file_path, "r+") as f:
        html = f.read()
    soup = soup = BeautifulSoup(html, 'html.parser')
    return soup

In [32]:
def parse_chapter_html(html_file_path) -> (str, []):
    soup = get_soup_from_html(html_file_path)
    title = soup.find('h1', {'id': 'nr_title'}).text
    main_div = soup.find('div', {'id': 'nr1'})
    paragraphs = main_div.findChildren('p')
    paragraphs_text = [p.text for p in paragraphs]
    return title, paragraphs_text

In [33]:
def get_chapters_for_book(book_path):
    chapters = glob.glob(f"../data/books/{book_path}/html/*")
    chapter_count = len(chapters)+1
    chapters = [f"../data/books/{book_path}/html/{i}_*" for i in range(1, chapter_count)]
    chapters = [glob.glob(pattern) for i, pattern in enumerate(chapters)]
    chapters = [c[0] for c in chapters]
    return chapters

In [38]:
def write_chapter_text_file(book_path):
    chapters = get_chapters_for_book(book_path)
    for chapter in chapters:
        title, paragraphs = parse_chapter_html(chapter)
        with open(f"../data/books/{book_path}/text/{chapter.split('/')[-1]}", "w+") as f:
            f.write(title)
            f.write("\n")
            for p in paragraphs:
                if "鲲" not in p and "弩" not in p:
                    f.write(p)
                    f.write("\n")
                else:
                    print(p)


In [39]:
write_chapter_text_file("three_body")

鲲 + 弩 + 小 + 說 +  k u n n u ~ co m-
🐴 鲲 kun弩nu小 xiao说 shuo = w w w * ku n Nu * co m
鲲·弩^小·说
 鲲 # 弩 # 小 # 说 #   w ww # ku n Nu # co m
🍟 鲲|弩|小|说|w w w |k u n n u | co m|
 +鲲-弩+小-说 ·
🥑 鲲=弩=小=说~w w w =k u n n u = c om
 鲲*弩*小*说ww w_k u n n u_c o m _
💦 鲲 | 弩 | 小 | 说 | w w w |k u n n u | co M|
鲲·弩^小·说
🐢 鲲|弩|小|说|ww w |k u n n u | co M|
 -鲲-弩-小-说w ww ^ k u n n u^ c o m. 
鲲~弩~小~说~w w w -k u n n u - co m 💨
🐬 鲲 = 弩 = 小 = 说~w w w =k u n n u = c om
🐷 鲲`弩`小`说w w w . ku n Nu . c o m .
🍓 鲲 # 弩 # 小 # 說 #  w ww # ku n Nu # Co M
🐆 鲲l弩x小x说s = w w w * ku n Nu * co m
🦀 鲲。弩。小。说。w ww…k u n N u…co m
🤡 鲲`弩-小`说ww w ，K u n N u ，c o m
 🌵 鲲+弩-小+說k u n n u - c o m +
 鲲~弩~小~说~w ww -k u n n u - Co m
💐 鲲l 弩x 小x 说s =  Ww w * k u n n u * co m
👓 鲲·弩+小·说w ww - k u n n u - c om- 
 鲲 # 弩 # 小 # 说 #   w ww # ku n Nu # co m
 鲲+弩+小+说+  w w w ~ k u n n u ~ co m-
鲲^弩^小^说 🐪 w w w*k u n n u*c o m *
🍵 鲲 · 弩 + 小 · 说 w Ww - k u n n u - c om- 


In [40]:
write_chapter_text_file("dark_forest")

🍀 鲲*弩*小*说* W ww … ku n Nu … c om
 +鲲-弩+小-說 🍏 w ww· k u n n u· c om·
🍄 鲲·弩^小·说w w w…k u n N u…c o  m …
鲲~弩~小~说~k u n n u - co m 💨
。鲲。弩。小。说。🍒 w ww…k u n N u…co m
鲲 + 弩 + 小 + 說 +  k u n n u ~ co m-
👻 鲲·弩^小·说w W W…k u n N u…c o  m …
 鲲 # 弩 # 小 # 说 #   w ww # ku n Nu # co m
鲲^弩^小^说…
 鲲`弩-小`说  Ww w # K u n N u # c o m
🐏 鲲*弩*小*说ww w_k u n n u_c o m _
💦 鲲 | 弩 | 小 | 说 | w w w |k u n n u | co M|
 鲲 # 弩 # 小 # 说 #   w ww # ku n Nu # co m
鲲·弩+小·说 - k u n n u - c om 
 鲲*弩*小*说ww w_k u n n u_c o m _
🐷 鲲`弩`小`说w w w . ku n Nu . c o m .
🍄 鲲·弩^小·说w w w…k u n N u…c o  m …
🍎 鲲l弩x小x说s = w w w * ku n Nu * Co m
鲲=弩=小=说
鲲·弩^小·说 🐣 w w w…k u n N u…c O  m …
💄 鲲^弩^小^说 w w w*k u n n u*c o m *
 鲲·弩^小·说  w w w…k u n N u…c O  m …
 鲲*弩*小*说* W ww … ku n Nu … c om
 鲲`弩`小`说w w w . ku n Nu . c o m .
🌽 鲲~弩~小~说~w w w -k u n n u - co m
👓 鲲·弩+小·说w ww - k u n n u - c om- 
 +鲲-弩+小-说  w ww· k u n n u· C om ·
 鲲*弩*小*说 🌳 ww w_k u n n u_c o m _
·鲲·弩…小·说 🍕 w w w_ku n Nu_c o m
鲲*弩*小*说* 🐱 … K u n N u … c om
·鲲·弩·小·说 🦄 w w w_k u n n u_c 

In [41]:
write_chapter_text_file("deaths_end")

 鲲·弩+小·说  w ww - k u n n u - c oM- 
鲲·弩^小·说
鲲`弩-小`说 🌕 Ww w # K u n N u # c o m
🐸 鲲。弩。小。说。w ww…k u n N u…co m
-鲲-弩-小-说w ww ^ k u n n u^ c o m. 🍌
💑 鲲=弩=小=说~w w w =k u n n u = C om
 鲲·弩*小·说  ww w · k u n n u ·  Om
鲲·弩^小·说 🐣 w w w…k u n N u…c O  m …
火龙出水、连发弩和阶梯计划
火龙出水和连发弩没能发挥两级导弹和机关枪的作用，同样，阶梯计划也难以把人类带入宇航新时代，它只是用当时的技术所进行的孤注一掷的努力。
 鲲*弩*小*说* W ww … ku n Nu … c om
🍟 鲲|弩|小|说|w w w |k u n n u | co m|
 鲲|弩|小|说|ww w |k u n n u | co M|
 鲲·弩+小·说  w ww - k u n n u - c oM- 
🐷 鲲`弩`小`说w w w . ku n Nu . c o m .
鲲*弩*小*说* 🐱 … K u n N u … c om
🤡 鲲`弩-小`说
🍵 鲲 · 弩 + 小 · 说 w Ww - k u n n u - c om- 
鲲^弩^小^说…
 鲲 = 弩 = 小 = 说~w w w =k u n n u = c om
-鲲-弩-小-说w ww ^ k u n n u^ c o m. 🍌
 鲲*弩*小*说k u n n u_c o m _
-鲲-弩-小-说w ww ^ k u n n u^ c o m. 🌂
🍇 鲲`弩`小`说w w w . ku n Nu . c o m .
🍄 鲲·弩^小·说w w w…k u n N u…c o  m …
·鲲·弩…小·说 🍕 w w w_ku n Nu_c o m
鲲~弩~小~说~w w w -k u n n u - co m 💨
🍅 鲲*弩*小*说* w WW … K u n N u … c om
🐹 鲲+弩-小+说+ ww w +k u n n u - c o m +
🍋 鲲*弩*小*说ww w_k u n n u_c o m _
·鲲·弩·小·说 🍊 w w w_ku n Nu_c o m
 鲲*弩*小*