In [101]:
from bs4 import BeautifulSoup
import csv
import os
import re

def get_article_html(html_path):
    """Return the raw HTML of the ``articleBody`` div found in a file.

    The file is scanned line by line. Collection starts at a line that begins
    with ``<div itemprop="articleBody">`` and stops as soon as the div depth
    drops back to zero, i.e. at the closing ``</div>`` that matches it.

    Nesting is tracked per line: only a ``<div`` or ``</div>`` at the very
    start of a line is counted, so a line holding several div tags, or an
    indented tag, is not seen by the depth counter.

    Pilcrow characters are removed from every collected line.

    Args:
        html_path: Path of the HTML file to read (decoded as UTF-8).

    Returns:
        The collected lines joined into one string, or "" when no line
        starts an article body.
    """
    collecting = False
    open_divs = 0
    lines = []
    with open(html_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith('<div itemprop="articleBody">'):
                # Only switch collection on here. The opening tag is counted
                # by the generic "<div" check below; counting it here as well
                # would leave the depth one too high, so collection would run
                # past the article's own closing </div>.
                collecting = True
            if not collecting:
                continue

            line = line.replace('¶', '')
            lines.append(line)

            if line.startswith("<div"):
                open_divs += 1
            elif line.startswith("</div>"):
                open_divs -= 1
            if open_divs < 1:
                # Matching close reached: stop until another article body starts.
                collecting = False
    return "".join(lines)

def get_html_as_plaintext(html, parser="html.parser"):
    """Strip the markup from an HTML string and return its text content.

    Args:
        html: HTML markup to convert.
        parser: Name of the BeautifulSoup parser to use. It is passed
            explicitly so the result does not depend on which optional
            parsers happen to be installed, and so BeautifulSoup does not
            have to guess one.

    Returns:
        The text of the document as produced by ``BeautifulSoup.get_text()``.
    """
    soup = BeautifulSoup(html, parser)
    return soup.get_text()

global_cleans = []
# Passages from every article are appended to this single module-level list
# (convenient and efficient: no per-article lists have to be merged later).
def clean_article_plaintext(text):
    """Split an article's plain text into passages and append them to global_cleans.

    Leading blank lines are dropped, as is a leading block that starts with
    "Attention: Here be dragons". The remaining lines are cut into passages
    wherever three blank lines follow each other. Each passage is stored as a
    ``(id, "passage: <text>")`` tuple, where ``id`` is its 1-based position
    in ``global_cleans``.

    Args:
        text: Plain text of one article.

    Returns:
        None. The result is the rows appended to ``global_cleans``.
    """
    def _is_blank(candidate):
        # Empty or whitespace-only.
        return not candidate or candidate.isspace()

    def _store(block):
        # Number the passage by its position in the shared list.
        passage = "\n".join(block)
        global_cleans.append((len(global_cleans) + 1, f"passage: {passage}"))

    rows = text.split("\n")
    total = len(rows)

    # Find the first line of real content.
    start = 0
    while start < total:
        current = rows[start]
        if not _is_blank(current):
            if not current.startswith("Attention: Here be dragons"):
                break
            # Jump over the warning banner (this line plus the 7 after it,
            # together with the increment below).
            start += 7
        start += 1

    block = []
    pending_skips = 0
    for idx in range(start, total):
        if pending_skips >= 1:
            pending_skips -= 1
        else:
            block.append(rows[idx])

        # Three blank lines ahead end the current passage; they are skipped.
        if idx + 3 < total and all(_is_blank(r) for r in rows[idx + 1:idx + 4]):
            pending_skips = 3
            if block:
                _store(block)
                block = []

    # Whatever is left over forms the last passage.
    if block:
        _store(block)

def is_html(filename):
    """Tell whether a file name ends with the (lower-case) ``.html`` suffix."""
    suffix = ".html"
    name = str(filename)
    return name[-len(suffix):] == suffix

def recurse_html(path):
    """Process every ``.html`` file under ``path``, descending into subdirectories.

    For each HTML file the article body is extracted, converted to plain text
    and split into passages (which ``clean_article_plaintext`` appends to
    ``global_cleans``).

    Args:
        path: Directory to walk.

    Returns:
        None.
    """
    # os.listdir returns entries in arbitrary order. Passage ids are assigned
    # in processing order, so sort the entries to make the ids reproducible
    # from one run (or machine) to the next.
    for filename in sorted(os.listdir(path)):
        file = os.path.join(path, filename)
        if os.path.isdir(file):
            recurse_html(file)
        elif is_html(filename):
            article_html = get_article_html(file)
            plaintext = get_html_as_plaintext(article_html)
            clean_article_plaintext(plaintext)

# Gather passages from every HTML page below the working directory, then
# dump them to a two-column CSV (id, section).
html_path = "."
recurse_html(html_path)

with open("godot_doc_sections.csv", "w", encoding='utf-8', newline="") as f:
    csv_out = csv.writer(f)
    csv_out.writerow(("id", "section"))
    for row in global_cleans:
        try:
            csv_out.writerow(row)
        except Exception as e:
            # Best effort: report the row that could not be written and carry on.
            print(str(e), row, sep="\n")

In [103]:
import csv
# Read the CSV back and print every row together with the length of its
# second column. newline="" hands line-ending handling to the csv module, so
# line breaks embedded in quoted section text come back exactly as written.
with open("godot_doc_sections.csv", "r", encoding='utf-8', newline="") as f:
    spamreader = csv.reader(f, delimiter=',')
    for row in spamreader:
        print(len(row[1]), row)

7 ['id', 'section']
472 ['1', "passage: Page not found\n\n            Sorry, we couldn't find that page. It may have been renamed or removed\n            in the version of the documentation you're currently browsing.\n        \n\n            If you're currently browsing the\n            latest version of the documentation, try browsing the\n            stable version of the documentation.\n        \n\n            Alternatively, use the\n            Search docs\n            box on the left or go to the homepage."]
1547 ['3', 'passage: Requirements\nIn the case of the MIT license, the only requirement is to include the license\ntext somewhere in your game or derivative project.\nThis text reads as follows:\n\nThis game uses Godot Engine, available under the following license:\nCopyright (c) 2014-present Godot Engine contributors.\nCopyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.\nPermission is hereby granted, free of charge, to any person obtaining a copy of this software and associ