# MAKING OF TINY DANTE
Un dataset sul modello di Tiny Shakespeare: https://www.tensorflow.org/datasets/catalog/tiny_shakespeare

In [79]:
!pip install requests bs4

Defaulting to user installation because normal site-packages is not writeable


## Struttura della Divina Commedia
Inferno: canti 1–34

Purgatorio: canti 1–33

Paradiso: canti 1–33

In [80]:
# Base URL of the work; the URL of each cantica is the base plus "/<Cantica name>".
link = "https://it.wikisource.org/wiki/Divina_Commedia"
link_inferno, link_purgatorio, link_paradiso = (
    f"{link}/{cantica}" for cantica in ("Inferno", "Purgatorio", "Paradiso")
)

In [81]:
def arab_to_roman_number(n):
    """
    Return the Roman numeral for the Arabic number ``n``.

    Examples:
        1  -> I
        5  -> V
        10 -> X
        23 -> XXIII
        50 -> L

    Values below 1 produce an empty string, because no symbol is ever emitted.
    """
    # Ordered from largest to smallest, including the subtractive pairs (CM, XC, IV, ...),
    # so a greedy pass always emits the canonical spelling.
    numerals = (
        (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
        (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
        (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
    )
    pieces = []
    remaining = n
    for value, symbol in numerals:
        # Emit the symbol as many times as its value still fits.
        while remaining >= value:
            pieces.append(symbol)
            remaining -= value
    return ''.join(pieces)
    

In [82]:
# Sanity checks for arab_to_roman_number: each Arabic input must map to the expected numeral.
expected_numerals = {
    1: 'I',
    5: 'V',
    10: 'X',
    23: 'XXIII',
    33: 'XXXIII',
    34: 'XXXIV',
    50: 'L',
}
for arabic, expected in expected_numerals.items():
    assert arab_to_roman_number(arabic) == expected


In [83]:
from pathlib import Path

import requests
from bs4 import BeautifulSoup

def get_chapter_links(link, n):
    """Build the URL of canto number ``n`` below the page ``link``.

    Despite the plural name, a single string is returned: ``link`` followed by
    ``/Canto_`` and the Roman numeral produced by ``arab_to_roman_number(n)``.
    """
    canto_page = "Canto_" + arab_to_roman_number(n)
    return "/".join((link, canto_page))


def link_to_soup(link, timeout=30):
    """Download ``link`` with an HTTP GET and parse the body as HTML.

    Args:
        link: URL of the page to fetch.
        timeout: seconds to wait for the server. Without a timeout ``requests``
            may block indefinitely on an unresponsive host.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with ``html.parser``.

    Raises:
        Exception: if the response status code is anything other than 200.
        requests.exceptions.RequestException: propagated from ``requests.get``
            (connection failure, timeout, ...).
    """
    r = requests.get(link, timeout=timeout)
    if r.status_code != 200:
        # The exception carries the URL so a failure inside a long scraping loop
        # points straight at the offending page.
        raise Exception(f"Error: status code {r.status_code} for {link}")
    return BeautifulSoup(r.text, 'html.parser')

def soup_to_text(soup):
    """Extract the poem text from a parsed page.

    Looks up the first ``<div class="poem">`` in ``soup``, deletes every
    ``<span class="numeroriga">`` inside it and returns the remaining text.

    Note: the spans are removed in place, so ``soup`` itself is modified.

    Args:
        soup: parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        The text content of the poem block without the ``numeroriga`` spans.

    Raises:
        ValueError: if the page contains no ``<div class="poem">``.
    """
    # get main part
    poem = soup.find("div", {"class": "poem"})
    if poem is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError('no <div class="poem"> element found in the page')

    # removes every span.numeroriga
    for span in poem.find_all("span", {"class": "numeroriga"}):
        span.decompose()

    return poem.text

def semplificator(text, also_xa0):
    """Return ``text`` with a fixed set of special characters replaced.

    The listed accented letters become their unaccented form and the curly
    single quotes become a straight apostrophe. Characters that are not listed
    are left untouched.

    Args:
        text: string to simplify.
        also_xa0: when truthy, non-breaking spaces (``\\xa0``) are also turned
            into regular spaces.

    Returns:
        The simplified string.
    """
    replacements = [
        ('ä', 'a'),
        ('ó', 'o'),
        ('à', 'a'),
        ('Ë', 'E'),
        ('ï', 'i'),
        ('Ï', 'I'),
        ('ö', 'o'),
        ('ë', 'e'),
        ('‘', "'"),
        ('’', "'"),
    ]
    if also_xa0:
        replacements.append(('\xa0', ' '))

    simplified = text
    for special_char, plain_char in replacements:
        simplified = simplified.replace(special_char, plain_char)
    return simplified

def link_to_text(link):
    """Download the page at ``link`` and return its simplified poem text.

    Chains ``link_to_soup`` -> ``soup_to_text`` -> ``semplificator``, with
    non-breaking spaces also converted to regular spaces.
    """
    soup = link_to_soup(link)
    raw_text = soup_to_text(soup)
    return semplificator(raw_text, also_xa0=True)

In [84]:
def _cantica_links(cantica, base_link, n_cantos):
    """Map ``dc_<cantica>_<i>`` (the future filename) to the URL of canto ``i``, for i in 1..n_cantos."""
    return {
        f"dc_{cantica}_{i}": get_chapter_links(base_link, i)
        for i in range(1, n_cantos + 1)
    }


# key: future_filename, value: link
all_inferno = _cantica_links("inferno", link_inferno, 34)
all_purgatorio = _cantica_links("purgatorio", link_purgatorio, 33)
all_paradiso = _cantica_links("paradiso", link_paradiso, 33)

# Single mapping covering the whole work, in reading order.
all_divina_commedia = {
    filename: url
    for cantica_links in (all_inferno, all_purgatorio, all_paradiso)
    for filename, url in cantica_links.items()
}


In [85]:
folder_name = "divina_commedia_parts"
# Create the output folder from Python: unlike `!mkdir` this works on every
# platform and does not fail when the folder already exists on a re-run.
parts_dir = Path(folder_name)
parts_dir.mkdir(parents=True, exist_ok=True)

# Download every canto and store it in its own file.
for filename, link in all_divina_commedia.items():
    # Fetch before opening the file, so a failed download does not leave an
    # empty file behind.
    canto_text = link_to_text(link)
    # Explicit UTF-8: the scraped text may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    with open(parts_dir / f"{filename}.txt", "w", encoding="utf-8") as f:
        f.write(canto_text)

# also store everything in one file
# by reading already stored files
with open("dc_all.txt", "w", encoding="utf-8") as f:
    for filename in all_divina_commedia:
        with open(parts_dir / f"{filename}.txt", "r", encoding="utf-8") as f2:
            f.write(f2.read())