In [41]:
import requests
from bs4 import BeautifulSoup, Tag, NavigableString
import pandas as pd

base_url = "https://it.wikipedia.org/wiki/"

# grab  alist of fundamentalist wikis
lvl2 = f"{base_url}Wikipedia:Voci_fondamentali/Lista2"

In [42]:
def soup2urls(soup):
    content = soup.find("div", {"class": "mw-parser-output"})
    # remove the table of contents
    toc = content.find("div", {"class": "toc"})
    if toc: toc.decompose()
    
    # get all links
    links = content.find_all("a")
    # get the hrefs
    return [link.get("href") for link in links if link.get("href")]

def filter_urls(urls):
    to_remove = ["File", "Progetto", "Portale", "Speciale", "Categoria", "Wikipedia", "Aiuto", "Discussioni", "Templale", "Discussioni_progetto", "Discussioni_Wikipedia", "Template", "Utente", "Aiuto"]
    return set(filter(
        lambda h: not any([h.startswith(t+":") for t in to_remove]),
        urls)
        )

def soup2filtered_urls(soup, wikiprefix = "/wiki/"):
    try:
        hrefs = soup2urls(soup)
        hrefs = [href[len(wikiprefix) :] for href in hrefs if href.startswith(wikiprefix ) ]
        return filter_urls(hrefs)
    except:
        print("error for title" + soup.title.string)
        return set()


In [43]:
# decode url encoded characters
import urllib.parse
import re
import dataclasses


@dataclasses.dataclass
class Cleanupper:
    remove_index: bool = True
    remove_image: bool = True
    remove_h2: bool = True
    remove_h3: bool = True
    remove_h4: bool = True
    remove_double_space: bool = True
    remove_double_newline: bool = True
    remove_disambiguation: bool = True
    remove_see_also: bool = True
    remove_metadata_table: bool = True
    remove_inline_math: bool = True
    remove_quotes: bool = True
    remove_references_section: bool = True
    remove_cite: bool = True

    def remove_index_section(self, s: BeautifulSoup) -> BeautifulSoup:
        index = s.find('div', {'id': 'toc'})
        if index:
            s.find('div', {'id': 'toc'}).decompose()
        return s

    def __call__(self, s: Tag | NavigableString) -> str:        
        if self.remove_index:
            s = self.remove_index_section(s)
        if self.remove_image:
            for img in s.find_all('figure'):
                img.decompose()
        if self.remove_h2:
            for h2 in s.find_all('h2'):
                h2.decompose()
        if self.remove_h3:
            for h3 in s.find_all('h3'):
                h3.decompose()
        if self.remove_h4:
            for h4 in s.find_all('h4'):
                h4.decompose()
        if self.remove_disambiguation:
            for div in s.find_all('div', {"class": "nota-disambigua"}):
                div.decompose()
        if self.remove_see_also:
            for div in s.find_all('div', {"class": "vedi-anche"}):
                if div: div.decompose()
        if self.remove_metadata_table:
            for table in s.find_all('table', {"class": "metadata"}):
                table.decompose()
        if self.remove_inline_math:
            for math in s.find_all('span', {"class": "mwe-math-element"}):
                math.decompose()
        if self.remove_references_section:
            for section in s.find_all('ol', {"class": "references"}):
                section.decompose()
        if self.remove_cite:
            for cite in s.find_all('cite'):
                cite.decompose()

        text = s.text
        if self.remove_double_space:
            text = re.sub(r'\s\s+', ' ', text)
        if self.remove_double_newline:
            text = re.sub(r'\n\n+', '\n', text)
        if self.remove_quotes: # number between []
            text = re.sub(r'\[\d+\]', '', text)


        return text
    
def decode_url(url: str) -> str:
    return urllib.parse.unquote(url).replace("/", "_")


In [44]:

# kinda scraper
def soup2text(soup, cleanupper: Cleanupper = Cleanupper()) -> str:
    content = soup.find("div", {"class": "mw-parser-output"})
    return cleanupper(content)


def url2soup(url: str) -> BeautifulSoup:
    r = requests.get(url)
    return BeautifulSoup(r.text, "html.parser")



In [45]:
import os 
import time
import random


def read_and_download_soup(soup, url, min_page_len=5_000):
    try:
        text = soup2text(soup)
        if len(text) < min_page_len:
            return False 

        with open(f"pages/{decode_url(url)}.txt", 'w+') as f:
            f.write(text)
    except:
        return False

    return True

# initial urls 
initial_urls = soup2filtered_urls(url2soup(lvl2))

os.makedirs("pages", exist_ok=True)
done = set(os.listdir("pages"))

# download recursively
# kinda scraper 
def downlaod_pages(source_urls, level=0, max_level=3, min_page_len=5_000, wait_time=2):
    random.shuffle(source_urls)
    for url in source_urls:
        already_done = decode_url(url) in done
        print(f"level {level} - {'V' if already_done else '?'} - {decode_url(url)}")
        if decode_url(url) in done:
            continue
    
        if wait_time > 0:
            time.sleep(random.random() * wait_time)

        soup = url2soup(f"{base_url}{url}")
        if read_and_download_soup(soup, url, min_page_len=min_page_len):
            done.add(decode_url(url))    

        if level < max_level:
            downlaod_pages(list(soup2filtered_urls(soup)), level=level+1)


downlaod_pages(list(initial_urls))



level 0 - ? - Broadcasting
level 1 - ? - Internet_Protocol
level 2 - ? - Commutazione_(telecomunicazioni)
level 3 - ? - Telecomunicazioni
level 3 - ? - ISO_OSI
level 3 - ? - Teoria_dei_segnali
level 3 - ? - Commutatore_(telecomunicazioni)
level 3 - ? - Rete_di_telecomunicazioni
level 3 - ? - Telefonia
level 3 - ? - Robustezza_(informatica)
level 3 - ? - Multiplazione
level 3 - ? - Canale_(telecomunicazioni)
level 3 - ? - Livello_di_rete
level 3 - ? - Video
level 3 - ? - Nodo_(informatica)
level 3 - ? - Multiplazione#Multiplazione_statistica
level 3 - ? - Commutazione_di_pacchetto
level 3 - ? - Banda_(informatica)
level 3 - ? - Qualità_di_servizio
level 3 - ? - Cavo_elettrico
level 3 - ? - Architettura_di_rete
level 3 - ? - Rete_di_computer
level 3 - ? - Circuito_virtuale
level 3 - ? - Trasmissione_(telecomunicazioni)
level 3 - ? - Fibra_ottica
level 3 - ? - Instradamento
level 3 - ? - File
level 3 - ? - Pacchetto_(reti)
level 3 - ? - Commutazione_a_circuito
level 3 - ? - Velocità
level