In [None]:
import hashlib
import json
import os
import re
import unicodedata
import urllib
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# General stuff

In [None]:
# Directory in which load_content stores the raw HTTP response bodies.
CACHE_FOLDER = "cache"
# Directory that receives one plain-text (.txt) file per scraped article.
PLAIN_TEXT_FOLDER = "plain_text"

In [None]:
def load_content(page, params=None):
  """Return the response body of ``page``, using an on-disk cache in CACHE_FOLDER.

  Args:
    page: URL that is fetched with an HTTP GET request.
    params: Optional dict of query parameters handed to ``requests.get``.

  Returns:
    The response text, read from the cache file if one exists, otherwise
    downloaded and written to the cache first.

  Raises:
    requests.HTTPError: If the server answers with an error status. Nothing
      is cached in that case, so a later run retries the download.
  """
  os.makedirs(CACHE_FOLDER, exist_ok=True)

  # Cache file name = last path segment of the URL plus its query string.
  # Slashes inside the query (e.g. an unescaped "/" in a page title) are
  # replaced so that two different URLs cannot end up in the same cache file.
  # For URLs without "/" in the query this yields the same name as before.
  base, question_mark, query = page.partition('?')
  cache_name = base.split('/')[-1] + question_mark + query.replace('/', '_')
  cache_location = os.path.join(CACHE_FOLDER, cache_name)

  if params is not None:
    # The built-in hash() of strings is salted per interpreter run, so it
    # cannot serve as a persistent cache key; use a stable digest instead.
    canonical_params = json.dumps(sorted(params.items()), default=str)
    params_hash = hashlib.md5(canonical_params.encode('utf-8')).hexdigest()
    cache_location += "_" + params_hash

  if os.path.exists(cache_location):
    with open(cache_location, encoding='utf-8') as f:
      return f.read()

  result = requests.get(page, allow_redirects=True, params=params)
  # Fail loudly instead of caching an error page forever.
  result.raise_for_status()
  content = result.text
  with open(cache_location, 'w', encoding='utf-8') as f:
    f.write(content)
  return content

In [None]:
def slugify(value, allow_unicode=False):
    """
    Build a slug from ``value`` (adapted from Django's
    https://github.com/django/django/blob/master/django/utils/text.py).

    The value is converted to a string and Unicode-normalized; unless
    'allow_unicode' is True, every non-ASCII character is dropped. The text
    is lowercased, characters other than alphanumerics, underscores, hyphens
    and whitespace are removed, each run of whitespace/hyphens becomes a
    single hyphen, and leading/trailing hyphens and underscores are stripped.
    """
    text = str(value)
    if not allow_unicode:
        # Decompose accented characters, then discard whatever is not ASCII.
        ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        text = ascii_bytes.decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)

    lowered = text.lower()
    without_symbols = re.sub(r'[^\w\s-]', '', lowered)
    hyphenated = re.sub(r'[-\s]+', '-', without_symbols)
    return hyphenated.strip('-_')

# Scrape all Hurraki data

In [None]:
# API query that lists pages (list=allpages) as JSON: at most 500 per request
# (aplimit), redirects filtered out (apfilterredir), starting at the title
# substituted for the "apfrom" placeholder.
URL_ALL_PAGES = "https://hurraki.de/w/api.php?action=query&format=json&list=allpages&aplimit=500&apfilterredir=nonredirects&apfrom={}"
# API request that returns the categories and rendered text of the page whose
# (URL-encoded) title is substituted for the "page" placeholder, as JSON.
URL_GET_PAGE = "https://hurraki.de/w/api.php?action=parse&prop=categories|text&format=json&page={}"

In [None]:
def get_all_titles():
  """Collect the titles of all listed wiki pages, batch by batch.

  Requests URL_ALL_PAGES repeatedly (through the cached load_content),
  following the "apcontinue" token of each response until the API stops
  returning one. Titles starting with "Hurraki:" are skipped.

  Returns:
    List of page titles in the order delivered by the API.
  """
  # NOTE(review): starting at "A" presumably skips titles that sort before
  # "A" (e.g. titles beginning with a digit) - confirm this is intended.
  apfrom = "A"
  all_pages = []

  #iterate over all pages (max. 500 entries per page)
  while apfrom is not None:

    # The continuation token is a page title and may contain characters such
    # as "&", "+" or "#", so it has to be percent-encoded before it is
    # placed into the query string.
    url = URL_ALL_PAGES.format(urllib.parse.quote(apfrom, safe=''))
    data = json.loads(load_content(url))

    all_pages.extend(
      page["title"]
      for page in data["query"]["allpages"]
      if not page["title"].startswith("Hurraki:")
    )

    # No "continue" entry means the last batch has been reached.
    apfrom = data.get("continue", {}).get("apcontinue")

  return all_pages

# Titles of all articles to scrape (see get_all_titles).
all_titles = get_all_titles()

In [None]:
# One API URL per article; the title is percent-encoded before it is inserted.
full_links = list(map(URL_GET_PAGE.format, map(urllib.parse.quote, all_titles)))

# File-system friendly version of every title.
slugified_titles = list(map(slugify, all_titles))

# Persist the slug -> link mapping so the scraping can be resumed later.
link_dataframe = pd.DataFrame({'title': slugified_titles, 'link': full_links})
link_dataframe.to_csv('meta.csv', index=False)

In [None]:
# Skip the previous cells if you have already scraped the links.
# keep_default_na=False stops pandas from turning its default NA markers
# (e.g. "NA", "null" or an empty field) into NaN; such fields stay strings.
link_dataframe = pd.read_csv('meta.csv', keep_default_na=False)

In [None]:
#Define specific processing

# Caption placeholder: "Auf dem Bild ist ", two or more underscores, " zu sehen".
IMAGE_CAPTION = re.compile(r"Auf dem Bild ist __*_ zu sehen")
# Whitespace up to and including a newline that directly follows an empty line.
DUPLICATED_NEW_LINES = re.compile(r"(?<=\n\n)\s*\n")
# A "-" list marker at the start of a line, including the blanks around it.
UNIFORM_BULLET_POINTS = re.compile(r"(?<=\n)\s*- *")

# Maps each pattern to its replacement; applied in the order listed here.
REMOVE = {
  IMAGE_CAPTION: '',
  DUPLICATED_NEW_LINES: '',
  UNIFORM_BULLET_POINTS: '• ',
}

def process_text(text):
  """Apply every substitution of REMOVE to ``text`` and strip the result."""
  cleaned = text
  for pattern in REMOVE:
    cleaned = pattern.sub(REMOVE[pattern], cleaned)
  return cleaned.strip()

In [None]:
def to_plain_text(html):
  """Reduce the rendered HTML of an article to cleaned plain text.

  The HTML is cut at every section headline. Only the part before the first
  headline (the short description) and, if present, the section headed
  "Genaue Erklärung" are kept. List items get a leading bullet, then divs,
  tables, headlines and edit-section links are removed, the remaining text
  is extracted and finally cleaned with process_text.

  Args:
    html: Rendered article HTML as a string.

  Returns:
    The cleaned plain text of the kept sections.
  """
  HEADLINE_MARKER = "<span class=\"mw-headline\""
  LONG_DESCRIPTION_SEPARATOR = """<span class="mw-headline" id="Genaue_Erkl.C3.A4rung">Genaue Erklärung</span>"""

  #split into sections; every part after the first lost its marker in the
  #split, so put it back
  head, *tail = html.split(HEADLINE_MARKER)
  sections = [HEADLINE_MARKER + part for part in tail]

  short_description_html = head
  long_description_html = next(
    (section for section in sections if LONG_DESCRIPTION_SEPARATOR in section),
    "",
  )

  html = short_description_html + "\n" + long_description_html
  # Name the parser explicitly: otherwise bs4 uses whichever parser happens to
  # be installed (and warns about it), so results can differ between machines.
  soup = BeautifulSoup(html, "html.parser")

  #format lists to recognize them with regex
  for ul_list in soup.find_all('ul') + soup.find_all('ol'):

    for item in ul_list.find_all('li'):
      item_text = item.text.strip()
      if item_text != "":
        item.string = "• " + item_text

  #collect everything that must not end up in the text:
  #divs, tables, edit links and headlines
  to_remove = []
  for tag_name in ('div', 'table'):
    to_remove.extend(soup.find_all(tag_name))
  for css_class in ("mw-editsection", "mw-headline"):
    to_remove.extend(soup.find_all('span', attrs={"class": css_class}))
  for tag_name in ('h2', 'h3'):
    to_remove.extend(soup.find_all(tag_name))

  for remove_tag in to_remove:
    remove_tag.decompose()

  plain_text = soup.get_text().strip()

  return process_text(plain_text)

In [None]:
def load_and_save_plain_text(link, name=None):
  """Fetch one article, store its plain text and return its metadata.

  Args:
    link: API URL whose JSON response holds the rendered article under
      ['parse']['text']['*'] and its categories under ['parse']['categories'].
    name: Base name of the output file (without extension). Defaults to the
      part of ``link`` after the last "/". Slashes are replaced by "_".

  Returns:
    A dict with the single key 'genre': the article's categories joined
    with "|".

  Raises:
    KeyError: If the response contains no 'parse' entry.
  """
  result = json.loads(load_content(link))
  if 'parse' not in result:
    # Same exception type as the plain lookup would raise, but the message
    # names the offending link and whatever error the API reported.
    raise KeyError(f"no 'parse' entry in response for {link}: {result.get('error')}")

  parsed = result['parse']
  content = parsed['text']['*']
  category = "|".join(item['*'] for item in parsed['categories'])

  if name is None:
    name = link.split('/')[-1]

  name = name.replace('/','_') + '.txt'

  #process article
  plain_text = to_plain_text(content)

  os.makedirs(PLAIN_TEXT_FOLDER, exist_ok=True)

  #save plain text; explicit UTF-8 so umlauts and the bullet character are
  #written identically on every platform
  with open(os.path.join(PLAIN_TEXT_FOLDER, name), 'w', encoding='utf-8') as f:
    f.write(plain_text)

  return {'genre': category}

In [None]:
#load and parse all pages

# Pages that are not scraped.
# NOTE(review): the reason for excluding exactly these pages is not visible
# here - presumably they have no regular article body; confirm.
exclude_pages = ['Übung: Demoversion',
                 'Übung: jaki',
                 'Übung: jan poka',
                 'Übung: kala 🐟',
                 'Übung: kili 🍎',
                 'Übung: kulupu tomo',
                 'Übung: ma Apika',
                 'Übung: ma Elopa',
                 'Übung: ma Italija',
                 'Übung: pali',
                 'Übung: toki mama',
                 'Ubung: kon tawa 🌬️',
                 'Grundrechenart',
                 'Mom',
                 'Jaki',
                 'Konsonant',
                 'Alice Salomon',
                 'Unterstützte Kommunikation']

# The 'title' column holds slugified titles, so compare against slugs.
exclude_pages = [slugify(item) for item in exclude_pages]

total_length = len(link_dataframe.index)
# Count positions from 1 so the progress output ends at N/N instead of
# relying on the (0-based) DataFrame label.
for position, (index, row) in enumerate(link_dataframe.iterrows(), start=1):
  link = row['link']
  title = row['title']

  if title in exclude_pages:
    continue

  info = load_and_save_plain_text(link, name=title)

  #add additional information to meta file
  for key, value in info.items():
    link_dataframe.at[index,key] = value

  print(f"Loaded ({position}/{total_length})")

# Save scraped data

In [None]:
!zip -r /content/corpus.zip /content/plain_text -j
link_dataframe.to_csv('meta.csv', index=False)