In [None]:
import hashlib
import json
import os
import re
import unicodedata

import pandas as pd
import requests
from bs4 import BeautifulSoup

# General setup and helper functions

In [None]:
# Directory in which load_content() stores the fetched responses.
CACHE_FOLDER = "cache"
# Directory into which the extracted plain-text files are written.
PLAIN_TEXT_FOLDER = "plain_text"

In [None]:
def load_content(page, params=None):
  """Return the body of ``page`` as text, using an on-disk cache.

  The response is stored in ``CACHE_FOLDER`` under the last path segment of
  ``page``. When ``params`` is given, a digest of the parameters is appended
  to the file name so that different queries get different cache files.
  Later calls with the same arguments read the file instead of using the
  network.

  Args:
    page: URL to fetch.
    params: Optional dict of query parameters passed on to ``requests.get``.

  Returns:
    The response body as a string.

  Raises:
    requests.exceptions.HTTPError: If the server answers with an error
      status. Nothing is written to the cache in that case.
  """
  os.makedirs(CACHE_FOLDER, exist_ok=True)

  cache_location = os.path.join(CACHE_FOLDER, page.split('/')[-1])
  if params is not None:
    # The built-in hash() of strings is salted per interpreter process, so it
    # yields a different file name after every kernel restart and the cache
    # would never be hit. A content digest is stable across runs.
    canonical_params = repr(sorted((str(key), str(value)) for key, value in params.items()))
    params_digest = hashlib.sha256(canonical_params.encode('utf-8')).hexdigest()[:16]
    cache_location += "_" + params_digest

  if os.path.exists(cache_location):
    with open(cache_location, encoding='utf-8') as f:
      return f.read()

  result = requests.get(page, allow_redirects=True, params=params)
  # Never cache an error page: it would be served from disk on every later run.
  result.raise_for_status()
  content = result.text
  # Explicit UTF-8 so the cache does not depend on the platform default encoding.
  with open(cache_location, 'w', encoding='utf-8') as f:
    f.write(content)
  return content

In [None]:
def slugify(value, allow_unicode=False):
    """
    Turn ``value`` into a slug (adapted from Django's ``django.utils.text``).

    The input is converted to ``str`` and normalised: to NFKC when
    ``allow_unicode`` is true, otherwise to NFKD with every non-ASCII
    character dropped. The text is then lowercased, characters other than
    word characters, whitespace and hyphens are removed, every run of
    whitespace/hyphens becomes a single hyphen, and leading/trailing hyphens
    and underscores are stripped.
    """
    text = str(value)
    if not allow_unicode:
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)

    without_symbols = re.sub(r'[^\w\s-]', '', text.lower())
    hyphenated = re.sub(r'[-\s]+', '-', without_symbols)
    return hyphenated.strip('-_')

# Scrape all regular news

In [None]:
# Number of items requested from the article listing per call.
ITEMS_PER_LOAD = 100 #Probably 100 is maximum
# Genre identifiers; each one is substituted into the sophoraId part of URL.
GENRES = ["nachrichten","kultur-index","sport","vermischtes"]
# Template of the paginated article listing. The three {} placeholders are filled,
# in order, with: current item offset, items per load, genre.
# NOTE(review): the name URL is reassigned further down for the dictionary page,
# so get_all_titles() only works while this value is still in place.
URL = "https://www.nachrichtenleicht.de/api/partials/PaginatedArticles_NL?drsearch%3AcurrentItems={}&drsearch%3AitemsPerLoad={}&drsearch%3ApartialProps=%7B%22sophoraId%22%3A%22nachrichtenleicht-{}-100%22%7D&drsearch%3A_ajax=1"

In [None]:
def get_all_titles():
  """Collect the links of all listed articles for every genre in ``GENRES``.

  For each genre the paginated listing built from ``URL`` is requested
  repeatedly, with the offset advanced by the number of articles seen so far,
  until a response contains no ``<article>`` element.

  Returns:
    A tuple ``(all_links, all_genres)`` of two equally long lists: the
    ``href`` of the first link inside each article, and the genre that
    article was listed under.
  """
  all_links = []
  all_genres = []
  for genre in GENRES:
    current_articles = 0
    # Load new pages as long as there are any.
    while True:
      current_url = URL.format(current_articles, ITEMS_PER_LOAD, genre)
      result = requests.get(current_url, allow_redirects=True)
      # Name the parser explicitly: otherwise bs4 picks whichever parser is
      # installed (and warns), so results can differ between machines.
      soup = BeautifulSoup(result.content, 'html.parser')

      articles = soup.find_all('article')
      if not articles:
        # Stop iterating once there are no more articles.
        break

      for article in articles:
        # Every article advances the offset, even one without a usable link,
        # so the next request does not return the same items again.
        current_articles += 1
        anchor = article.find('a', href=True)
        if anchor is None:
          continue
        all_links.append(anchor['href'])
        all_genres.append(genre)

      print(current_url)

  return all_links, all_genres

# Run the listing scrape (network access); the two lists are used to build the meta table.
all_links, all_genres = get_all_titles()

In [None]:
# A title is the slug of the last URL segment with '.html' removed.
titles = [
  slugify(href.rsplit('/', 1)[-1].replace('.html', ''))
  for href in all_links
]

# Store the link table on disk so the scraping cells can be skipped next time.
meta_columns = {'link': all_links, 'title': titles, 'genre': all_genres}
link_dataframe = pd.DataFrame(data=meta_columns)
link_dataframe.to_csv('meta.csv', index=False)

In [None]:
# Skip the previous cells if you already scraped the links.
# dtype=str / keep_default_na=False: rows that were saved with an empty link
# (the dictionary entries) must come back as "" rather than NaN, and titles
# such as "null" or "nan" must stay text instead of being parsed as missing.
link_dataframe = pd.read_csv('meta.csv', dtype=str, keep_default_na=False)

In [None]:
def to_plain_text(html):
  """Extract the readable text of an article page.

  All ``<figure>`` elements are removed first. The result is the text of the
  ``p.article-header-description`` element followed by the text of the
  ``section.b-article-details`` element.

  Args:
    html: Markup of the article page.

  Returns:
    The stripped header description and the stripped article details, joined
    by a single newline. If only one of the two elements exists, just its
    text is returned.

  Raises:
    ValueError: If the page contains neither of the two elements.
  """
  # Name the parser explicitly: otherwise bs4 picks whichever parser is
  # installed (and warns), so results can differ between machines.
  soup = BeautifulSoup(html, 'html.parser')

  for figure in soup.find_all('figure'):
    figure.decompose()

  sections = [
    soup.find('p', attrs={'class': "article-header-description"}),
    soup.find('section', attrs={'class': "b-article-details"}),
  ]
  parts = [section.get_text().strip() for section in sections if section is not None]
  if not parts:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError("page contains neither a header description nor article details")

  return "\n".join(parts)

In [None]:
def load_and_save_plain_text(link, name=None):
  """Fetch an article, extract its plain text and store it as a ``.txt`` file.

  Args:
    link: URL of the article; passed to ``load_content``.
    name: Base name of the output file. Defaults to the last path segment of
      ``link``. Every ``/`` is replaced by ``_`` and ``.txt`` is appended.

  Returns:
    A dict of extra per-article information; currently always empty.
  """
  content = load_content(link)

  if name is None:
    name = link.split('/')[-1]
  file_name = name.replace('/', '_') + '.txt'

  # Process the article.
  plain_text = to_plain_text(content)

  os.makedirs(PLAIN_TEXT_FOLDER, exist_ok=True)

  # Save the plain text. UTF-8 is set explicitly because the platform default
  # encoding may be unable to represent every character of the text.
  with open(os.path.join(PLAIN_TEXT_FOLDER, file_name), 'w', encoding='utf-8') as f:
    f.write(plain_text)

  return {}

In [None]:
# Load all links.

total_length = len(link_dataframe.index)
for position, (index, row) in enumerate(link_dataframe.iterrows(), start=1):
  link = row['link']
  title = row['title']

  # Rows without a link (the dictionary entries) have nothing to download.
  # An empty link read back from meta.csv can arrive as NaN instead of "",
  # so the type is checked as well.
  if not isinstance(link, str) or link == "":
    continue

  info = load_and_save_plain_text(link, name=title)

  # Add additional information to the meta table.
  for key, value in info.items():
    link_dataframe.at[index, key] = value

  # Count from 1 so the last row reports total/total.
  print(f"Loaded ({position}/{total_length})")

# Scrape "nachrichtenleicht Wörterbuch" (Dictionary)

In [None]:
# Address of the dictionary ("Wörterbuch") page.
# NOTE(review): this reuses the name URL and replaces the article-listing template defined earlier.
URL = "https://www.nachrichtenleicht.de/woerterbuch"

In [None]:
def get_all_dictionary_pages():
  """Return the link targets of the alphabet navigation on the dictionary page.

  Fetches ``URL`` and reads the ``href`` of every link inside the alphabet
  ``<ul>`` element.

  Returns:
    A list of ``href`` strings, one per link found; empty if the alphabet
    list is not present in the page.
  """
  result = requests.get(URL, allow_redirects=True)
  # Name the parser explicitly: otherwise bs4 picks whichever parser is
  # installed (and warns), so results can differ between machines.
  soup = BeautifulSoup(result.content, 'html.parser')

  all_letters = soup.find('ul', attrs={"class": "b-list b-alphabet-links u-space-bottom-xxxl"})
  if all_letters is None:
    # Same tolerance as the per-page processing, which skips pages without a word list.
    return []

  return [letter['href'] for letter in all_letters.find_all('a', href=True)]

# Fetch the list of dictionary page links once (network access).
all_dictionary_pages = get_all_dictionary_pages()

In [None]:
# Load all dictionary pages and process them.

# The output folder may not exist yet if the article cells were skipped.
os.makedirs(PLAIN_TEXT_FOLDER, exist_ok=True)

dictionary_records = []
for page in all_dictionary_pages:
  # NOTE(review): plain concatenation assumes every href is meant to be appended to URL - confirm.
  content = load_content(URL + page)

  soup = BeautifulSoup(content, 'html.parser')
  dictionary = soup.find('ul', attrs={"class": "b-list b-list-teaser-word"})
  if dictionary is None:
    continue
  dictionary_items = dictionary.find_all('div', attrs={"class": "b-teaser-word"})

  for item in dictionary_items:
    title_tag = item.find('h3', attrs={"class": "teaser-word-title"})
    description_tag = item.find('p', attrs={"class": "teaser-word-description"})
    if title_tag is None or description_tag is None:
      # Incomplete entry: there is nothing sensible to store.
      continue
    title = slugify(title_tag.get_text().strip())
    description = description_tag.get_text().strip()

    with open(os.path.join(PLAIN_TEXT_FOLDER, "dict-" + title + ".txt"), 'w', encoding='utf-8') as f:
      f.write(description)

    dictionary_records.append({'title': title, 'link': '', 'genre': 'dictionary'})

# DataFrame.append was removed in pandas 2.0; add all new rows with a single concat.
if dictionary_records:
  link_dataframe = pd.concat(
    [link_dataframe, pd.DataFrame(dictionary_records)],
    ignore_index=True,
  )

# Save scraped data

In [None]:
# Bundle the plain-text files into one archive; -j stores the files without their directory path.
# NOTE(review): the absolute /content paths assume the notebook runs with /content as working directory (as on Google Colab) - confirm.
!zip -r /content/corpus.zip /content/plain_text -j
# Write the meta table again so that it also contains the dictionary rows added above.
link_dataframe.to_csv('meta.csv', index=False)