In [None]:
import hashlib
import os
import re
import unicodedata

import pandas as pd
import requests
from bs4 import BeautifulSoup

# General helpers: page cache and slug generation

In [None]:
# Folder names, relative to the working directory.
# Raw downloaded pages are cached here so re-runs do not hit the network again.
CACHE_FOLDER = 'cache'
# Extracted article texts are written here, one .txt file per article.
PLAIN_TEXT_FOLDER = 'plain_text'

In [None]:
def load_content(page, params=None):
  """Return the body of ``page`` as text, backed by an on-disk cache.

  The cache file lives in CACHE_FOLDER and is named after the last path
  segment of ``page``. When ``params`` is given, a digest of the parameters
  is appended so different queries against the same page get separate
  cache entries.

  Args:
    page: URL to fetch.
    params: Optional dict of query parameters, passed on to ``requests.get``.

  Returns:
    The response text. It is read from the cache file when one exists,
    otherwise downloaded and written to the cache first.
  """
  os.makedirs(CACHE_FOLDER, exist_ok=True)

  cache_location = os.path.join(CACHE_FOLDER, page.split('/')[-1])
  if params is not None:
    # The built-in hash() of strings is randomised per interpreter process,
    # so it cannot be used as a cache key that has to survive a kernel
    # restart. Use a digest of a canonical (sorted) representation instead.
    canonical_params = repr(sorted((str(key), str(value)) for key, value in params.items()))
    params_hash = hashlib.sha256(canonical_params.encode('utf-8')).hexdigest()[:16]
    cache_location += "_" + params_hash

  if os.path.exists(cache_location):
    with open(cache_location, encoding='utf-8') as f:
      return f.read()

  # NOTE: the status code is not checked, so error pages are cached and
  # returned like any other response.
  result = requests.get(page, allow_redirects=True, params=params)
  content = result.text
  # Explicit encoding: the platform default is not UTF-8 everywhere.
  with open(cache_location, 'w', encoding='utf-8') as f:
    f.write(content)
  return content

In [None]:
def slugify(value, allow_unicode=False):
    """
    Turn an arbitrary value into a slug.

    Adapted from https://github.com/django/django/blob/master/django/utils/text.py

    The value is converted to a string and Unicode-normalised; unless
    'allow_unicode' is True, everything that cannot be represented in ASCII
    is dropped. The text is lowercased, characters other than alphanumerics,
    underscores, hyphens and whitespace are removed, runs of whitespace and
    hyphens collapse into a single hyphen, and leading/trailing hyphens and
    underscores are stripped.
    """
    text = str(value)
    normal_form = 'NFKC' if allow_unicode else 'NFKD'
    text = unicodedata.normalize(normal_form, text)
    if not allow_unicode:
        # Decomposed accents and other non-ASCII characters are discarded here.
        text = text.encode('ascii', 'ignore').decode('ascii')
    without_specials = re.sub(r'[^\w\s-]', '', text.lower())
    hyphenated = re.sub(r'[-\s]+', '-', without_specials)
    return hyphenated.strip('-_')

# Scrape all regular news

In [None]:
# Site root; the archive pages contain links that are joined onto this prefix.
URL_PREFIX = "https://www.ndr.de"
# Template of one archive page; '{}' is filled with the page number.
URL = URL_PREFIX + "/fernsehen/barrierefreie_angebote/leichte_sprache/leichtesprachearchiv110_page-{}.html"

In [None]:
def get_all_links():
  """Collect the article links from every page of the paginated archive.

  Archive pages are loaded through ``load_content`` starting at page 1.
  On each page the teaser blocks inside the article-list section are
  scanned and the ``href`` of the first link in every teaser is recorded.

  Crawling stops when a page has no article-list section, has no teasers,
  or has exactly two teasers.

  Returns:
    List of ``href`` values in the order they were encountered.
  """
  # NOTE(review): presumably pages past the end of the archive still render
  # exactly two teasers, which is why this count marks the end. Verify
  # against the live site.
  end_of_archive_teaser_count = 2

  all_links = []
  page = 1
  while True:
    current_url = URL.format(page)
    content = load_content(current_url)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(content, 'html.parser')

    article_list = soup.find('section', attrs={'class':"w100 singlecolumnlist "})
    if article_list is None:
      # Without the list section there is nothing to collect.
      print(f"No article list found on {current_url}, stopping.")
      break

    all_articles = article_list.find_all('div',attrs={'class':"teaserpadding"})
    # An empty page can never reach the two-teaser condition, so it has to
    # end the crawl as well; otherwise the loop would request pages forever.
    if not all_articles or len(all_articles) == end_of_archive_teaser_count:
      break

    for article in all_articles:
      link = article.find('a',href=True)
      # Teasers without a link carry no article to scrape.
      if link is not None:
        all_links.append(link['href'])

    print(current_url)

    page += 1

  return all_links

# Crawl the archive and keep the collected hrefs for the following cells.
all_links = get_all_links()

In [None]:
# Build the metadata table: one row per article with its absolute URL and a
# slug derived from the file name of the link (used later as text file name).
full_links = []
titles = []
for relative_link in all_links:
  full_links.append(URL_PREFIX + relative_link)
  file_name = relative_link.split('/')[-1]
  titles.append(slugify(file_name.replace('.html','')))

link_dataframe = pd.DataFrame({'link': full_links, 'title': titles})
link_dataframe.to_csv('meta.csv', index=False)

In [None]:
# Start here (skipping the previous cells) if the links have already been
# scraped: the metadata table is reloaded from the CSV written earlier.
link_dataframe = pd.read_csv('meta.csv')

In [None]:
# Define specific processing: patterns that are rewritten in the article text.

# Date footer, e.g. "Diese Nachricht ist vom 3. März 2021, 10:15 Uhr."
# The month is matched with a Unicode-aware letter class ([^\W\d_]) because
# an ASCII-only [A-Za-z] cannot match "März".
FOOTER_1 = re.compile(r"Diese Nachricht ist vom [0-9]+\. [^\W\d_]+ 20[0-9][0-9], .* Uhr\.")
# Footer sentence pointing to more news.
FOOTER_2 = re.compile(r"Mehr Nachrichten vom .* finden Sie hier\.")
# Whitespace directly after a newline. \s also matches newlines, so blank
# lines are removed along with leading indentation.
INDENTATION = re.compile(r"(?<=\n)\s+")
# A run of three or more dashes ending a line.
LINE = re.compile(r"--+-\n")

# pattern -> replacement; the substitutions are applied in insertion order.
REMOVE = {
  FOOTER_1:'',
  FOOTER_2:'',
  INDENTATION:'',
  LINE:'\n',
}

def process_text(text):
  """Apply every substitution registered in REMOVE to ``text``.

  The patterns are applied one after another in the iteration order of
  REMOVE; the result is returned with surrounding whitespace stripped.
  """
  cleaned = text
  for pattern in REMOVE:
    cleaned = pattern.sub(REMOVE[pattern], cleaned)
  return cleaned.strip()

In [None]:
def to_plain_text(html):
  """Extract the cleaned body text from the HTML of an article page.

  The ``<article class="w100">`` element is located, the elements listed
  below are removed from it, and the remaining text is passed through
  ``process_text``.

  Args:
    html: HTML source of the article page.

  Returns:
    The article text after tag removal and ``process_text``.

  Raises:
    ValueError: If the page contains no ``<article class="w100">`` element.
  """
  # Name the parser explicitly so the result does not depend on which
  # optional parsers happen to be installed.
  soup = BeautifulSoup(html, 'html.parser')

  article = soup.find('article',attrs={'class':"w100"})
  if article is None:
    raise ValueError('no <article class="w100"> element found in the given HTML')

  # (tag name, attribute filter) of everything that is removed from the
  # article before its text is read.
  unwanted = [
    ('header', {}),
    ('div', {'class': "module voll"}),
    ('div', {'class': "module halb"}),
    ('div', {'class': "contentbox voll infobox"}),
    ('div', {'class': "contentbox w100 relatedbroadcast"}),
    ('div', {'class': "meta"}),
    ('script', {}),
    ('div', {'id':"printbox"}),
    ('h2', {}),  # remove titles
  ]

  # Collect all matches first, then remove them from the tree.
  to_remove = []
  for tag_name, attrs in unwanted:
    to_remove.extend(article.find_all(tag_name, attrs=attrs))

  for remove_tag in to_remove:
    remove_tag.decompose()

  plain_text = article.text.strip()

  return process_text(plain_text)

In [None]:
def load_and_save_plain_text(link, name=None):
  """Download one article, convert it to plain text and store it on disk.

  The page is loaded through ``load_content``, converted with
  ``to_plain_text`` and written to ``PLAIN_TEXT_FOLDER/<name>.txt``.

  Args:
    link: URL of the article page.
    name: Base name of the output file. Defaults to the last path segment
      of ``link``. Any '/' in the name is replaced by '_'.

  Returns:
    Dict of additional metadata for the article; currently always
    ``{'genre': 'news'}``.
  """
  content = load_content(link)

  if name is None:
    name = link.split('/')[-1]

  name = name.replace('/','_') + '.txt'

  #process article
  plain_text = to_plain_text(content)

  os.makedirs(PLAIN_TEXT_FOLDER, exist_ok=True)

  #save plain text
  # Explicit encoding: the platform default is not UTF-8 everywhere, and the
  # articles contain non-ASCII characters.
  with open(os.path.join(PLAIN_TEXT_FOLDER, name), 'w', encoding='utf-8') as f:
    f.write(plain_text)

  return {'genre': 'news'}

In [None]:
# Download and convert every article listed in the metadata table.
total_length = len(link_dataframe.index)
# iterrows() yields (index label, row). A separate 1-based counter drives
# the progress message so that it ends at total_length/total_length.
for position, (index, row) in enumerate(link_dataframe.iterrows(), start=1):
  link = row['link']
  title = row['title']

  info = load_and_save_plain_text(link, name=title)

  #add additional information to meta file
  for key, value in info.items():
    link_dataframe.at[index,key] = value

  print(f"Loaded ({position}/{total_length})")

# Save scraped data

In [None]:
!zip -r /content/corpus.zip /content/plain_text -j
link_dataframe.to_csv('meta.csv', index=False)