In [None]:
import hashlib
import os
import re
import unicodedata
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# General stuff

In [None]:
# directory in which load_content stores the downloaded pages
CACHE_FOLDER = "cache"
# directory in which the extracted article texts are written
PLAIN_TEXT_FOLDER = "plain_text"

In [None]:
def load_content(page, params=None):
  """Return the body of ``page`` as text, downloading it at most once.

  The response text is stored in a file under ``CACHE_FOLDER``; later calls
  with the same ``page`` and ``params`` read that file instead of issuing a
  new request.

  Args:
    page: URL to fetch.
    params: optional dict of query parameters handed to ``requests.get``.

  Returns:
    The response text, taken from the cache file when one exists.
  """
  os.makedirs(CACHE_FOLDER, exist_ok=True)

  # the cache file is named after the last path segment of the URL
  cache_location = os.path.join(CACHE_FOLDER, page.split('/')[-1])
  if params is not None:
    # The built-in hash() of strings is salted per interpreter process, so it
    # produced a different file name after every kernel restart and the cache
    # was never hit again. A digest of the sorted items is stable.
    canonical = repr(sorted((str(key), str(value)) for key, value in params.items()))
    params_hash = hashlib.sha1(canonical.encode('utf-8')).hexdigest()
    cache_location += "_" + params_hash

  if os.path.exists(cache_location):
    with open(cache_location, encoding='utf-8') as f:
      return f.read()

  result = requests.get(page, allow_redirects=True, params=params)
  content = result.text
  # only successful responses are persisted, so a temporary server error is
  # not replayed from the cache forever; the text is still returned as before
  if result.ok:
    with open(cache_location, 'w', encoding='utf-8') as f:
      f.write(content)
  return content

In [None]:
def slugify(value, allow_unicode=False):
    """
    Turn an arbitrary value into a lowercase slug.

    Adapted from https://github.com/django/django/blob/master/django/utils/text.py

    The value is converted to ``str`` and Unicode-normalised: NFKC when
    'allow_unicode' is True, otherwise NFKD followed by dropping every
    non-ASCII character. After lowercasing, characters that are not
    alphanumerics, underscores, whitespace or hyphens are removed, each run
    of whitespace/hyphens becomes a single hyphen, and leading/trailing
    hyphens and underscores are stripped.
    """
    text = str(value)
    if not allow_unicode:
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)

    lowered = text.lower()
    without_specials = re.sub(r'[^\w\s-]', '', lowered)
    hyphenated = re.sub(r'[-\s]+', '-', without_specials)
    return hyphenated.strip('-_')

# Scrape all news

In [None]:
# site root that the hrefs collected from the search results are combined with
URL_PREFIX = "https://www.lebenshilfe.de"
# search-results URL (filter: languagemarker_stringS:easylanguage, query: *);
# the {} placeholder receives the page number
URL = "https://www.lebenshilfe.de/suche?tx_solr%5Bfilter%5D%5B0%5D=languagemarker_stringS%3Aeasylanguage&tx_solr%5Bpage%5D={}&tx_solr%5Bq%5D=%2A"

In [None]:
def get_all_links():
  """Collect the href of every article listed in the search results.

  Requests ``URL`` formatted with page numbers 1, 2, 3, ... and stops at the
  first page that has no result container or no ``section.list__item``
  entries.

  Returns:
    list of href strings in the order they appear on the result pages.
  """
  all_links = []

  page = 1
  while True:
    content = load_content(URL.format(page))
    # NOTE(review): no parser is named, so bs4 picks one from what is
    # installed; consider pinning it for reproducible parsing
    soup = BeautifulSoup(content)

    article_list = soup.find('div', attrs={'class':"tx_solr"})
    if article_list is None:
      # a page without the result container used to crash with an
      # AttributeError; treat it like an empty result page instead
      break

    all_articles = article_list.find_all('section', attrs={'class':"list__item"})
    if len(all_articles) == 0:
      break

    for article in all_articles:
      link = article.find('a', href=True)
      # skip list items that carry no anchor instead of failing on None
      if link is not None:
        all_links.append(link['href'])

    page += 1

  return all_links

# run the crawl; the collected hrefs are turned into full links and written to meta.csv below
all_links = get_all_links()

In [None]:
# Resolve every href against the site root. Unlike plain string concatenation,
# urljoin leaves hrefs that are already absolute untouched and inserts the
# missing slash for hrefs that do not start with one.
full_links = [urljoin(URL_PREFIX, sub) for sub in all_links]

# one row per article link; further columns are added while the articles are loaded
link_dataframe = pd.DataFrame(data={'link':full_links})
link_dataframe.to_csv('meta.csv', index=False)

In [None]:
# skip the previous cells if you already scraped the links
link_dataframe = pd.read_csv('meta.csv')

In [None]:
# Clean-up rules applied to the extracted article text

# whitespace run directly after a line break (\s also matches further line breaks)
INDENTATION = re.compile(r"(?<=\n)\s+")
# the rest of a line starting at the "Möchtest Du Dir ein Fremd·wort wünschen?" prompt
LINE = re.compile(r"Möchtest Du Dir ein Fremd·wort wünschen\? .*@einfachstars\.info.*")
# a line break (with optional whitespace before it) that follows two line breaks
DUPLICATED_NEW_LINES = re.compile(r"(?<=\n\n)\s*\n")

# pattern -> replacement; the substitutions run in insertion order
REMOVE = dict([
  (INDENTATION, ''),
  (LINE, '\n'),
  (DUPLICATED_NEW_LINES, ''),
])

def process_text(text):
  """Apply every substitution in REMOVE to ``text`` and strip the result."""
  cleaned = text
  for pattern in REMOVE:
    cleaned = pattern.sub(REMOVE[pattern], cleaned)
  return cleaned.strip()

In [None]:
def to_plain_text(html):
  """Extract the running text and the title from an article page.

  Args:
    html: markup of one article page.

  Returns:
    ``(plain_text, title)``. ``(None, None)`` is returned (and a message is
    printed) when the page has no ``article.article`` element or lacks the
    heading the title is read from.
  """
  # NOTE(review): no parser is named, so bs4 picks one from what is
  # installed; consider pinning it for reproducible parsing
  soup = BeautifulSoup(html)
  article = soup.find('article',attrs={'class':"article"})
  to_remove = []

  # Explicit checks replace the former bare "except:", which also reported
  # unrelated errors (and Ctrl-C) as "Not a valid article".
  heading = article.find("h1") if article is not None else None
  if heading is None:
    print("Not a valid article")
    return None, None

  title = heading.get_text().strip()
  if title == "Wörterbuch":
    # pages headed "Wörterbuch" take their title from the first h2 instead,
    # and their intro row is dropped from the text
    sub_heading = article.find("h2")
    if sub_heading is None:
      print("Not a valid article")
      return None, None
    title = sub_heading.get_text().strip()
    to_remove.extend(article.find_all("div", {"class" : "row textrow textrow--intro mb-5"}))

  # headings, widgets, link anchors and figures are excluded from the text
  to_remove.extend(article.find_all('h1'))
  to_remove.extend(article.find_all('h2'))
  to_remove.extend(article.find_all("div", {"class" : re.compile('.*widget.*')}))
  to_remove.extend(article.find_all("a", {"class" : re.compile('.*link.*')}))
  to_remove.extend(article.find_all('figure'))

  for remove_tag in to_remove:
    remove_tag.decompose()

  regex = re.compile('.*text.*')
  parts = []
  for article_part in article.find_all("div", {"class" : regex}):
    # when a matching div contains another matching div, only the inner one
    # contributes, so its text is not added twice
    if article_part.find("div", {"class" : regex}) is not None:
      continue
    parts.append(" " + article_part.get_text())

  plain_text = process_text("".join(parts))

  return plain_text, title

In [None]:
def load_and_save_plain_text(link, name=None):
  """Fetch one article, convert it to plain text and write it to disk.

  Args:
    link: URL of the article page.
    name: optional base name (without extension) for the output file.
      Defaults to the slugified article title.

  Returns:
    ``{'genre': 'news', 'title': <slugified title>}`` on success, or an
    empty dict when the page could not be parsed as an article.
  """
  content = load_content(link)

  #process article
  plain_text, title = to_plain_text(content)

  if plain_text is None or title is None:
    return {}

  title = slugify(title)

  # A caller-supplied name used to be silently overwritten by the title slug;
  # it is honoured now, while the default file name stays the title slug.
  # NOTE(review): articles whose titles slugify identically still share one
  # output file and overwrite each other.
  if name is None:
    name = title
  else:
    name = name.replace('/', '_')
  name += '.txt'

  os.makedirs(PLAIN_TEXT_FOLDER, exist_ok=True)

  #save plain text; explicit encoding so non-ASCII text does not depend on the platform default
  with open(os.path.join(PLAIN_TEXT_FOLDER, name), 'w', encoding='utf-8') as f:
    f.write(plain_text)

  return {'genre': 'news', 'title':title}

In [None]:
# load all links

total_length = len(link_dataframe.index)
# iterate over the link column only; position is a 1-based counter so the
# progress message runs from 1/N to N/N instead of 0/N to (N-1)/N
for position, (index, link) in enumerate(link_dataframe['link'].items(), start=1):
  info = load_and_save_plain_text(link)

  #add additional information to meta file
  for key, value in info.items():
    link_dataframe.at[index,key] = value

  print(f"Loaded ({position}/{total_length})")

# Save scraped data

In [None]:
# bundle the plain-text files into one archive; -j stores them without their directory path
# NOTE(review): the /content paths are absolute (presumably a Colab working directory) — confirm they match PLAIN_TEXT_FOLDER
!zip -r /content/corpus.zip /content/plain_text -j
# write meta.csv again, now including the columns added while the articles were loaded
link_dataframe.to_csv('meta.csv', index=False)