In [None]:
import hashlib
import os
import re
import unicodedata

import pandas as pd
import requests
from bs4 import BeautifulSoup

# General setup and helper functions

In [None]:
# Directory in which load_content() stores the raw downloaded pages
CACHE_FOLDER = "cache"
# Directory in which load_and_save_plain_text() writes the extracted article texts
PLAIN_TEXT_FOLDER = "plain_text"

In [None]:
def load_content(page, params=None):
  """
  Return the body of `page` as text, backed by an on-disk cache in CACHE_FOLDER.

  The cache file is named after the last path segment of `page` (including
  any query string). When `params` is given, a digest of the parameters is
  appended so that different parameter sets get separate cache entries.

  Args:
    page: URL to download.
    params: optional dict of query parameters, passed on to requests.get().

  Returns:
    The response text, either freshly downloaded or read back from the cache.
  """
  os.makedirs(CACHE_FOLDER, exist_ok=True)

  # After splitting on '/' the last segment cannot contain a slash any more,
  # so no further escaping of slashes is needed here.
  cache_location = os.path.join(CACHE_FOLDER, page.split('/')[-1])
  if params is not None:
    # The built-in hash() of strings is salted per interpreter process, so it
    # yields a different file name after every kernel restart and the cache
    # would never be hit. A digest of the sorted items is stable across runs
    # and also works for unhashable values such as lists.
    canonical_params = repr(sorted(params.items(), key=lambda item: str(item[0])))
    params_hash = hashlib.sha1(canonical_params.encode('utf-8')).hexdigest()
    cache_location += "_" + params_hash

  if os.path.exists(cache_location):
    with open(cache_location, encoding='utf-8') as f:
      return f.read()

  result = requests.get(page, allow_redirects=True, params=params)
  content = result.text
  # Explicit UTF-8 so that non-ASCII page content is written and read back
  # identically, independent of the platform's default encoding.
  with open(cache_location, 'w', encoding='utf-8') as f:
    f.write(content)
  return content

In [None]:
def slugify(value, allow_unicode=False):
    """
    Turn an arbitrary value into a slug (adapted from Django:
    https://github.com/django/django/blob/master/django/utils/text.py).

    The value is converted to a string and Unicode-normalised; unless
    'allow_unicode' is set, everything that cannot be expressed in ASCII is
    dropped. The text is lower-cased, characters other than alphanumerics,
    underscores, hyphens and whitespace are removed, runs of whitespace and
    hyphens collapse into a single hyphen, and leading/trailing hyphens and
    underscores are stripped.
    """
    text = str(value)
    if not allow_unicode:
        # NFKD splits accented letters into base letter + combining mark, so
        # the ASCII round-trip keeps the base letter and drops the mark.
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)
    without_symbols = re.sub(r'[^\w\s-]', '', text.lower())
    hyphenated = re.sub(r'[-\s]+', '-', without_symbols)
    return hyphenated.strip('-_')

# Scrape all regular news

In [None]:
# Site root; prepended to the article hrefs collected from the listing pages
URL_PREFIX = "https://einfachstars.info"
# Listing-page template; "{}" is filled with the page number (counting starts at 1)
URL = "https://einfachstars.info/blog/index.html?page={}"

In [None]:
def get_all_links():
  """
  Collect the article links from the paginated blog listing.

  Listing pages are fetched one after another (via load_content, so they are
  cached), starting at page 1. Crawling stops at the first page that contains
  no articles, or that contributes no link that has not been seen before.

  Returns:
    List of href values in listing order, exactly as they appear in the pages.
  """

  all_links = []
  seen_links = set()

  page = 1
  while True:

    current_url = URL.format(page)
    content = load_content(current_url)
    soup = BeautifulSoup(content)

    all_articles = soup.find_all('article',attrs={'class':"post"})

    if len(all_articles) == 0:
      break

    page_links = []
    for article in all_articles:
      # find() returns None when the element is missing; skip such entries
      # instead of failing with an AttributeError on None.
      heading = article.find('h1',attrs={'class':"title"})
      link = heading.find('a',href=True) if heading is not None else None
      if link is not None:
        page_links.append(link['href'])

    # Some servers answer out-of-range page numbers with a repeat of an
    # existing page rather than an empty one. Without this guard the loop
    # would then never terminate.
    if all(href in seen_links for href in page_links):
      break

    seen_links.update(page_links)
    all_links.extend(page_links)

    print(current_url)

    page += 1

  return all_links

# Crawl the listing pages now (downloads each page, or reads it from the cache)
all_links = get_all_links()

In [None]:
# Build the link table: one row per article with its absolute URL and a
# file-name-safe title derived from the last path segment of the href.
full_links = []
titles = []
for relative_link in all_links:
  full_links.append(URL_PREFIX + relative_link)
  page_name = relative_link.split('/')[-1].replace('.html','')
  titles.append(slugify(page_name))

link_dataframe = pd.DataFrame({'link': full_links, 'title': titles})
# Persist the table so the crawl does not have to be repeated
link_dataframe.to_csv('meta.csv', index=False)

In [None]:
# Skip the previous cells if you have already scraped the links:
# the link table is simply read back from the meta.csv file written earlier.
link_dataframe = pd.read_csv('meta.csv')

In [None]:
# Site-specific clean-up rules applied to the extracted article text.
# Every match of the patterns below is deleted by process_text().

# Recurring sentences, named after the kind of text they match
INTRODUCTION = re.compile(r"In der Leichten Sprache werden nur wenige Fremd·wörter benutzt\.(\n|.)*Heute:.*\?")
FOOTER = re.compile(r"Möchtest Du Dir ein Fremd·wort wünschen\? .*@einfachstars\.info.*")
HINT = re.compile(r"Diese Erklärung hat sich eine Leserin von Einfachstars gewünscht.")

# Sentences that announce an image
IMAGE_CAPTION_1 = re.compile(r"Hier ist ein Bild .*:\s*\n")
IMAGE_CAPTION_2 = re.compile(r"So sieht .* aus:\s*\n")
IMAGE_CAPTION_3 = re.compile(r"(Hier|In).*kann man .*sehen:")

# Generic clean-up: [bracketed] fragments and blank lines beyond the first one
BRACKETS = re.compile(r"\[[^\[\]]*\]")
DUPLICATED_NEW_LINES = re.compile(r"(?<=\n\n)\s*\n")

# Pattern -> replacement text; the patterns are applied in this order
REMOVE = dict.fromkeys(
  (
    INTRODUCTION,
    FOOTER,
    HINT,
    IMAGE_CAPTION_1,
    IMAGE_CAPTION_2,
    IMAGE_CAPTION_3,
    BRACKETS,
    DUPLICATED_NEW_LINES,
  ),
  '',
)

def process_text(text):
  """
  Apply every rule in REMOVE to `text`, in order, and return the result
  with leading and trailing whitespace stripped.
  """
  cleaned = text
  for pattern in REMOVE:
    cleaned = pattern.sub(REMOVE[pattern], cleaned)
  return cleaned.strip()

In [None]:
def to_plain_text(html):
  """
  Extract the article text from the HTML of an article page.

  Only the <div class="body clear"> element is considered. Nested <div>,
  <iframe> and <blockquote> elements as well as <p class="mailtext">
  paragraphs are dropped, <br> tags become line breaks, list items are
  prefixed with a bullet, and the result is passed through process_text().

  Args:
    html: HTML source of the article page.

  Returns:
    The cleaned plain text of the article.
  """
  # Strip the <strong> markup up front so that emphasised words are not
  # surrounded by the " " separator when the text is extracted below.
  for strong_markup in ('<strong>', '</strong>'):
    html = html.replace(strong_markup, '')

  document = BeautifulSoup(html)
  body = document.find('div',attrs={'class':"body clear"})

  # Collect everything first, then delete: the lookups must all see the
  # untouched tree.
  unwanted_tags = [
    *body.find_all('div'),
    *body.find_all('iframe'),
    *body.find_all('blockquote'),
    *body.find_all('p', attrs={'class':"mailtext"}),
  ]
  for tag in unwanted_tags:
    if tag is None:
      continue
    tag.decompose()

  for line_break in body.find_all("br"):
    line_break.replace_with("\n")

  lines = []
  for block in body.find_all(['p', 'ul']):
    if block.name != 'ul':
      # A paragraph may span several lines because of the <br> replacement
      lines.extend(part.strip() for part in block.get_text(separator=" ").split('\n'))
    else:
      lines.extend("• " + entry.get_text(separator=" ").strip() for entry in block.find_all('li'))

  return process_text("\n".join(lines))

In [None]:
def load_and_save_plain_text(link, name=None):
  """
  Fetch one article, extract its plain text and store it as a .txt file
  in PLAIN_TEXT_FOLDER.

  Args:
    link: URL of the article page (fetched through load_content).
    name: base name of the output file, without extension. Defaults to the
      last path segment of `link`.

  Returns:
    Dict with additional metadata for the article; currently always
    {'genre': 'news'}.
  """
  content = load_content(link)

  if name is None:
    name = link.split('/')[-1]

  # A caller-supplied name must not point into a sub-directory
  name = name.replace('/','_') + '.txt'

  #process article
  plain_text = to_plain_text(content)

  os.makedirs(PLAIN_TEXT_FOLDER, exist_ok=True)

  #save plain text
  # Write-only access is sufficient. UTF-8 is requested explicitly so that the
  # files have the same encoding regardless of the platform's default.
  with open(os.path.join(PLAIN_TEXT_FOLDER, name), 'w', encoding='utf-8') as f:
    f.write(plain_text)

  return {'genre': 'news'}

In [None]:
# Load every article listed in the link table and store its plain text

# Take a snapshot of the rows first: the loop below adds columns to
# link_dataframe, and a frame should not be modified while it is iterated.
rows = list(link_dataframe.iterrows())
total_length = len(rows)
for position, (index, row) in enumerate(rows, start=1):
  link = row['link']
  title = row['title']

  info = load_and_save_plain_text(link, name=title)

  #add additional information to meta file
  for key, value in info.items():
    link_dataframe.at[index,key] = value

  # 1-based counter, so the last message reads (N/N) rather than (N-1/N)
  print(f"Loaded ({position}/{total_length})")

# Save scraped data

In [None]:
# Pack the plain-text files into a zip archive.
# NOTE(review): the absolute /content paths presumably assume a Colab runtime
# whose working directory is /content — confirm before running elsewhere.
!zip -r /content/corpus.zip /content/plain_text -j
# Write the link table to meta.csv again, including any columns added while
# the articles were loaded.
link_dataframe.to_csv('meta.csv', index=False)