In [1]:
import hashlib
import os
import re
import unicodedata

import pandas as pd
import requests
from bs4 import BeautifulSoup

# General

In [145]:
# Directory in which load_content stores downloaded pages so each one is fetched only once.
CACHE_FOLDER = "cache"
# NOTE(review): the two folders below are not referenced in the visible cells —
# presumably output locations used elsewhere; confirm before removing.
PLAIN_TEXT_FOLDER = "plain_text"
ORIGINAL_TEXT_FOLDER = "original_text"

In [3]:
def load_content(page, params=None):
  """Return the body of `page` as text, backed by an on-disk cache in CACHE_FOLDER.

  The cache file is named after the last non-empty path segment of `page`;
  when `params` is given, a digest of the parameters is appended so that
  different queries against the same page get separate cache entries.

  Args:
    page: URL to download.
    params: optional dict of query parameters forwarded to requests.get.

  Returns:
    str: the cached text if a cache file exists, otherwise the freshly
    downloaded response text (which is then written to the cache).

  NOTE(review): only the last path segment is used as the file name, so two
  different URLs that end in the same segment share one cache entry.
  """
  os.makedirs(CACHE_FOLDER, exist_ok=True)

  # Strip trailing slashes first: for a URL ending in '/' the last segment is
  # empty, which would make the cache path point at the cache directory itself.
  cache_name = page.rstrip('/').split('/')[-1]
  if params is not None:
    # The built-in hash() is salted per interpreter process for strings, so it
    # produces a different file name after every kernel restart. A digest of
    # the sorted parameters is stable across runs.
    canonical = repr(sorted((str(key), str(value)) for key, value in params.items()))
    params_hash = hashlib.sha256(canonical.encode('utf-8')).hexdigest()[:16]
    cache_name += "_" + params_hash
  cache_location = os.path.join(CACHE_FOLDER, cache_name)

  # Explicit UTF-8 keeps the cache readable regardless of the platform locale.
  if os.path.exists(cache_location):
    with open(cache_location, encoding='utf-8') as f:
      return f.read()

  result = requests.get(page, allow_redirects=True, params=params)
  content = result.text
  with open(cache_location, 'w', encoding='utf-8') as f:
    f.write(content)
  return content

In [4]:
def slugify(value, allow_unicode=False):
    """
    Turn an arbitrary value into a URL/file-name friendly slug.

    Adapted from https://github.com/django/django/blob/master/django/utils/text.py

    The value is converted to a string and Unicode-normalised; unless
    'allow_unicode' is set, everything that cannot be expressed in ASCII is
    dropped. The text is lower-cased, characters other than alphanumerics,
    underscores, whitespace and hyphens are removed, every run of whitespace
    and hyphens becomes a single hyphen, and leading/trailing hyphens and
    underscores are stripped.
    """
    text = str(value)
    if not allow_unicode:
        # Decompose accented characters, then discard the non-ASCII remainder.
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)

    without_symbols = re.sub(r'[^\w\s-]', '', text.lower())
    hyphenated = re.sub(r'[-\s]+', '-', without_symbols)
    return hyphenated.strip('-_')

# Scrape all articles

In [14]:
def get_all_links():
  """Return the unique article links found in the first column of ./meta.csv.

  The file is read without a header so that a plain one-link-per-line list
  works. The notebook later rewrites meta.csv with a 'link,title' header row;
  when that header is present it is skipped instead of being returned as if
  it were a URL. Duplicates are removed while keeping the order of first
  appearance, so the result is identical on every run.

  Returns:
    list: distinct values of the first column, in file order.
  """
  df = pd.read_csv('./meta.csv', header=None)
  links = df[0].dropna()

  # A first cell equal to 'link' is the column header produced by
  # link_dataframe.to_csv('meta.csv', index=False), not a real link.
  if not links.empty and links.iloc[0] == 'link':
    links = links.iloc[1:]

  # dict.fromkeys de-duplicates like a set but preserves insertion order.
  return list(dict.fromkeys(links))

# Runs when the cell executes: reads ./meta.csv and keeps the de-duplicated links for the next cell.
all_links = get_all_links()

In [15]:
# Build a slug per link from its second-to-last '/'-separated segment
# (with any '.html' removed), then store link and slug side by side.
titles = list(map(
  lambda url: slugify(url.split('/')[-2].replace('.html', '')),
  all_links,
))

link_dataframe = pd.DataFrame({'link': all_links, 'title': titles})
# Overwrites meta.csv with a 'link,title' header row and no index column.
link_dataframe.to_csv('meta.csv', index=False)

In [16]:
# Skip the previous cells if you have already scraped the links:
# this reloads the link/title table that was written to meta.csv.
link_dataframe = pd.read_csv('meta.csv')

In [181]:
# Text clean-up rules used by the text helpers below.
# Matches the literal parenthesised ellipsis "(…)".
MULTIPLE_FULLSTOPS = re.compile(r"\(…\)")
# Matches whitespace ending in a newline that directly follows two newlines.
DUPLICATED_NEW_LINES = re.compile(r"(?<=\n\n)\s*\n")
# Matches every run of one or more spaces.
DUPLICATED_SPACES = re.compile(r" +")

# Pattern -> replacement. Dicts keep insertion order, so the rules are applied
# from top to bottom when iterated.
REMOVE = dict((
  (MULTIPLE_FULLSTOPS, ''),
  (DUPLICATED_NEW_LINES, ''),
  (DUPLICATED_SPACES, ' '),
))

def process_text(text):
  """Apply every REMOVE substitution to `text`, in order, and strip the result.

  Args:
    text: the string to clean.

  Returns:
    str: `text` after all pattern replacements, without leading or trailing
    whitespace.
  """
  cleaned = text
  for pattern in REMOVE:
    cleaned = pattern.sub(REMOVE[pattern], cleaned)
  return cleaned.strip()
  
def get_original_link(text):
  """Return `text` cleaned exactly the way process_text cleans it.

  NOTE(review): despite its name this helper neither looks up nor returns a
  link; it performs the same REMOVE substitutions and strip as process_text.
  Confirm what it was meant to do before relying on it.
  """
  return process_text(text)

In [218]:
def to_parallel_list(soup, link, phrases):
  """Collect (normal, simple) paragraph pairs from an article into `phrases`.

  Headings, links, <strong> tags and the spans with class 'like-h1 like-h2'
  or 'text-small' are removed from `soup` first. Then every <p> inside a
  <section class="textblock container"> is visited in document order:
  paragraphs without a remaining <span> are accumulated as normal text, and a
  paragraph that still contains a <span> is treated as the simple version of
  the normal text accumulated so far. A pair is only stored when both sides
  are non-empty; the accumulated normal text is reset after every simple
  paragraph and is carried over across section boundaries otherwise.

  Args:
    soup: parsed document; modified in place (the tags listed above are
      decomposed).
    link: value stored under 'source' in every appended pair.
    phrases: list that receives dicts with the keys 'normal_phrase',
      'simple_phrase' and 'source'.

  Returns:
    None. Results are appended to `phrases`; one progress line is printed per
    paragraph.
  """
  to_remove = []
  to_remove.extend(soup.find_all('h1'))
  to_remove.extend(soup.find_all('h2'))
  to_remove.extend(soup.find_all('a'))
  # attrs must be a dict. A set literal such as {'class', 'text-small'} is
  # interpreted by bs4 as a list of alternative class values, which would
  # also match class="class".
  to_remove.extend(soup.find_all('span', {'class': 'like-h1 like-h2'}))
  to_remove.extend(soup.find_all('span', {'class': 'text-small'}))
  to_remove.extend(soup.find_all('strong'))
  for remove_tag in to_remove:
    remove_tag.decompose()

  clusters = soup.find_all('section', attrs={'class': 'textblock container'})
  counter = 1
  normal_paragraph = ""
  for cluster in clusters:
    for paragraph in cluster.find_all('p'):
      # process_text already strips the text, so it can be used as-is below.
      text = process_text(paragraph.get_text(separator=" ").replace('\n', ' '))

      if paragraph.find('span') is not None:
        normal_text = normal_paragraph.strip()
        if normal_text != "" and text != "":
          phrases.append({
            'normal_phrase': normal_text,
            'simple_phrase': text,
            'source': link
          })
        # Start a fresh normal block whether or not a pair was stored.
        normal_paragraph = ""
      else:
        normal_paragraph += text + " "

      print("Sentence {} processed".format(counter))
      counter += 1

def scrape_article(html, phrases, link):
  """Parse one article page and append its phrase pairs to `phrases`.

  Args:
    html: HTML text of the article page.
    phrases: list that to_parallel_list extends in place.
    link: URL of the article, stored as 'source' on every pair.

  Returns:
    None.
  """
  # Name the parser explicitly: without it bs4 picks whichever parser happens
  # to be installed (and warns), so the same page could be parsed differently
  # on another machine. 'html.parser' ships with Python.
  soup = BeautifulSoup(html, 'html.parser')
  to_parallel_list(soup, link, phrases)


In [None]:
# Load every linked article (from the cache when available) and collect its phrase pairs
phrases = []
total_length = len(link_dataframe.index)
for index, link in link_dataframe['link'].items():
  print(link)
  content = load_content(link)
  scrape_article(content, phrases, link)

  # Progress report on every tenth row index.
  if index % 10 == 0:
    print(f"Loaded ({index}/{total_length})")

print(phrases)
df = pd.DataFrame(phrases)


# Save scraped data

In [220]:
# Persist the collected phrase pairs; index=False keeps the row index out of the CSV.
df.to_csv('brandeins.csv', index=False)