## Imports and definitions

In [None]:
### --- default imports ---

import json
import pandas as pd
from os import listdir, makedirs
from os.path import isfile, join
from datetime import datetime
import re
import gzip
from bs4 import BeautifulSoup
from html import unescape

from constants import DATA_BASE, ETL_PATH, NLP_PATH, LOCL_PATH, FULL_PATH, \
    DATASET, SUBSET, ID, ID2, TITLE, TIME, META, TEXT, DESCR, LINKS, TAGS, DATA, HASH

In [None]:
### --- store meta data and content ---

def store(corpus, df):
    """
    Pickle a dataframe into the ETL directory.

    :param corpus: base name of the target file; '.pickle' is appended
    :param df: dataframe to persist
    :return: the file name where the dataframe was stored
    """
    # make sure the output directory exists before writing into it
    makedirs(ETL_PATH, exist_ok=True)

    target_file = join(ETL_PATH, corpus + '.pickle')
    print('saving to', target_file)
    df.to_pickle(target_file)

    return target_file

def read(f):
    """
    Load a pickled object (here: a dataframe) from disk.

    :param f: path of the pickle file to load
    :return: the object restored by ``pd.read_pickle``
    """
    # NOTE(review): unpickling can execute arbitrary code -- only read trusted files
    frame = pd.read_pickle(f)
    return frame

## Actions

In [None]:
# notebook cell: display the dataframe currently bound to `df`
# (it is assigned by the pipeline cells further down)
df

In [None]:
# pickle the current dataframe under the name of the active corpus and
# remember the resulting file name
fname = store(CORPUS, df)

In [None]:
# READ, PROCESS and *STORE*

# all regular files in FULL_PATH whose name starts with 'dew', in sorted order
files = sorted(
    entry for entry in listdir(FULL_PATH)
    if isfile(join(FULL_PATH, entry)) and entry.startswith('dew')
)

for name in files:
    fname = join(FULL_PATH, name)
    df = read(fname)
    # strip surrounding whitespace from the document ids and the titles
    df['doc_id'] = df['doc_id'].str.strip()
    df['title'] = df['title'].str.strip()
    print('saving to', fname)
    # the write-back is disabled: the cleaned frame is NOT persisted
    #df.to_pickle(fname)

## Pipelines for certain corpora

In [None]:
### --- extract, transform, load (save) the following corpus:

# name of the corpus handled by this cell; written to the DATASET field of every record
CORPUS = "OnlineParticipation"
# directory of the raw downloads relative to DATA_BASE; these two assignments
# replace the LOCL_PATH / FULL_PATH values imported from `constants`
LOCL_PATH = "OnlineParticipationDatasets/downloads"
FULL_PATH = join(DATA_BASE, LOCL_PATH)

def transform_subset(source, subset_name: str):
    """
    Convert the records of one OnlineParticipation subset to the standard format.

    This is a generator. Records without a tag/category field inherit the value
    of an earlier record with the same suggestion id. Outside of the
    'wuppertal2017' subset, records with empty content are skipped.

    :param source: iterable of dictionaries in original key/value format
    :param subset_name: string identifier of the subset the data belongs to
    :yield: one dictionary in standard key/value format per kept record
    :raises KeyError: if a record has no tags/category and no earlier record
        with the same suggestion id provided one, or if an expected field is
        missing from the record
    """
    # suggestion id -> tags/category of the first record that carried them
    category_lookup = {}
    print('transform', subset_name)

    for doc in source:
        target = dict()
        target[DATASET] = CORPUS
        target[SUBSET]  = subset_name
        target[ID]      = doc['suggestion_id']
        target[TITLE]   = doc['title']
        target[TIME]    = doc['date_time']
        target[DESCR]   = None

        # wuppertal has a different data scheme
        if subset_name == 'wuppertal2017':
            if 'tags' in doc:
                target[TAGS] = tuple(doc['tags'])
                category_lookup[target[ID]] = target[TAGS]
            else:
                target[TAGS] = category_lookup[target[ID]]
            target[ID2]      = None
            # Set LINKS explicitly: the other branch and the other corpora
            # always provide this key, so every record has the same key set.
            target[LINKS]    = None
            # The text is the concatenation of these fields, each terminated by
            # ' .\n'. Deliberately not included:
            #   'Eigene Rolle bei der Projektidee'
            #   'Geschätzte Umsetzungsdauer und Startschuss'
            #   'Kostenschätzung der Ideeneinreicher'
            text_fields = (
                'content',
                'Voraussichtliche Rolle für die Stadt Wuppertal',
                'Mehrwert der Idee für Wuppertal',
            )
            target[TEXT]     = ''.join(doc[field] + ' .\n' for field in text_fields)
        else:
            if 'category' in doc:
                target[TAGS] = doc['category']
                category_lookup[target[ID]] = target[TAGS]
            else:
                target[TAGS] = category_lookup[target[ID]]
            # a record without comment id gets ID2 = 0
            target[ID2]      = doc['comment_id'] if ('comment_id' in doc) else 0
            # records with a (truthy) comment id link back to their suggestion
            target[LINKS]    = target[ID] if target[ID2] else None
            # ignore if no content
            if not doc['content']:
                continue
            target[TEXT]     = doc['content']

        # NOTE(review): the built-in hash() of strings is salted per process
        # unless PYTHONHASHSEED is fixed, so these values are presumably not
        # reproducible across runs -- confirm this is acceptable
        target[HASH] = hash(tuple([target[key] for key in META]))
        yield target


def load_data(number_of_subsets: int=None, start: int=0):
    """
    Read the flat JSON files of the corpus and hand them to transform_subset.

    The limit and the start index refer to the directory listing, i.e. files
    that are skipped further down still count towards the limit.

    :param number_of_subsets: number of directory entries to look at in one
        call (None for no limit)
    :param start: index of the first directory entry to look at
    :yield: for every non-empty flat file the generator returned by
        transform_subset
    """

    print("process", CORPUS)

    # --- read files ---
    files = [f for f in listdir(FULL_PATH) if isfile(join(FULL_PATH, f))]

    # turn the count into an end index; an end index beyond the listing means "all"
    if number_of_subsets:
        number_of_subsets += start
        if number_of_subsets > len(files):
            number_of_subsets = None

    for name in files[start:number_of_subsets]:
        # only names with 'flat' right before a 5-character suffix are data
        # files (presumably '...flat.json')
        if name[-9:-5] != 'flat':
            continue

        fpath = join(FULL_PATH, name)
        try:
            # JSON text is UTF-8 by specification; without an explicit
            # encoding the platform default would be used, which is not
            # UTF-8 everywhere
            with open(fpath, 'r', encoding='utf-8') as fp:
                print('open:', fpath)
                data = json.load(fp)
                if not data:
                    continue
        except IOError:
            print("Could not open", fpath)
            continue
        # the subset name is the file name without its first 6 and last 10 characters
        subset = name[6:-10]

        yield transform_subset(data, subset)


# one dataframe per subset generator delivered by load_data()
dfs = list(map(pd.DataFrame, load_data()))
if dfs:
    df = pd.concat(dfs)
    # the per-subset frames are no longer needed once they are merged
    del dfs
    # index the records by their hash and keep the META + DATA columns, in that order
    df = df.set_index(HASH)[META + DATA]

In [None]:
### --- extract, transform, load (save) the following corpus:

# select the news corpus to process by index: 0 -> "FAZ", 1 -> "FOCUS"
CORPUS = ["FAZ", "FOCUS"][0]
# the input directory is named after the lower-cased corpus name
LOCL_PATH = "scrapy/" + CORPUS.lower()
FULL_PATH = join(DATA_BASE, LOCL_PATH)

# per corpus: index of the '/'-separated URL component that is used as subset name
depth = {
    "FAZ": 4,
    "FOCUS": 3,
}

# matches a timestamp that ends in '+HH:MM'; the two groups are everything up
# to the offset hours and the offset minutes, so that substituting them
# back-to-back removes the colon ('+01:00' -> '+0100')
re_date = re.compile(r'^(.*?\+\d\d):(\d\d)$')

def transform_subset(source, subset_name: str):
    """
    Generator mapping scraped news articles onto the standard record layout.

    :param source: iterable of article dictionaries in original key/value format
    :param subset_name: string identifier of the input; it is only printed,
        the SUBSET field of a record is taken from the article URL
    :return: generator of dictionaries in standard key/value format
    """
    print('transform', subset_name)

    time_format = '%Y-%m-%dT%H:%M:%S%z'

    for article in source:
        url = article['url']
        record = {
            DATASET: CORPUS,
            # the subset is one path component of the URL, selected per corpus
            SUBSET: url.split('/')[depth[CORPUS]],
            ID: url,
            ID2: None,
            TITLE: article['title'],
            LINKS: None,
        }

        published = article['published']
        if not published:
            record[TIME] = None
        else:
            # FAZ:   2018-08-22T11:11:45+0200
            # FOCUS: 2013-02-15T12:10:49+01:00
            # re_date drops the colon of the offset so both parse with %z
            normalized = re_date.sub(r'\g<1>\g<2>', published)
            record[TIME] = datetime.strptime(normalized, time_format)

        record[TEXT] = article['text']
        record[DESCR] = article['description']
        record[TAGS] = tuple(article['keywords'])

        record[HASH] = hash(tuple(record[key] for key in META))
        yield record


def load_data(number_of_subsets: int=None, start: int=0):
    """
    Read the JSON-lines files of the corpus and hand them to transform_subset.

    :param number_of_subsets: number of files to process in one call (None for no limit)
    :param start: index of the first file (in directory listing order) to process
    :yield: for every non-empty file the generator returned by transform_subset
    """

    print("process", CORPUS)

    # --- read files ---
    files = [f for f in listdir(FULL_PATH) if isfile(join(FULL_PATH, f))]
    print(files)

    # turn the count into an end index; an end index beyond the listing means "all"
    if number_of_subsets:
        number_of_subsets += start
        if number_of_subsets > len(files):
            number_of_subsets = None

    for name in files[start:number_of_subsets]:
        fpath = join(FULL_PATH, name)
        try:
            # JSON text is UTF-8 by specification; without an explicit
            # encoding the platform default would be used, which is not
            # UTF-8 everywhere
            with open(fpath, 'r', encoding='utf-8') as fp:
                print('open:', fpath)
                # One JSON document per line. Blank lines (e.g. a trailing
                # newline) are skipped; otherwise json.loads would raise an
                # error that the except clause below does not cover.
                data = [json.loads(line) for line in fp if line.strip()]
                if not data:
                    continue
        except IOError:
            print("Could not open", fpath)
            continue
        # file name without the '<CORPUS>' prefix plus one separator character
        # and without its last 3 characters
        subset = name[len(CORPUS)+1:-3]

        yield transform_subset(data, subset)


# build one dataframe per yielded subset generator
dfs = list(map(pd.DataFrame, load_data()))
if dfs:
    df = pd.concat(dfs)
    # drop the list of partial frames after merging
    del dfs
    # records are indexed by hash; columns are restricted to META + DATA
    df = df.set_index(HASH)[META + DATA]

In [None]:
### --- extract, transform, load (save) the following corpus:

# name of the corpus handled by this cell; written to the DATASET field of every record
CORPUS = "Europarl"
# directory of the input files relative to DATA_BASE
LOCL_PATH = "Europarl/Europarl/xml/de"
FULL_PATH = join(DATA_BASE, LOCL_PATH)

# matches the literal token sequence '& #160 ;' (presumably a tokenised
# '&#160;' non-breaking-space entity -- confirm); it is replaced by a plain
# space in titles and texts
re_ws = re.compile(r'& #160 ;')

def transform_subset(source, subset_name: str):
    """
    Generator turning one Europarl XML document into standard-format records.

    Every CHAPTER element becomes one record. Its text is built from the
    speaker names and the words of the paragraphs, one line per element; the
    leading title is cut off from the text.

    :param source: XML markup of one document, as accepted by BeautifulSoup
    :param subset_name: name of the document, e.g. 'ep-07-01-18-009-07'; it is
        used as SUBSET and ID, and characters 3 to 10 are parsed as the date
    :yield: one dictionary in standard key/value format per chapter
    """
    print('transform', subset_name)

    soup = BeautifulSoup(source, 'xml')

    for chapter in soup.find_all('CHAPTER'):
        target = dict()
        target[DATASET] = CORPUS
        target[SUBSET]  = subset_name
        target[ID]      = subset_name
        target[ID2]     = chapter.attrs['ID']

        # The title is the name of the first speaker or, if the chapter opens
        # with a paragraph, the words of the chapter's first sentence.
        # NOTE(review): assumes every chapter contains at least one SPEAKER or
        # P element (and an 's' element in the latter case) -- confirm
        first = chapter.find(["SPEAKER", "P"])
        if first.name == 'SPEAKER':
            title = first.get('NAME')
        elif first.name == 'P':
            title = ' '.join([w.string for w in chapter.find('s').find_all('w')])
        target[TIME]    = datetime.strptime(subset_name[3:11], '%y-%m-%d')  # ep-07-01-18-009-07

        # one line per speaker name / paragraph, in document order
        lines = []
        for paragraph in chapter.find_all(["SPEAKER", "P"]):
            if paragraph.name == 'SPEAKER':
                lines.append(paragraph.get('NAME'))
            elif paragraph.name == 'P':
                lines.append(' '.join([w.string for w in paragraph.find_all('w')]))
        text = '\n'.join(lines)

        target[TITLE]   = re_ws.sub(" ", unescape(title)).strip()
        # the text starts with the title, which is stored separately
        target[TEXT]    = re_ws.sub(" ", unescape(text[len(title):])).strip()
        target[TAGS]    = None
        target[LINKS]   = None
        target[DESCR]   = None

        target[HASH] = hash(tuple([target[key] for key in META]))
        yield target


def load_data(number_of_subsets: int=None, start: int=0):
    """
    Read the gzip-compressed files of the corpus and hand them to transform_subset.

    :param number_of_subsets: number of files to process in one call (None for no limit)
    :param start: index of the first file (in directory listing order) to process
    :return: generator yielding, per readable file, the generator returned by
        transform_subset
    """

    print("process", CORPUS)

    # --- read files ---
    files = [entry for entry in listdir(FULL_PATH) if isfile(join(FULL_PATH, entry))]

    # translate the count into an end index for the slice below
    stop = number_of_subsets
    if stop:
        stop += start
        if stop > len(files):
            stop = None

    for name in files[start:stop]:
        fpath = join(FULL_PATH, name)
        try:
            with gzip.open(fpath, 'rb') as fp:
                data = fp.read()
        except IOError:
            print("Could not open", fpath)
            continue

        # the subset name is the file name without its last 7 characters
        # (presumably the '.xml.gz' extension)
        yield transform_subset(data, name[:-7])


# every item from load_data() is a generator of records -> one frame each
dfs = list(map(pd.DataFrame, load_data(number_of_subsets=None)))
if dfs:
    df = pd.concat(dfs)
    # the partial frames can go once the merged frame exists
    del dfs
    # index by record hash, keep the META + DATA columns in that order
    df = df.set_index(HASH)[META + DATA]

In [None]:
### --- extract, transform, load (save) the following corpus:

# name of the corpus handled by this cell; written to the DATASET field of every record
CORPUS = "PoliticalSpeeches"
# NOTE(review): this cell assigns LOCAL_PATH, whereas the preceding cells and
# the import from `constants` use the name LOCL_PATH
LOCAL_PATH = "German-political-speeches-2018-release"
FULL_PATH = join(DATA_BASE, LOCAL_PATH)

# German month names mapped to their two-digit month numbers
months = dict(Januar='01', Februar='02', März='03', April='04', Mai='05', Juni='06',
              Juli='07', August='08', September='09', Oktober='10', November='11', Dezember='12')
# matches any one of the month names above
pattern = re.compile(r'(' + '|'.join(months.keys()) + r')')

def transform_subset(source, subset_name: str):
    """
    Generator turning the speeches of one XML file into standard-format records.

    :param source: XML markup of one file, as accepted by BeautifulSoup
    :param subset_name: string identifier of the subset the data belongs to
    :return: generator of dictionaries in standard key/value format, one per
        'text' element
    """
    print('transform', subset_name)

    soup = BeautifulSoup(source, 'xml')

    for speech in soup.find_all('text'):
        record = {
            DATASET: CORPUS,
            SUBSET: subset_name,
            ID: speech.get('url'),
            ID2: None,
            TITLE: speech.get('titel').strip(),
            TAGS: speech.get('person'),
            LINKS: None,
        }

        raw_date = speech.attrs['datum']
        if not raw_date:
            record[TIME] = None
        elif pattern.search(raw_date):
            # the date contains a month name: remove the blanks and replace the
            # name by its number plus a dot, e.g. '12. Januar 2018' ->
            # '12.Januar2018' -> '12.01.2018'
            compact = raw_date.replace(" ", "")
            numeric = pattern.sub(lambda found: months[found.group()] + '.', compact)
            record[TIME] = datetime.strptime(numeric, '%d.%m.%Y')
        else:
            # otherwise the date is parsed as purely numeric
            record[TIME] = datetime.strptime(raw_date, '%d.%m.%Y')

        record[TEXT] = speech.find('rohtext').string.strip()
        record[DESCR] = speech.get('untertitel')

        record[HASH] = hash(tuple(record[key] for key in META))
        yield record


def load_data(number_of_subsets: int=None, start: int=0):
    """
    Read the XML files of the corpus and hand them to transform_subset.

    The limit and the start index refer to the directory listing, i.e. files
    that are not XML files still count towards the limit.

    :param number_of_subsets: number of directory entries to look at in one
        call (None for no limit)
    :param start: index of the first directory entry to look at
    :return: generator yielding, per readable XML file, the generator returned
        by transform_subset
    """

    print("process", CORPUS)

    # --- read files ---
    files = [entry for entry in listdir(FULL_PATH) if isfile(join(FULL_PATH, entry))]

    # translate the count into an end index for the slice below
    stop = number_of_subsets
    if stop:
        stop += start
        if stop > len(files):
            stop = None

    for name in files[start:stop]:
        if not name.endswith('xml'):
            continue

        fpath = join(FULL_PATH, name)
        try:
            # NOTE(review): the file is decoded with the platform default
            # encoding -- confirm this matches the encoding of the XML files
            with open(fpath, 'r') as fp:
                data = fp.read()
        except IOError:
            print("Could not open", fpath)
            continue

        # the subset name is the file name without its last 4 characters
        yield transform_subset(data, name[:-4])


# one dataframe per subset generator; `dfs` is kept alive afterwards
dfs = list(map(pd.DataFrame, load_data(number_of_subsets=None)))
if dfs:
    df = pd.concat(dfs)
    # index by record hash, keep the META + DATA columns in that order
    df = df.set_index(HASH)[META + DATA]

In [None]:
### --- extract, transform, load (save) the following corpus:

# name of the corpus handled by this cell; written to the DATASET field of
# every record and used as prefix of the stored file names
CORPUS = "dewac"
# NOTE(review): this cell assigns LOCAL_PATH, whereas the import from
# `constants` uses the name LOCL_PATH
LOCAL_PATH = "WaCKy/dewac"
FULL_PATH = join(DATA_BASE, LOCAL_PATH)

# length of the "CURRENT URL " marker (12 characters); used to slice the
# marker off the beginning of a line
mn = 12

def transform_line(source, subset):
    """
    Build one standard-format record from a raw dewac document.

    :param source: dictionary holding the raw 'url' line and the 'text' of one document
    :param subset: identifier of the subset the document belongs to
    :return: dictionary in standard key/value format, or None if the document
        is badly formatted
    """
    marker = "CURRENT URL "
    raw_url = source['url']

    # a usable document has a url line that carries the marker and a text that does not
    if not (raw_url[:mn] == marker and source['text'][:mn] != marker):
        # print('INFO: bad formatted doc')
        return None

    doc_id = raw_url[mn:].strip()
    record = {
        DATASET: CORPUS,
        SUBSET: subset,
        ID: doc_id,
        ID2: None,
        # the title is the last path component of the URL
        TITLE: doc_id.split('/')[-1],
        TAGS: None,
        TIME: None,
        TEXT: source['text'].strip(),
        DESCR: None,
        LINKS: None,
    }
    record[HASH] = hash(tuple(record[key] for key in META))
    return record


def transform_subset(fp, subset, number_of_documents):
    """
    Generator reading the next chunk of documents from the open corpus file.

    At most ``number_of_documents * 2`` lines are consumed from ``fp``. A line
    starting with "CURRENT URL " begins a new document; all other lines are
    appended to the text of the current document. The line budget of two lines
    per document presumably reflects a url line followed by a single text line.

    The document that is still being collected when the line budget or the
    file ends is emitted as well. Previously it was silently dropped, which
    cost every chunk its last document.

    :param fp: open text file positioned at the start of the chunk
    :param subset: identifier of the subset, passed on to transform_line
    :param number_of_documents: half the number of lines to read
    :yield: the records returned by transform_line, skipping rejected documents
    """
    url = ''
    text = ''
    for _ in range(number_of_documents * 2):
        line = fp.readline()
        if not line:
            # end of file: readline() would only return '' from here on
            break
        if line[:mn] != "CURRENT URL ":
            text += line
            continue
        # a new document starts: emit the one collected so far, if there is one
        if url:
            row = transform_line({'url': url, 'text': text}, subset)
            if row:
                yield row
        url = line
        text = ''

    # emit the document that was in progress when reading stopped
    if url:
        row = transform_line({'url': url, 'text': text}, subset)
        if row:
            yield row

def load_data(number_of_subsets: int=None, number_of_documents=100000):
    """
    Split the dewac corpus file into subsets and store each one as a pickle.

    The file 'dewac_preproc' in FULL_PATH is read chunk by chunk through
    transform_subset. Every non-empty chunk is converted to a dataframe and
    stored under the name '<CORPUS>_<two-digit subset number>'. Processing
    stops when a chunk yields no documents or the subset limit is reached.

    :param number_of_subsets: maximum number of subsets to process (None for no limit)
    :param number_of_documents: chunk size passed on to transform_subset
    :return: None; the subsets are written to disk via store()
    """
    print("process", CORPUS)

    # --- read files ---
    name = 'dewac_preproc'
    fpath = join(FULL_PATH, name)
    print(fpath)
    with open(fpath, 'r', encoding='latin-1') as fp:
        i = 1
        count_total = 0
        while True:
            if number_of_subsets and i > number_of_subsets:
                print('limit of subsets reached')
                break
            print("process subset {:02d}".format(i))
            # materialise the chunk so that it can be counted
            rows = list(transform_subset(fp, i, number_of_documents))
            count_tmp = len(rows)
            count_total += count_tmp
            print("{:d} documents processed ({:d} in total)".format(count_tmp, count_total))
            if not rows:
                print('no more documents')
                break
            # rows is a list of dicts, which the DataFrame constructor accepts directly
            df = pd.DataFrame(rows).set_index(HASH, drop=True)[META+DATA]
            del rows
            store("{}_{:02d}".format(CORPUS, i), df)
            i += 1


# run the dewac pipeline without a subset limit, with a chunk size of 55000
load_data(number_of_subsets=None, number_of_documents=55000)