## Imports and definitions

In [1]:
### --- default imports ---

import json
import pickle
import pandas as pd
from os import listdir, makedirs
from os.path import isfile, join
from datetime import datetime

In [2]:
### --- constants definitions ---

# root directory of the raw corpora; each pipeline cell joins its own LOCAL_PATH onto it
DATA_BASE = "../../master_cloud/corpora"
# sub-directory below DATA_BASE that receives the pickles written by store()
ETL_BASE = "preprocessed"
ETL_PATH = join(DATA_BASE, ETL_BASE)

# standard meta data fields (column names shared by all corpora)
DATASET = 'dataset'
SUBSET = 'subset'
ID = 'doc_id'
ID2 = 'doc_subid'
TITLE = 'title'
TAGS = 'tags'
TIME = 'date_time'
# the order of META fixes both the column order of the final dataframe and the
# order of the values that are hashed into the HASH index
META = [DATASET, SUBSET, ID, ID2, TITLE, TAGS, TIME]
# column holding the document body
TEXT = 'text'
# column holding the per-document hash; the pipelines use it as dataframe index
HASH = 'hash'

In [11]:
### --- store meta data and content ---

def store(corpus, df):
    """
    Pickle a dataframe into the ETL directory.

    :param corpus: corpus name, used as the stem of the pickle file name
    :param df: dataframe to persist
    :return: path of the pickle file the dataframe was stored in
    """
    # create the output directory on first use
    makedirs(ETL_PATH, exist_ok=True)
    pickle_path = join(ETL_PATH, corpus + '.pickle')
    print('saving to', pickle_path)
    df.to_pickle(pickle_path)
    return pickle_path

## Actions

In [7]:
# display the dataframe built by whichever pipeline cell (section "Pipelines for certain corpora") was run last
df

Unnamed: 0_level_0,dataset,subset,doc_id,doc_subid,title,tags,date_time,text
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-4506752530308577429,FAZ,politik_wirtschaft,http://www.faz.net/aktuell/politik/inland/heik...,0,„Balancierte Partnerschaft“: Maas plädiert für...,"(Heiko Maas, Bundesregierung, Handelsblatt, US...",2018-08-22 11:11:45+02:00,„Balancierte Partnerschaft“: Maas plädiert für...
4663679441774579580,FAZ,politik_wirtschaft,http://www.faz.net/aktuell/wirtschaft/wohnen/g...,0,Anwesen „Gravetye Manor“: Schmaler Grat der wi...,"(Tom Coward, William Robinson, Gravetye Manor ...",2018-08-02 10:34:07+02:00,Anwesen „Gravetye Manor“: Schmaler Grat der wi...
-4706157980894922098,FAZ,politik_wirtschaft,http://www.faz.net/aktuell/wirtschaft/unterneh...,0,Li Shufu aus China: Daimlers Großaktionär verd...,"(Daimler, ISIN_DE0007100000, Geely Internation...",2018-08-22 11:56:35+02:00,Li Shufu aus China: Daimlers Großaktionär verd...
7223691015827943158,FAZ,politik_wirtschaft,http://www.faz.net/aktuell/wirtschaft/arm-und-...,0,Umfrage von EU-Statistikern: 13 Millionen Deut...,"(Sabine Zimmermann, Eurostat, Die Linke, Urlau...",2018-07-19 11:17:19+02:00,Umfrage von EU-Statistikern: 13 Millionen Deut...
1739084689724326116,FAZ,politik_wirtschaft,http://www.faz.net/aktuell/wirtschaft/der-hand...,0,Überraschende Aussage: IW-Chef Hüther findet m...,"(Michael Hüther, Reuters, Institut der deutsch...",2018-08-08 08:13:22+02:00,Überraschende Aussage: IW-Chef Hüther findet m...
2847175808827939484,FAZ,politik_wirtschaft,http://www.faz.net/aktuell/wirtschaft/kuenstli...,0,Handelskammer rechnet vor: Roboterautos erspar...,"(Georg Merziger, Reuters, BamS, DIHK, General ...",2018-07-15 10:58:41+02:00,Handelskammer rechnet vor: Roboterautos erspar...
172640459006029588,FAZ,politik_wirtschaft,http://www.faz.net/aktuell/wirtschaft/diesel-a...,0,Autohersteller: Nissan räumt Fehler in Abgaste...,"(Nissan, Reuters, Renault, ISIN_FR0000131906, ...",2018-07-09 14:45:34+02:00,Autohersteller: Nissan räumt Fehler in Abgaste...
854782444606605248,FAZ,politik_wirtschaft,http://www.faz.net/aktuell/wirtschaft/eurokris...,0,Asyl und Euro: Merkel spaltet die Europäische ...,"(Macron Wunsch, Angela Merkel, Bundeskanzler, ...",2018-06-21 08:33:07+02:00,Asyl und Euro: Merkel spaltet die Europäische ...
1451419110259654497,FAZ,politik_wirtschaft,http://www.faz.net/aktuell/wirtschaft/diginomi...,0,Archivierung von Videospielen: Stirbt Pacman aus?,"(Nintendo, Love, Videospiel, Videospiele, Game...",2018-08-22 07:23:08+02:00,Archivierung von Videospielen: Stirbt Pacman a...
-3942118830455882197,FAZ,politik_wirtschaft,http://www.faz.net/aktuell/politik/politische-...,0,Brüder im Geist: Galaktische Republik,"(Julia Ebner, Islamisten, Glückskeksweisheit, ...",2018-06-04 12:17:11+02:00,Brüder im Geist: Galaktische Republik .\nMit V...


In [10]:
# NOTE: CORPUS and df are only defined by the pipeline cells further down - run one of them first
fname = store(CORPUS, df)

saving to


In [None]:
# read the pickle back from the path returned by store()
pd.read_pickle(fname)

## Pipelines for certain corpora

In [None]:
### --- extract, transform, load (save) the following corpus:

# NOTE: every pipeline cell rebinds CORPUS, LOCAL_PATH, FULL_PATH, transform_subset,
# load_data and df, so only the most recently executed pipeline cell is in effect.
# CORPUS becomes the `dataset` value of every row and the corpus name handed to store().
CORPUS = "OnlineParticipation"
# location of the raw files, relative to DATA_BASE
LOCAL_PATH = "OnlineParticipationDatasets/downloads"
FULL_PATH = join(DATA_BASE, LOCAL_PATH)


def transform_subset(source, subset_name: str):
    """
    Convert the documents of one OnlineParticipation subset into the standard format.

    Entries that carry no tag/category field of their own inherit the tags of an
    earlier entry with the same 'suggestion_id'. Outside of 'wuppertal2017',
    entries with empty 'content' are skipped.

    :param source: iterable of dictionaries in original key/value format
    :param subset_name: string identifier of the subset the data belongs to
    :yield: one dictionary per kept document in standard key/value format
            (META fields, TEXT and HASH)
    """
    tags_by_suggestion = {}
    print('transform', subset_name)

    # wuppertal has a different data scheme
    is_wuppertal = subset_name == 'wuppertal2017'
    tag_field = 'tags' if is_wuppertal else 'category'
    # fields concatenated into the wuppertal text; the fields
    # 'Eigene Rolle bei der Projektidee', 'Geschätzte Umsetzungsdauer und Startschuss'
    # and 'Kostenschätzung der Ideeneinreicher' are not included
    wuppertal_text_fields = (
        'title',
        'content',
        'Voraussichtliche Rolle für die Stadt Wuppertal',
        'Mehrwert der Idee für Wuppertal',
    )

    for doc in source:
        suggestion_id = doc['suggestion_id']
        target = {
            DATASET: CORPUS,
            SUBSET: subset_name,
            ID: suggestion_id,
            TITLE: doc['title'],
            TIME: doc['date_time'],
        }

        # take the entry's own tags and remember them, or fall back to the
        # tags remembered for the same suggestion
        if tag_field in doc:
            tags = doc[tag_field]
            if is_wuppertal:
                tags = tuple(tags)
            tags_by_suggestion[suggestion_id] = tags
        else:
            tags = tags_by_suggestion[suggestion_id]
        target[TAGS] = tags

        if is_wuppertal:
            target[ID2] = 0
            # every field is followed by the ' .\n' separator, including the last one
            target[TEXT] = ''.join(doc[field] + ' .\n' for field in wuppertal_text_fields)
        else:
            target[ID2] = doc.get('comment_id', 0)
            content = doc['content']
            # ignore if no content
            if not content:
                continue
            title = doc['title']
            target[TEXT] = (title + ' .\n' + content) if title else content

        # NOTE(review): built-in hash() of str values is salted per Python process
        # unless PYTHONHASHSEED is fixed, so these values may differ between kernel sessions
        target[HASH] = hash(tuple(target[key] for key in META))
        yield target


def load_data(number_of_subsets: int=None, start: int=0):
    """
    Read the raw JSON files below FULL_PATH and pass each one to transform_subset.

    Only files whose name has 'flat' at positions [-9:-5] are processed, i.e. the
    name ends in 'flat' followed by five more characters (presumably '.json' - TODO confirm).
    The subset name is the file name without its first 6 and last 10 characters.

    :param number_of_subsets: number of directory entries to consider in one call
        (None for no limit); the limit is applied before the 'flat' filter
    :param start: index of the first directory entry to consider
    :yield: for every processed file, the generator returned by transform_subset
    """

    print("process", CORPUS)

    # --- read files ---
    # NOTE(review): os.listdir returns entries in arbitrary order, so `start` is
    # only meaningful as long as the directory listing order is stable
    files = [f for f in listdir(FULL_PATH) if isfile(join(FULL_PATH, f))]

    # translate the count into a slice end; slicing already tolerates an end
    # beyond len(files), so no explicit clamping is needed
    stop = number_of_subsets
    if stop:
        stop += start

    for name in files[start:stop]:
        if name[-9:-5] != 'flat':
            continue

        # use the imported `join`: the `os` module itself is not imported,
        # so `os.path.join` raised a NameError here
        fpath = join(FULL_PATH, name)
        try:
            # explicit UTF-8 so decoding does not depend on the platform default
            with open(fpath, 'r', encoding='utf-8') as fp:
                print('open:', fpath)
                data = json.load(fp)
        except IOError:
            print("Could not open", fpath)
            continue

        # skip files without any documents
        if not data:
            continue
        subset = name[6:-10]

        yield transform_subset(data, subset)


# one dataframe per subset generator, stacked into a single frame that is
# indexed by the document hash and restricted to the standard columns
dfs = list(map(pd.DataFrame, load_data()))
df = pd.concat(dfs)
df = df.set_index(HASH)[META + [TEXT]]

In [6]:
### --- extract, transform, load (save) the following corpus:

# NOTE: every pipeline cell rebinds CORPUS, LOCAL_PATH, FULL_PATH, transform_subset,
# load_data and df, so only the most recently executed pipeline cell is in effect.
# CORPUS becomes the `dataset` value of every row and the corpus name handed to store().
CORPUS = "FAZ"
# location of the raw files, relative to DATA_BASE
LOCAL_PATH = "scrapy/faz"
FULL_PATH = join(DATA_BASE, LOCAL_PATH)


def transform_subset(source, subset_name: str):
    """
    Convert the articles of one FAZ subset into the standard format.

    :param source: iterable of dictionaries in original key/value format; the keys
        'url', 'title', 'keywords', 'published', 'description' and 'text' are read
    :param subset_name: string identifier of the subset the data belongs to
    :yield: one dictionary per article in standard key/value format
            (META fields, TEXT and HASH)
    """
    print('transform', subset_name)

    for doc in source:
        target = dict()
        target[DATASET] = CORPUS
        target[SUBSET] = subset_name  # TODO: parse subset name from url
        target[ID] = doc['url']
        target[ID2] = 0
        target[TITLE] = doc['title']
        # tuple instead of list so the tags can be part of the hashed META tuple
        target[TAGS] = tuple(doc['keywords'])
        if doc['published']:
            # expected format: 2018-08-22T11:11:45+0200
            # (datetime.fromisoformat may work as an alternative in Python 3.7)
            target[TIME] = datetime.strptime(doc['published'], '%Y-%m-%dT%H:%M:%S%z')
        else:
            target[TIME] = None
        # NOTE(review): assumes 'title', 'description' and 'text' are always strings
        target[TEXT] = doc['title'] + ' .\n' \
                     + doc['description'] + ' .\n' \
                     + doc['text']

        # NOTE(review): built-in hash() of str values is salted per Python process
        # unless PYTHONHASHSEED is fixed, so these values may differ between kernel sessions
        target[HASH] = hash(tuple([target[key] for key in META]))
        yield target


def load_data(number_of_subsets: int=None, start: int=0):
    """
    Read the JSON-lines files below FULL_PATH and pass each one to transform_subset.

    Every non-blank line of a file is parsed as one JSON document. The subset name
    is the file name without its first 4 and last 3 characters
    (e.g. 'faz_politik_wirtschaft.jl' -> 'politik_wirtschaft').

    :param number_of_subsets: number of files to process in one call (None for no limit)
    :param start: index of the first file to process
    :yield: for every non-empty file, the generator returned by transform_subset
    """

    print("process", CORPUS)

    # --- read files ---
    # NOTE(review): os.listdir returns entries in arbitrary order, so `start` is
    # only meaningful as long as the directory listing order is stable
    files = [f for f in listdir(FULL_PATH) if isfile(join(FULL_PATH, f))]
    print(files)

    # translate the count into a slice end; slicing already tolerates an end
    # beyond len(files), so no explicit clamping is needed
    stop = number_of_subsets
    if stop:
        stop += start

    for name in files[start:stop]:
        fpath = join(FULL_PATH, name)
        try:
            # explicit UTF-8 so decoding does not depend on the platform default
            with open(fpath, 'r', encoding='utf-8') as fp:
                print('open:', fpath)
                # iterate lazily and skip blank lines, which json.loads cannot parse
                data = [json.loads(line) for line in fp if line.strip()]
        except IOError:
            print("Could not open", fpath)
            continue

        # skip files without any documents
        if not data:
            continue
        subset = name[4:-3]

        yield transform_subset(data, subset)


# build one frame per subset, stack them, then index by the document hash
# and keep only the standard columns
dfs = list(map(pd.DataFrame, load_data()))
df = pd.concat(dfs)
df = df.set_index(HASH)[META + [TEXT]]

process FAZ
['faz_politik_wirtschaft.jl', 'faz_finanzen_feuilleton.jl']
open: ../../master_cloud/corpora/scrapy/faz/faz_politik_wirtschaft.jl
transform politik_wirtschaft
open: ../../master_cloud/corpora/scrapy/faz/faz_finanzen_feuilleton.jl
transform finanzen_feuilleton


In [None]:
### --- extract, transform, load (save) the following corpus:
import gzip
from bs4 import BeautifulSoup

# NOTE: every pipeline cell rebinds CORPUS, LOCAL_PATH, FULL_PATH, transform_subset,
# load_data and df, so only the most recently executed pipeline cell is in effect.
CORPUS = "Europarl"
# location of the raw files, relative to DATA_BASE
LOCAL_PATH = "Europarl/Europarl/xml/de"
# use the imported `join`: the import cell brings in names from `os` but not the
# `os` module itself, so `os.path.join` raised a NameError here
FULL_PATH = join(DATA_BASE, LOCAL_PATH)


def transform_subset(source, subset_name: str):
    """
    Convert one Europarl XML document into standard-format records, one per CHAPTER.

    :param source: raw XML markup of one file, handed to BeautifulSoup's 'xml' parser
    :param subset_name: string identifier of the subset the data belongs to; characters
        [3:11] must be a date in the form yy-mm-dd (e.g. 'ep-07-01-18-009-07')
    :yield: one dictionary per CHAPTER element in standard key/value format
            (META fields, TEXT and HASH)
    """
    print('transform', subset_name)

    soup = BeautifulSoup(source, 'xml')

    for chapter in soup.find_all('CHAPTER'):
        target = dict()
        target[DATASET] = CORPUS
        target[SUBSET] = subset_name
        target[ID] = subset_name
        target[ID2] = chapter.attrs['ID']
        # the title is the chapter's first sentence, rebuilt from its word tokens
        # NOTE(review): assumes every CHAPTER contains at least one <s> element
        target[TITLE] = ' '.join([w.string for w in chapter.find('s').find_all('w')])
        target[TAGS] = None
        target[TIME] = datetime.strptime(subset_name[3:11], '%y-%m-%d')  # ep-07-01-18-009-07

        # one line per speaker name and per paragraph, in document order
        text = []
        for paragraph in chapter.find_all(["SPEAKER", "P"]):
            if paragraph.name == 'SPEAKER':
                text.append(paragraph.attrs['NAME'])
            elif paragraph.name == 'P':
                text.append(' '.join([w.string for w in paragraph.find_all('w')]))
        target[TEXT] = '\n'.join(text)

        # NOTE(review): built-in hash() of str values is salted per Python process
        # unless PYTHONHASHSEED is fixed, so these values may differ between kernel sessions
        target[HASH] = hash(tuple([target[key] for key in META]))
        yield target


def load_data(number_of_subsets: int=None, start: int=0):
    """
    Read the gzip-compressed files below FULL_PATH and pass each one to transform_subset.

    The subset name is the file name without its last 7 characters
    (presumably the '.xml.gz' suffix - TODO confirm).

    :param number_of_subsets: number of files to process in one call (None for no limit)
    :param start: index of the first file to process
    :yield: for every readable file, the generator returned by transform_subset
    """

    print("process", CORPUS)

    # --- read files ---
    # NOTE(review): os.listdir returns entries in arbitrary order, so `start` is
    # only meaningful as long as the directory listing order is stable
    files = [f for f in listdir(FULL_PATH) if isfile(join(FULL_PATH, f))]

    # translate the count into a slice end; slicing already tolerates an end
    # beyond len(files), so no explicit clamping is needed
    stop = number_of_subsets
    if stop:
        stop += start

    for name in files[start:stop]:
        # use the imported `join`: the `os` module itself is not imported,
        # so `os.path.join` raised a NameError here
        fpath = join(FULL_PATH, name)
        try:
            # read the decompressed content as bytes; parsing is left to transform_subset
            with gzip.open(fpath, 'rb') as fp:
                data = fp.read()
        except IOError:
            print("Could not open", fpath)
            continue
        subset = name[:-7]

        yield transform_subset(data, subset)


# build one frame per file, stack them, then index by the document hash
# and keep only the standard columns
dfs = list(map(pd.DataFrame, load_data(number_of_subsets=None)))
df = pd.concat(dfs)
df = df.set_index(HASH)[META + [TEXT]]