### Importing packages

In [1]:
! pip install -U pandas lxml

Collecting pandas
  Using cached https://files.pythonhosted.org/packages/4a/6a/94b219b8ea0f2d580169e85ed1edc0163743f55aaeca8a44c2e8fc1e344e/pandas-1.0.3-cp37-cp37m-manylinux1_x86_64.whl
Collecting lxml
  Using cached https://files.pythonhosted.org/packages/85/9e/93e2c3af278c7c8b6826666bbcb145af2829bd761c3b329e51cd6343836c/lxml-4.5.0-cp37-cp37m-manylinux1_x86_64.whl
Requirement already up-to-date: networkx in /opt/conda/lib/python3.7/site-packages (2.4)
Installing collected packages: pandas, lxml
  Found existing installation: pandas 0.25.1
    Uninstalling pandas-0.25.1:
      Successfully uninstalled pandas-0.25.1
Successfully installed lxml-4.5.0 pandas-1.0.3


In [2]:
import os
import sys
import re
import pickle
import json

from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt

import xml.etree.ElementTree as ET
from lxml import etree
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString

#sys.path.insert(1, '../scripts/utils/')
#from text_filter import spacify_soup

### Retrieving pickle

In [3]:
# Load the pickled file-index DataFrame from the outputs folder.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df_files = pd.read_pickle('../outputs/df_files.pkl')

In [4]:
# Preview the first rows of the file index.
df_files.head()

Unnamed: 0,path,name,extension,size,folder,depth,parent,uid,main,source,year,date
0,artikel_/APA/1986/APA_19860220.xml,APA_19860220,xml,6142251,False,4,5,1005,artikel_,APA,1986,1986-02-20
1,artikel_/APA/1986/APA_19860102.xml,APA_19860102,xml,3397140,False,4,5,1005,artikel_,APA,1986,1986-01-02
2,artikel_/APA/1986/APA_19860222.xml,APA_19860222,xml,3352934,False,4,5,1005,artikel_,APA,1986,1986-02-22
3,artikel_/APA/1986/APA_19860103.xml,APA_19860103,xml,3794819,False,4,5,1005,artikel_,APA,1986,1986-01-03
4,artikel_/APA/1986/APA_19860221.xml,APA_19860221,xml,5763730,False,4,5,1005,artikel_,APA,1986,1986-02-21


In [5]:
# Summarise the file index: non-null count and dtype of every column.
df_files.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175600 entries, 0 to 176739
Data columns (total 12 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   path       175600 non-null  object        
 1   name       175600 non-null  object        
 2   extension  175600 non-null  object        
 3   size       175600 non-null  int64         
 4   folder     175600 non-null  bool          
 5   depth      175600 non-null  int64         
 6   parent     175600 non-null  int64         
 7   uid        175600 non-null  int64         
 8   main       175600 non-null  object        
 9   source     171457 non-null  object        
 10  year       171457 non-null  period[A-DEC] 
 11  date       175600 non-null  datetime64[ns]
dtypes: bool(1), datetime64[ns](1), int64(4), object(5), period[A-DEC](1)
memory usage: 16.2+ MB


### Examining a single file:

In [6]:
# Root directory of the corpus; the 'path' values stored in df_files are
# relative to it and are joined onto it below.
path = '/home/jovyan/shared/C_amc_141/R_amc_3.1_12921/203_vert_spacy_rftt/'
# Index *label* (looked up with .loc, not a row position) of the file to inspect.
idx = 2
print(df_files.loc[idx, 'path'])
filepath = os.path.join(path, df_files.loc[idx, 'path'])


# Parsing with Beautiful Soup
# The file is opened in binary mode so that the XML parser decodes the bytes
# itself (honouring the document's own encoding declaration) instead of
# depending on the platform's default text encoding.
with open(filepath, "rb") as f:
    soup = bs(f, "lxml-xml")

artikel_/APA/1986/APA_19860222.xml


### Extracting the article headers according to [Doc Attributes](https://amc.acdh.oeaw.ac.at/dokumentation/korpusinhalt-attribute/#doc)

In [7]:
# Collect the attributes of every <doc> element of the parsed file into a
# table: one row per <doc>, one column per attribute name in list_keys.
headers = soup.find_all('doc')
dict_headers = defaultdict(list)
list_keys = ['id',
             'datum_full',
             'datum',
             'year',
             'yymm',
             'bibl',
             'mediatype',
             'docsrc',
             'docsrc_name',
             'region',
             'province',
             'ressort2',
             'autor', 
             'mutation',
             'deskriptor',
             'keys',             
             'tokens',
             'wordcount',
             'dupl']

for header in headers:
    for key in list_keys:
        # Tag.get yields None when the attribute is absent, so every column
        # receives exactly one entry per <doc> and all lists stay aligned.
        dict_headers[key].append(header.get(key))

# Passing columns= keeps the full column set (in list_keys order) even when
# the file contains no <doc> element and dict_headers is therefore empty.
df_headers = pd.DataFrame(dict_headers, columns=list_keys)
df_headers.head()

Unnamed: 0,id,datum_full,datum,year,yymm,bibl,mediatype,docsrc,docsrc_name,region,province,ressort2,autor,mutation,deskriptor,keys,tokens,wordcount,dupl
0,APA_19860222_APA0001,1986-02-22T00:00:00Z,1986-02-22,,,APA-Meldungen digital vom 1986-02-22,print,APA,APA-Meldungen digital,agesamt,,chronik inland,cm,,,JA_1986 AG_APA RS_CI RS_C DA_19860222 MO_19860...,163,,
1,APA_19860222_APA0002,1986-02-22T00:00:00Z,1986-02-22,,,APA-Meldungen digital vom 1986-02-22,print,APA,APA-Meldungen digital,agesamt,,ausland inland medien sport,ro je ru,,,JA_1986 AG_APA RS_II RS_I DA_19860222 MO_19860...,153,,
2,APA_19860222_APA0003,1986-02-22T00:00:00Z,1986-02-22,,,APA-Meldungen digital vom 1986-02-22,print,APA,APA-Meldungen digital,agesamt,,inland,ro ko ru,,,JA_1986 AG_APA RS_II RS_I DA_19860222 MO_19860...,241,,
3,APA_19860222_APA0004,1986-02-22T00:00:00Z,1986-02-22,,,APA-Meldungen digital vom 1986-02-22,print,APA,APA-Meldungen digital,agesamt,,ausland,ti bf,,,JA_1986 AG_APA RS_AA RS_A DA_19860222 MO_19860...,199,,
4,APA_19860222_APA0005,1986-02-22T00:00:00Z,1986-02-22,,,APA-Meldungen digital vom 1986-02-22,print,APA,APA-Meldungen digital,agesamt,,ausland,ti bf,,,JA_1986 AG_APA RS_AA RS_A DA_19860222 MO_19860...,105,,


In [8]:
# Summarise the header table: non-null count and dtype of every attribute column.
df_headers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           200 non-null    object
 1   datum_full   200 non-null    object
 2   datum        200 non-null    object
 3   year         0 non-null      object
 4   yymm         0 non-null      object
 5   bibl         200 non-null    object
 6   mediatype    200 non-null    object
 7   docsrc       200 non-null    object
 8   docsrc_name  200 non-null    object
 9   region       200 non-null    object
 10  province     0 non-null      object
 11  ressort2     200 non-null    object
 12  autor        128 non-null    object
 13  mutation     0 non-null      object
 14  deskriptor   0 non-null      object
 15  keys         200 non-null    object
 16  tokens       200 non-null    object
 17  wordcount    0 non-null      object
 18  dupl         0 non-null      object
dtypes: object(19)
memory usage: 2

### Extracting the article fields according to [Token Attributes](https://amc.acdh.oeaw.ac.at/dokumentation/korpusinhalt-attribute/#token)

In [28]:
# Build a token-level table: one row per token line of every sentence (<s>)
# inside every <field> of every <doc>.  Each token line holds tab-separated
# attributes; the position of the <doc> in the file and the field name are
# prepended to them.
lines = []
headers = soup.find_all('doc')
for idx, header in enumerate(headers):
    fields = header.find_all('field')
    for field in fields:
        name = field['name']
        for satz in field.find_all('s'):
            text = satz.text.replace('<g/>', '')
            # One token per line.  Blank lines (before/after the sentence body
            # or left behind by a removed '<g/>' marker) carry no token and are
            # skipped; filtering them, rather than slicing off the first and
            # last element, keeps every token even when the text is not
            # wrapped in newlines.
            rows = [row for row in text.split('\n') if row.strip()]
            for row in rows:
                list_attribs = row.split('\t')
                lines.append([idx, name] + list_attribs)


# Column names in the order in which the attributes appear on a token line.
df_fields = pd.DataFrame.from_records(lines, columns=['id_article',
                                                      'wert',
                                                      'word',
                                                      'lc',
                                                      'ix',
                                                      'posUDS',
                                                      'posTT',
                                                      'ner',
                                                      'iob',
                                                      'ixNP',
                                                      'dep',
                                                      'ixDEP',
                                                      'neg',
                                                      'lemma',
                                                      'pos',
                                                      'posTI',
                                                      'lempos',
                                                      'lemmasource',
                                                      #'posbase',   ### these fields are in the documentation but not in the verticals
                                                      #'lemma_lc',
                                                      #'posx',
                                                     ])

# Reorder the columns for readability (word forms, POS tags, then indices).
df_fields = df_fields.reindex(columns=['id_article','wert', 
                                       'word', 'lc', 'lemma', 'lempos', 
                                       'pos', 'posTI', 'posUDS', 'posTT', 
                                       'dep', 'ner', 'ix', 'ixNP', 'ixDEP', 'iob', 
                                       'neg', 'lemmasource'])

df_fields['ix'] = pd.to_numeric(df_fields['ix']).astype(int)
# ixDEP ends up as a plain integer column, which cannot hold NaN.  It is
# therefore parsed strictly (like ix): a malformed value raises a parsing
# error that names the offending string, instead of being coerced to NaN and
# then failing in the integer cast.
df_fields['ixDEP'] = pd.to_numeric(df_fields['ixDEP']).astype(int)
# ixNP may be non-numeric for some tokens; those values become NaN, which is
# why this column stays floating point.
df_fields['ixNP'] = pd.to_numeric(df_fields['ixNP'], errors='coerce')
df_fields.head(10)

Unnamed: 0,id_article,wert,word,lc,lemma,lempos,pos,posTI,posUDS,posTT,dep,ner,ix,ixNP,ixDEP,iob,neg,lemmasource
0,0,stichwort,Bunt,bunt,-,bunt-j,ADJD.Pos,ADJD,PROPN,NE,ROOT,ORG,0,0.0,0,B,-,tt
1,0,stichwort,Schweden,schweden,-,Schweden-n,N.Reg.Nom.Pl.Masc,NE,PROPN,NE,ROOT,LOC,0,0.0,0,B,-,tt
2,0,titel,Konkurrenz,konkurrenz,Konkurrenz,Konkurrenz-n,N.Reg.*.Sg.Fem,NN,NOUN,NN,ROOT,,0,0.0,0,O,-,tt
3,0,titel,für,für,-,für-i,APPR.Acc,APPR,ADP,APPR,mnr,,1,,0,O,-,tt
4,0,titel,den,den,-,die-x,ART.Def.Acc.Sg.Masc,ART,DET,ART,nk,,2,1.0,3,O,-,tt
5,0,titel,Vasa-Lauf,vasa-lauf,-,Vasa-Lauf-n,N.Reg.Acc.Sg.Masc,NN,NOUN,NN,nk,,3,1.0,1,O,-,a-b
6,0,inhalt,Utl,utl,-,Utl-j,ADJD.Pos,NE,NOUN,NN,ROOT,,0,0.0,0,O,-,u
7,0,inhalt,.,.,-,.-x,SYM.Pun.Sent,$.,PUNCT,$.,punct,,1,,0,O,-,tt
8,0,inhalt,:,:,-,:-x,SYM.Pun.Colon,$.,PUNCT,$.,ROOT,,2,,2,O,-,tt
9,0,inhalt,Langlaufwettbewerb,langlaufwettbewerb,-,Langlaufwettbewerb-n,N.Reg.Nom.Sg.Masc,NN,NOUN,NN,ROOT,,3,0.0,3,O,-,sU_8


In [29]:
# Summarise the token table: non-null count and dtype of every column.
df_fields.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43883 entries, 0 to 43882
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id_article   43883 non-null  int64  
 1   wert         43883 non-null  object 
 2   word         43883 non-null  object 
 3   lc           43883 non-null  object 
 4   lemma        43883 non-null  object 
 5   lempos       43883 non-null  object 
 6   pos          43883 non-null  object 
 7   posTI        43883 non-null  object 
 8   posUDS       43883 non-null  object 
 9   posTT        43883 non-null  object 
 10  dep          43883 non-null  object 
 11  ner          43883 non-null  object 
 12  ix           43883 non-null  int64  
 13  ixNP         11848 non-null  float64
 14  ixDEP        43883 non-null  int64  
 15  iob          43883 non-null  object 
 16  neg          43883 non-null  object 
 17  lemmasource  43883 non-null  object 
dtypes: float64(1), int64(3), object(14)
memory usa

### Displaying the first ten articles

In [127]:
# Rebuild and print a readable version of the first ten articles from the
# token table: title, subtitle, keywords and body text.
for idx in range(10):
    # Select the article's tokens once instead of re-filtering the whole frame
    # for every field.
    article = df_fields[df_fields.id_article == idx]
    title = ' '.join(article.loc[article.wert == 'titel', 'word'])
    keywords = ', '.join(article.loc[article.wert == 'stichwort', 'word'])
    text = ' '.join(article.loc[article.wert == 'inhalt', 'word'])

    text = re.sub(r'\s+([?!.,;:)])', r'\1', text)  # delete space(s) before: ?!.,;:)
    text = re.sub(r'([(])\s+', r'\1', text)        # delete space(s) after: (
    text = text.replace('*', '')                   # delete: *

    # A body of the form "Utl.: <subtitle> = <text>" carries a subtitle.  The
    # '=' separator is searched only after the 'Utl.' marker, and the text is
    # split only when both are present, so a missing '=' no longer truncates
    # the text or produces a bogus subtitle.
    utl_pos = text.find('Utl.')
    sep_pos = text.find('=', utl_pos) if utl_pos != -1 else -1
    if sep_pos != -1:
        subtitle = text[utl_pos + 6 : sep_pos]   # skip the 6 characters of "Utl.: "
        text = text[sep_pos + 2:]                # skip "= "
    else:
        subtitle = "None"

    print(f'Title: {title}\n\nSubtitle: {subtitle}\n\nKeywords: {keywords}\n\nText:\n{text}')
    print('\n-------------------------------\n')

Title: Konkurrenz für den Vasa-Lauf

Subtitle: Langlaufwettbewerb auf den Spuren des Schwedenkönigs in Lindholmen 

Keywords: Bunt, Schweden

Text:
Stockholm (APA) - Der weltgrößte Skiwettbewerb, der Vasa-Lauf, der alljährlich Anfang März auf der 85,8 Kilometer langen Strecke von Mora nach Saelen im schwedischen Dalarna ausgetragen wird, hat einen Konkurrenten gefunden: Erstmals wurde in diesem Jahr im Februar in Lindholmen nördlich von Stockholm ein Langlaufwettbewerb zum Gedenken an die Flucht des schwedischen Königs Gustav Vasa nach Norwegen im Jahr 1521 veranstaltet.  Die Teilnehmer an dem Gedenklauf, von dem die ostdeutsche Nachrichtenagentur adn berichtet, benutzten nach historischen Vorbildern angefertigte, aus Bambus geflochtene " Brettln ". Mit solchen Skiern soll der König seinerzeit ins Nachbarland geflohen sein. Die geschichtliche Bedeutung des Laufes unterstreichen die Lindholmener auch mit der These, der König sei in ihrer Stadt geboren. Außer einem Steinhaufen, den Reste