### Importing packages

In [1]:
! pip install -U pandas lxml

Requirement already up-to-date: pandas in /opt/conda/lib/python3.7/site-packages (1.0.3)
Requirement already up-to-date: lxml in /opt/conda/lib/python3.7/site-packages (4.5.0)


In [2]:
import os
import sys
import re
import pickle
import json

from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt

import xml.etree.ElementTree as ET
from lxml import etree
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString

#sys.path.insert(1, '../scripts/utils/')
#from text_filter import spacify_soup

### Retrieving pickle

In [3]:
df_files = pd.read_pickle('../outputs/df_files.pkl')

In [4]:
df_files.head()

Unnamed: 0,path,name,extension,size,folder,depth,parent,uid,main,source,year,date
0,artikel_/APA/1986/APA_19860220.xml,APA_19860220,xml,6142251,False,4,5,1005,artikel_,APA,1986,1986-02-20
1,artikel_/APA/1986/APA_19860102.xml,APA_19860102,xml,3397140,False,4,5,1005,artikel_,APA,1986,1986-01-02
2,artikel_/APA/1986/APA_19860222.xml,APA_19860222,xml,3352934,False,4,5,1005,artikel_,APA,1986,1986-02-22
3,artikel_/APA/1986/APA_19860103.xml,APA_19860103,xml,3794819,False,4,5,1005,artikel_,APA,1986,1986-01-03
4,artikel_/APA/1986/APA_19860221.xml,APA_19860221,xml,5763730,False,4,5,1005,artikel_,APA,1986,1986-02-21


In [5]:
df_files.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175600 entries, 0 to 176739
Data columns (total 12 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   path       175600 non-null  object        
 1   name       175600 non-null  object        
 2   extension  175600 non-null  object        
 3   size       175600 non-null  int64         
 4   folder     175600 non-null  bool          
 5   depth      175600 non-null  int64         
 6   parent     175600 non-null  int64         
 7   uid        175600 non-null  int64         
 8   main       175600 non-null  object        
 9   source     171457 non-null  object        
 10  year       171457 non-null  period[A-DEC] 
 11  date       175600 non-null  datetime64[ns]
dtypes: bool(1), datetime64[ns](1), int64(4), object(5), period[A-DEC](1)
memory usage: 16.2+ MB


### Examining a single file:

In [6]:
path = '/home/jovyan/shared/C_amc_141/R_amc_3.1_12921/203_vert_spacy_rftt/'
idx = 0
print(df_files.loc[idx, 'path'])
filepath = os.path.join(path, df_files.loc[idx, 'path'])


# Parsing with Beautiful Soup
with open(filepath, "r") as f:
    soup = bs(f, "lxml-xml")

artikel_/APA/1986/APA_19860220.xml


### Extracting the article headers according to [Doc Attributes](https://amc.acdh.oeaw.ac.at/dokumentation/korpusinhalt-attribute/#doc)

In [7]:
headers = soup.find_all('doc')
dict_headers = defaultdict(list)
list_keys = ['id',
             'datum_full',
             'datum',
             'year',
             'yymm',
             'bibl',
             'mediatype',
             'docsrc',
             'docsrc_name',
             'region',
             'province',
             'ressort2',
             'autor', 
             'mutation',
             'deskriptor',
             'keys',             
             'tokens',
             'wordcount',
             'dupl']

for header in headers:
    for key in list_keys:
        if key in header.attrs.keys():
            dict_headers[key].append(header[key])
        else:
            dict_headers[key].append(None)
df_headers = pd.DataFrame(dict_headers)
df_headers.head()

Unnamed: 0,id,datum_full,datum,year,yymm,bibl,mediatype,docsrc,docsrc_name,region,province,ressort2,autor,mutation,deskriptor,keys,tokens,wordcount,dupl
0,APA_19860220_APA0001,1986-02-20T00:00:00Z,1986-02-20,,,APA-Meldungen digital vom 1986-02-20,print,APA,APA-Meldungen digital,agesamt,,ausland chronik,wm,,,JA_1986 AG_APA RS_CA RS_C DA_19860220 MO_19860...,292,,
1,APA_19860220_APA0002,1986-02-20T00:00:00Z,1986-02-20,,,APA-Meldungen digital vom 1986-02-20,print,APA,APA-Meldungen digital,agesamt,,ausland kultur,wm,,,JA_1986 AG_APA RS_KA RS_K DA_19860220 MO_19860...,322,,
2,APA_19860220_APA0003,1986-02-20T00:00:00Z,1986-02-20,,,APA-Meldungen digital vom 1986-02-20,print,APA,APA-Meldungen digital,agesamt,,ausland kultur,wm,,,JA_1986 AG_APA RS_KA RS_K DA_19860220 MO_19860...,332,,
3,APA_19860220_APA0004,1986-02-20T00:00:00Z,1986-02-20,,,APA-Meldungen digital vom 1986-02-20,print,APA,APA-Meldungen digital,agesamt,,ausland inland medien sport,hs,,,JA_1986 AG_APA RS_AA RS_A DA_19860220 MO_19860...,146,,
4,APA_19860220_APA0005,1986-02-20T00:00:00Z,1986-02-20,,,APA-Meldungen digital vom 1986-02-20,print,APA,APA-Meldungen digital,agesamt,,ausland,awi hr,,,JA_1986 AG_APA RS_AA RS_A DA_19860220 MO_19860...,168,,


In [8]:
df_headers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 326 entries, 0 to 325
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           326 non-null    object
 1   datum_full   326 non-null    object
 2   datum        326 non-null    object
 3   year         0 non-null      object
 4   yymm         0 non-null      object
 5   bibl         326 non-null    object
 6   mediatype    326 non-null    object
 7   docsrc       326 non-null    object
 8   docsrc_name  326 non-null    object
 9   region       326 non-null    object
 10  province     0 non-null      object
 11  ressort2     326 non-null    object
 12  autor        252 non-null    object
 13  mutation     0 non-null      object
 14  deskriptor   0 non-null      object
 15  keys         326 non-null    object
 16  tokens       326 non-null    object
 17  wordcount    0 non-null      object
 18  dupl         1 non-null      object
dtypes: object(19)
memory usage: 4

### Extracting the article fields according to [Token Attributes](https://amc.acdh.oeaw.ac.at/dokumentation/korpusinhalt-attribute/#token)

In [9]:
lines = []
headers = soup.find_all('doc')
for idx, header in enumerate(headers):
    fields = header.find_all('field')
    for field in fields:
        name = field['name']
        for satz in field.find_all('s'):
            text = re.sub('<g/>','',satz.text)
            text = re.sub(r'[\n]+','\n',text)
            rows = re.split('\n',text)[1:-1]
            for row in rows:
                list_attribs = re.split('\t',row)
                lines.append([idx, name] + list_attribs)
    
    
df_fields = pd.DataFrame.from_records(lines, columns=['id_article',
                                                      'wert',
                                                      'word',
                                                      'lc',
                                                      'ix',
                                                      'posUDS',
                                                      'posTT',
                                                      'ner',
                                                      'iob',
                                                      'ixNP',
                                                      'dep',
                                                      'ixDEP',
                                                      'neg',
                                                      'lemma',
                                                      'pos',
                                                      'posTI',
                                                      'lempos',
                                                      'lemmasource',
                                                     ])

df_fields = df_fields.reindex(columns=['id_article','wert', 
                                       'word', 'lc', 'lemma', 'lempos', 
                                       'pos', 'posTI', 'posUDS', 'posTT', 
                                       'dep', 'ner', 'ix', 'ixNP', 'ixDEP', 'iob', 
                                       'neg', 'lemmasource'])

df_fields['ix'] = pd.to_numeric(df_fields['ix']).astype(int)
df_fields['ixDEP'] = pd.to_numeric(df_fields['ixDEP'], errors='coerce').astype(int)
df_fields['ixNP'] = pd.to_numeric(df_fields['ixNP'], errors='coerce')
df_fields.head(10)

Unnamed: 0,id_article,wert,word,lc,lemma,lempos,pos,posTI,posUDS,posTT,dep,ner,ix,ixNP,ixDEP,iob,neg,lemmasource
0,0,stichwort,Umwelt,umwelt,Umwelt,Umwelt-n,N.Reg.Nom.Sg.Fem,NN,NOUN,NN,ROOT,,0,0.0,0,O,-,tt
1,0,stichwort,USA,usa,-,USA-n,N.Name.Gen.Sg.*,NE,PROPN,NE,ROOT,LOC,0,0.0,0,B,-,tt
2,0,stichwort,F,f,-,F-n,N.Reg.Nom.Sg.Neut,NN,X,FM,ROOT,,0,,0,O,-,tt
3,0,stichwort,e,e,-,e-n,FM,ADJA,X,FM,uc,,1,,0,O,-,u
4,0,stichwort,a,a,-,a-x,FM,FM,X,FM,uc,,2,,0,O,-,tt
5,0,stichwort,t,t,-,t-n,FM,NN,X,FM,uc,,3,,0,O,-,tt
6,0,stichwort,u,u,-,U-n,N.Name.Nom.Sg.Neut,ADJA,X,FM,uc,,4,,0,O,-,d
7,0,stichwort,r,r,-,R-n,N.Name.Nom.Sg.Neut,ADJA,X,FM,uc,,5,,0,O,-,d
8,0,stichwort,e,e,-,E-n,N.Name.*.*.*,NN,X,FM,uc,,6,,0,O,-,tt
9,0,titel,Geht,geht,gehen,gehen-v,VFIN.Full.3.Sg.Pres.Ind,VVFIN,VERB,VVFIN,ROOT,,0,,0,O,-,tt


In [10]:
df_fields.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79197 entries, 0 to 79196
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id_article   79197 non-null  int64  
 1   wert         79197 non-null  object 
 2   word         79197 non-null  object 
 3   lc           79197 non-null  object 
 4   lemma        79197 non-null  object 
 5   lempos       79197 non-null  object 
 6   pos          79197 non-null  object 
 7   posTI        79197 non-null  object 
 8   posUDS       79197 non-null  object 
 9   posTT        79197 non-null  object 
 10  dep          79197 non-null  object 
 11  ner          79197 non-null  object 
 12  ix           79197 non-null  int64  
 13  ixNP         20973 non-null  float64
 14  ixDEP        79197 non-null  int64  
 15  iob          79197 non-null  object 
 16  neg          79197 non-null  object 
 17  lemmasource  79197 non-null  object 
dtypes: float64(1), int64(3), object(14)
memory usa

### Displaying the first ten articles

In [13]:
for idx in range(0,50):
    # parsing title, keywords and text
    title = ' '.join(df_fields[(df_fields.id_article == idx) & (df_fields.wert == 'titel')]['word'])
    keywords = ', '.join(df_fields[(df_fields.id_article == idx) & (df_fields.wert == 'stichwort')]['word'])
    text = ' '.join(df_fields[(df_fields.id_article == idx) & (df_fields.wert == 'inhalt')]['word'])

    # delete space(s) before: ?!.,;:)
    text = re.sub(r'\s+([?!.,;:)])', r'\1', text)
    # delete space(s) after: (
    text = re.sub(r'([(])\s+', r'\1', text)
    # delete: *
    text = re.sub('\*', '', text)        

    # identifying and extracting subtitle, if available
    if text.find('Utl.') != -1:
        subtitle = text[text.find('Utl.')+6 : text.find('=')]
        text = text[text.find('=')+2:]
    else:
        subtitle = "None"
    
    # identifying and extracting place and news agency
    place_agency = re.findall(r'[\w\s]+\([\w\s\/]+\)[\s-]+', text, re.IGNORECASE)
    if place_agency:
        pos = place_agency[0].find('(')
        place = place_agency[0][:pos].strip()
        agency = place_agency[0][pos:].strip('-() ')
        text = text[len(place_agency[0]):]
    else:
        place = "None"
        agency = "None"
        
    # identifying and extracting (Schluß),(Forts.),(Fortsetzung),(Forts. mögl.), (Forts. mgl.)
    final_notes = re.findall(r'\(Schluß\)|\(Fort[^\n]+\)', text, re.IGNORECASE)
    if final_notes:
        pos = text.find(final_notes[0])
        final_note = text[pos:]
        text = text[:pos]
    else:
        final_note = 'None'
    
    # retrieving some fields from the header
    print(f"Header fields: {df_headers.loc[idx,['id', 'datum']].values}")
    print(f'Place: {place}\nAgency: {agency}')
    print(f'Title: {title}\nSubtitle: {subtitle}\nKeywords: {keywords}\n\nText:\n{text}\n\nFinal Notes: {final_note}')
    print('\n-------------------------------\n')

Header fields: ['APA_19860220_APA0001' '1986-02-20']
Place: New York
Agency: dpa
Title: Geht Amerika unter ?
Subtitle: Experten: Anstieg des Meeresspiegels bedroht die Küsten 
Keywords: Umwelt, USA, F, e, a, t, u, r, e

Text:
Steht den Amerikanern bald das Wasser bis zu Hals? Amerikanische Experten haben jetzt jedenfalls einen Bericht vorgelegt, in dem sie ausführlich auf Gefahren aufmerksam machen, die den Küsten der USA durch den Anstieg des Meeresspiegels drohen. Wie die Zeitung " New York Times " am Dienstag berichtete, rechnen einige Wissenschaftler damit, daß der Meeresspiegel schon in den nächsten drei bis vier Jahrzehnten bis zu 30 Zentimeter steigen könnte. Der in der vergangenen Woche abgeschlossene Expertenbericht entstand im Auftrag der National Academy of Sciences in Washington.  Bei sehr flachen Küstengebieten der USA wie am Golf von Mexiko und an Teilen des Atlantik-Ufers würde das einen Wassereinbruch von meh- reren Hundert Metern bedeuten. Bei der Besiedlung der betrof