### Importing packages

In [1]:
! pip install -Uq pandas lxml beautifulsoup4

In [2]:
import os
import sys
import re
import pickle
import json

from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt

import xml.etree.ElementTree as ET
from lxml import etree
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString

#sys.path.insert(1, '../scripts/utils/')
#from text_filter import spacify_soup

### Retrieving pickle

In [3]:
df_files = pd.read_pickle('../outputs/df_files.pkl')

In [4]:
df_files.head()

Unnamed: 0,path,name,extension,size,folder,depth,parent,uid,main,source,year,date
0,artikel_/APA/1986/APA_19860220.xml,APA_19860220,xml,6142251,False,4,5,1005,artikel_,APA,1986,1986-02-20
1,artikel_/APA/1986/APA_19860102.xml,APA_19860102,xml,3397140,False,4,5,1005,artikel_,APA,1986,1986-01-02
2,artikel_/APA/1986/APA_19860222.xml,APA_19860222,xml,3352934,False,4,5,1005,artikel_,APA,1986,1986-02-22
3,artikel_/APA/1986/APA_19860103.xml,APA_19860103,xml,3794819,False,4,5,1005,artikel_,APA,1986,1986-01-03
4,artikel_/APA/1986/APA_19860221.xml,APA_19860221,xml,5763730,False,4,5,1005,artikel_,APA,1986,1986-02-21


In [5]:
df_files.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175600 entries, 0 to 176739
Data columns (total 12 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   path       175600 non-null  object        
 1   name       175600 non-null  object        
 2   extension  175600 non-null  object        
 3   size       175600 non-null  int64         
 4   folder     175600 non-null  bool          
 5   depth      175600 non-null  int64         
 6   parent     175600 non-null  int64         
 7   uid        175600 non-null  int64         
 8   main       175600 non-null  object        
 9   source     171457 non-null  object        
 10  year       171457 non-null  period[A-DEC] 
 11  date       175600 non-null  datetime64[ns]
dtypes: bool(1), datetime64[ns](1), int64(4), object(5), period[A-DEC](1)
memory usage: 16.2+ MB


### Examining a single file:

In [6]:
path = '/home/jovyan/shared/C_amc_141/R_amc_3.1_12921/203_vert_spacy_rftt/'
idx = 100001
print(df_files.loc[idx, 'path'])
filepath = os.path.join(path, df_files.loc[idx, 'path'])


# Parsing with Beautiful Soup
with open(filepath, "r") as f:
    soup = bs(f, "lxml-xml")

artikel_/SN/2013/SN_20130220.xml


### Extracting the article headers according to [Doc Attributes](https://amc.acdh.oeaw.ac.at/dokumentation/korpusinhalt-attribute/#doc)

In [7]:
headers = soup.find_all('doc')
dict_headers = defaultdict(list)
list_keys = ['id',
             'datum_full',
             'datum',
             'year',
             'yymm',
             'bibl',
             'mediatype',
             'docsrc',
             'docsrc_name',
             'region',
             'province',
             'ressort2',
             'autor', 
             'mutation',
             'deskriptor',
             'keys',             
             'tokens',
             'wordcount',
             'dupl']

for header in headers:
    for key in list_keys:
        if key in header.attrs.keys():
            dict_headers[key].append(header[key])
        else:
            dict_headers[key].append(None)
df_headers = pd.DataFrame(dict_headers)
df_headers.head()

Unnamed: 0,id,datum_full,datum,year,yymm,bibl,mediatype,docsrc,docsrc_name,region,province,ressort2,autor,mutation,deskriptor,keys,tokens,wordcount,dupl
0,SN_20130220232132295910001,2013-02-20T00:00:00Z,2013-02-20,,,Salzburger Nachrichten 43 vom 2013-02-20 s. 1,print,SN,Salzburger Nachrichten,agesamt,,seite seite1,,Österreich,,SE_1 DA_20130220 MO_201302 JA_2013 RS_S1 DB_SN,338,,
1,SN_20130220232132295910002,2013-02-20T00:00:00Z,2013-02-20,,,Salzburger Nachrichten 43 vom 2013-02-20 s. 1,print,SN,Salzburger Nachrichten,agesamt,,seite seite1,,Österreich,,SE_1 DA_20130220 MO_201302 JA_2013 RS_S1 DB_SN,61,,
2,SN_20130220232132295910003,2013-02-20T00:00:00Z,2013-02-20,,,Salzburger Nachrichten 43 vom 2013-02-20 s. 1,print,SN,Salzburger Nachrichten,agesamt,,seite seite1,,Österreich,,SE_1 DA_20130220 MO_201302 JA_2013 RS_S1 DB_SN,73,,
3,SN_20130220232132295910004,2013-02-20T00:00:00Z,2013-02-20,,,Salzburger Nachrichten 43 vom 2013-02-20 s. 1,print,SN,Salzburger Nachrichten,agesamt,,seite seite1,,Österreich,,SE_1 DA_20130220 MO_201302 JA_2013 RS_S1 DB_SN,248,,
4,SN_20130220232132295910005,2013-02-20T00:00:00Z,2013-02-20,,,Salzburger Nachrichten 43 vom 2013-02-20 s. 1,print,SN,Salzburger Nachrichten,agesamt,,seite seite1,,Österreich,,SE_1 DA_20130220 MO_201302 JA_2013 RS_S1 DB_SN,55,,


In [8]:
df_headers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202 entries, 0 to 201
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           202 non-null    object
 1   datum_full   202 non-null    object
 2   datum        202 non-null    object
 3   year         0 non-null      object
 4   yymm         0 non-null      object
 5   bibl         202 non-null    object
 6   mediatype    202 non-null    object
 7   docsrc       202 non-null    object
 8   docsrc_name  202 non-null    object
 9   region       202 non-null    object
 10  province     0 non-null      object
 11  ressort2     202 non-null    object
 12  autor        34 non-null     object
 13  mutation     111 non-null    object
 14  deskriptor   52 non-null     object
 15  keys         202 non-null    object
 16  tokens       202 non-null    object
 17  wordcount    0 non-null      object
 18  dupl         6 non-null      object
dtypes: object(19)
memory usage: 3

### Extracting the article fields according to [Token Attributes](https://amc.acdh.oeaw.ac.at/dokumentation/korpusinhalt-attribute/#token)

In [9]:
lines = []
headers = soup.find_all('doc')
for idx, header in enumerate(headers):
    fields = header.find_all('field')
    for field in fields:
        name = field['name']
        for satz in field.find_all('s'):
            text = re.sub('<g/>','',satz.text)
            text = re.sub(r'[\n]+','\n',text)
            rows = re.split('\n',text)[1:-1]
            for row in rows:
                list_attribs = re.split('\t',row)
                lines.append([idx, name] + list_attribs)
    
    
df_fields = pd.DataFrame.from_records(lines, columns=['id_article',
                                                      'wert',
                                                      'word',
                                                      'lc',
                                                      'ix',
                                                      'posUDS',
                                                      'posTT',
                                                      'ner',
                                                      'iob',
                                                      'ixNP',
                                                      'dep',
                                                      'ixDEP',
                                                      'neg',
                                                      'lemma',
                                                      'pos',
                                                      'posTI',
                                                      'lempos',
                                                      'lemmasource',
                                                     ])

df_fields = df_fields.reindex(columns=['id_article','wert', 
                                       'word', 'lc', 'lemma', 'lempos', 
                                       'pos', 'posTI', 'posUDS', 'posTT', 
                                       'dep', 'ner', 'ix', 'ixNP', 'ixDEP', 'iob', 
                                       'neg', 'lemmasource'])

df_fields['ix'] = pd.to_numeric(df_fields['ix']).astype(int)
df_fields['ixDEP'] = pd.to_numeric(df_fields['ixDEP'], errors='coerce').astype(int)
df_fields['ixNP'] = pd.to_numeric(df_fields['ixNP'], errors='coerce')
df_fields.head(10)

Unnamed: 0,id_article,wert,word,lc,lemma,lempos,pos,posTI,posUDS,posTT,dep,ner,ix,ixNP,ixDEP,iob,neg,lemmasource
0,0,titel,Cyber-Angriffe,cyber-angriffe,-,Cyber-Angriff-n,N.Reg.Nom.Pl.Masc,NN,NOUN,NN,sb,,0,0.0,1,O,-,a-b
1,0,titel,bedrohen,bedrohen,bedrohen,bedrohen-v,VFIN.Full.3.Pl.Pres.Ind,VVFIN,VERB,VVFIN,ROOT,,1,,1,O,-,tt
2,0,titel,unsere,unsere,-,unser-p,PRO.Poss.Attr.-.Acc.Sg.Fem,PPOSAT,DET,PPOSAT,nk,,2,1.0,3,O,-,tt
3,0,titel,Sicherheit,sicherheit,Sicherheit,Sicherheit-n,N.Reg.Acc.Sg.Fem,NN,NOUN,NN,oa,,3,1.0,1,O,-,tt
4,0,inhalt,Schweigen,schweigen,Schweigen,Schweigen-n,N.Reg.Nom.Sg.Neut,NN,NOUN,NN,sb,,0,0.0,2,O,-,tt
5,0,inhalt,allein,allein,-,allein-r,ADV,ADV,ADV,ADV,mo,,1,,0,O,-,tt
6,0,inhalt,ist,ist,sein,sein-v,VFIN.Sein.3.Sg.Pres.Ind,VAFIN,AUX,VAFIN,ROOT,,2,,2,O,-,tt
7,0,inhalt,keine,keine,-,keine-p,PRO.Indef.Attr.-.Nom.Sg.Fem,PIAT,DET,PIAT,nk,,3,1.0,4,O,-,tt
8,0,inhalt,Alternative,alternative,Alternative,Alternative-n,N.Reg.Nom.Sg.Fem,NN,NOUN,NN,pd,,4,1.0,2,O,-,tt
9,0,inhalt,.,.,-,.-x,SYM.Pun.Sent,$.,PUNCT,$.,punct,,5,,2,O,-,tt


In [10]:
df_fields.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50019 entries, 0 to 50018
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id_article   50019 non-null  int64  
 1   wert         50019 non-null  object 
 2   word         50019 non-null  object 
 3   lc           50019 non-null  object 
 4   lemma        50019 non-null  object 
 5   lempos       50019 non-null  object 
 6   pos          50019 non-null  object 
 7   posTI        50019 non-null  object 
 8   posUDS       50019 non-null  object 
 9   posTT        50019 non-null  object 
 10  dep          50019 non-null  object 
 11  ner          50019 non-null  object 
 12  ix           50019 non-null  int64  
 13  ixNP         10624 non-null  float64
 14  ixDEP        50019 non-null  int64  
 15  iob          50019 non-null  object 
 16  neg          50019 non-null  object 
 17  lemmasource  50019 non-null  object 
dtypes: float64(1), int64(3), object(14)
memory usa

### Displaying the first ten articles

In [11]:
for idx in range(0,50):
    # parsing title, keywords and text
    title = ' '.join(df_fields[(df_fields.id_article == idx) & (df_fields.wert == 'titel')]['word'])
    keywords = ', '.join(df_fields[(df_fields.id_article == idx) & (df_fields.wert == 'stichwort')]['word'])
    text = ' '.join(df_fields[(df_fields.id_article == idx) & (df_fields.wert == 'inhalt')]['word'])

    # delete space(s) before: ?!.,;:)
    text = re.sub(r'\s+([?!.,;:)])', r'\1', text)
    # delete space(s) after: (
    text = re.sub(r'([(])\s+', r'\1', text)
    # delete: *
    text = re.sub('\*', '', text)        

    # identifying and extracting subtitle, if available
    if text.find('Utl.') != -1:
        subtitle = text[text.find('Utl.')+6 : text.find('=')]
        text = text[text.find('=')+2:]
    else:
        subtitle = "None"
    
    # identifying and extracting place and news agency
    place_agency_apa = re.compile(r'^[\w\s]+\([\w\s\/]+\)[\s-]+',re.IGNORECASE)
    place_agency = re.findall(place_agency_apa, text)
    if place_agency:
        pos = place_agency[0].find('(')
        place = place_agency[0][:pos].strip()
        agency = place_agency[0][pos:].strip('-() ')
        text = text[len(place_agency[0]):]
    else:
        place = "None"
        agency = "None"
        
    # identifying and extracting final notes 
    # e.g. (Schluß),(Forts.),(Fortsetzung),(Forts. mögl.), (Forts. mgl.)
    final_notes_apa = re.compile(r'\(Schluß\)|\(Fort[^\n]+\)',re.IGNORECASE)
    final_notes_sn = re.compile(r'(Bild[.:\/0-9A-Za-z, ]{0,30})$|(Seit[en ]*[.:\/0-9A-Za-z, ]{0,30})$')
    final_notes = re.findall(final_notes_sn, text)
    if final_notes:
        if isinstance(final_notes[0], tuple):
            final_notes[0] = final_notes[0][-1]
        pos = text.find(final_notes[0])
        final_note = text[pos:]
        text = text[:pos]
    else:
        final_note = 'None'
    
    # retrieving some fields from the header
    print(f"Header fields: {df_headers.loc[idx,['id', 'datum']].values}")
    print(f'Place: {place}\nAgency: {agency}')
    print(f'Title: {title}\nSubtitle: {subtitle}\nKeywords: {keywords}\n\nText:\n{text}\n\nFinal Notes: {final_note}')
    print('\n-------------------------------\n')

Header fields: ['SN_20130220232132295910001' '2013-02-20']
Place: None
Agency: None
Title: Cyber-Angriffe bedrohen unsere Sicherheit
Subtitle: None
Keywords: 

Text:
Schweigen allein ist keine Alternative. China muss wissen, dass Cyber-Attacken aus Schanghai nicht länger toleriert werden. Die jüngsten Erkenntnisse über die Urheber Dutzender Cyber-Attacken auf sensible Ziele in den USA sollten alle Alarmglocke läuten lassen. Angriffe auf lebenswichtige Infrastruktur, Staatsgeheimnisse und intellektuelles Eigentum von Unternehmen bedrohen unsere Sicherheit und unseren Wohlstand viel unmittelbarer als Bomben und Raketen. Die Schwelle für die Aggressoren scheint niedriger, die Konsequenzen wirken weniger folgenschwer. Umgekehrt erweist es sich sehr viel komplizierter, die Hacker dingfest zu machen. Erst recht, wenn es sich um staatlich unterstützte Cyber-Krieger handelt. Erstmals liegt nun in der Öffentlichkeit eine Studie vor, die eine Einheit der chinesischen Volksarmee konkret beschuldi