### Importing packages

In [1]:
! pip install -U pandas lxml networkx

Requirement already up-to-date: pandas in /opt/conda/lib/python3.7/site-packages (1.0.3)
Requirement already up-to-date: lxml in /opt/conda/lib/python3.7/site-packages (4.5.0)
Requirement already up-to-date: networkx in /opt/conda/lib/python3.7/site-packages (2.4)


In [2]:
import os
import sys
import re
import pickle
import json

from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt

import xml.etree.ElementTree as ET
from lxml import etree
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString

#sys.path.insert(1, '../scripts/utils/')
#from text_filter import spacify_soup

### Retrieving pickle

In [3]:
# Load the DataFrame stored in df_files.pkl.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df_files = pd.read_pickle('../outputs/df_files.pkl')

In [4]:
# Preview the first rows of df_files.
df_files.head()

Unnamed: 0,path,name,extension,size,folder,depth,parent,uid,main,source,year,date
0,artikel_/APA/1986/APA_19860220.xml,APA_19860220,xml,6142251,False,4,5,1005,artikel_,APA,1986,1986-02-20
1,artikel_/APA/1986/APA_19860102.xml,APA_19860102,xml,3397140,False,4,5,1005,artikel_,APA,1986,1986-01-02
2,artikel_/APA/1986/APA_19860222.xml,APA_19860222,xml,3352934,False,4,5,1005,artikel_,APA,1986,1986-02-22
3,artikel_/APA/1986/APA_19860103.xml,APA_19860103,xml,3794819,False,4,5,1005,artikel_,APA,1986,1986-01-03
4,artikel_/APA/1986/APA_19860221.xml,APA_19860221,xml,5763730,False,4,5,1005,artikel_,APA,1986,1986-02-21


In [5]:
# Column dtypes, non-null counts and memory footprint of df_files.
df_files.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175600 entries, 0 to 176739
Data columns (total 12 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   path       175600 non-null  object        
 1   name       175600 non-null  object        
 2   extension  175600 non-null  object        
 3   size       175600 non-null  int64         
 4   folder     175600 non-null  bool          
 5   depth      175600 non-null  int64         
 6   parent     175600 non-null  int64         
 7   uid        175600 non-null  int64         
 8   main       175600 non-null  object        
 9   source     171457 non-null  object        
 10  year       171457 non-null  period[A-DEC] 
 11  date       175600 non-null  datetime64[ns]
dtypes: bool(1), datetime64[ns](1), int64(4), object(5), period[A-DEC](1)
memory usage: 16.2+ MB


### Examining a single file:

In [6]:
# Directory that the relative entries of df_files['path'] are joined onto.
# NOTE(review): absolute, machine-specific path — consider moving it to a
# configurable constant so the notebook runs on other machines.
path = '/home/jovyan/shared/C_amc_141/R_amc_3.1_12921/203_vert_spacy_rftt/'
# Index label (not position) of the df_files row to inspect.
idx = 2
print(df_files.loc[idx, 'path'])
filepath = os.path.join(path, df_files.loc[idx, 'path'])


# Parsing with Beautiful Soup
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the corpus.
with open(filepath, "r", encoding="utf-8") as f:
    soup = bs(f, "lxml-xml")

artikel_/APA/1986/APA_19860222.xml


### Extracting the article headers according to [Doc Attributes](https://amc.acdh.oeaw.ac.at/dokumentation/korpusinhalt-attribute/#doc)

In [7]:
# Collect the attributes of every <doc> element into one DataFrame row each.
headers = soup.find_all('doc')

# Attribute names to extract; this is also the column order of df_headers.
list_keys = ['id',
             'datum_full',
             'datum',
             'year',
             'yymm',
             'bibl',
             'mediatype',
             'docsrc',
             'docsrc_name',
             'region',
             'province',
             'ressort2',
             'autor',
             'mutation',
             'deskriptor',
             'keys',
             'tokens',
             'wordcount',
             'dupl']

# One list per attribute, aligned with `headers`. Tag.get() returns None when
# a <doc> element lacks the attribute. Every key is assigned up front, so the
# frame keeps all columns even when the file contains no <doc> element.
dict_headers = defaultdict(list)
for key in list_keys:
    dict_headers[key] = [header.get(key) for header in headers]

df_headers = pd.DataFrame(dict_headers)
df_headers.head()

Unnamed: 0,id,datum_full,datum,year,yymm,bibl,mediatype,docsrc,docsrc_name,region,province,ressort2,autor,mutation,deskriptor,keys,tokens,wordcount,dupl
0,APA_19860222_APA0001,1986-02-22T00:00:00Z,1986-02-22,,,APA-Meldungen digital vom 1986-02-22,print,APA,APA-Meldungen digital,agesamt,,chronik inland,cm,,,JA_1986 AG_APA RS_CI RS_C DA_19860222 MO_19860...,163,,
1,APA_19860222_APA0002,1986-02-22T00:00:00Z,1986-02-22,,,APA-Meldungen digital vom 1986-02-22,print,APA,APA-Meldungen digital,agesamt,,ausland inland medien sport,ro je ru,,,JA_1986 AG_APA RS_II RS_I DA_19860222 MO_19860...,153,,
2,APA_19860222_APA0003,1986-02-22T00:00:00Z,1986-02-22,,,APA-Meldungen digital vom 1986-02-22,print,APA,APA-Meldungen digital,agesamt,,inland,ro ko ru,,,JA_1986 AG_APA RS_II RS_I DA_19860222 MO_19860...,241,,
3,APA_19860222_APA0004,1986-02-22T00:00:00Z,1986-02-22,,,APA-Meldungen digital vom 1986-02-22,print,APA,APA-Meldungen digital,agesamt,,ausland,ti bf,,,JA_1986 AG_APA RS_AA RS_A DA_19860222 MO_19860...,199,,
4,APA_19860222_APA0005,1986-02-22T00:00:00Z,1986-02-22,,,APA-Meldungen digital vom 1986-02-22,print,APA,APA-Meldungen digital,agesamt,,ausland,ti bf,,,JA_1986 AG_APA RS_AA RS_A DA_19860222 MO_19860...,105,,


In [8]:
# Column dtypes and non-null counts of df_headers (shows which attributes are never set).
df_headers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           200 non-null    object
 1   datum_full   200 non-null    object
 2   datum        200 non-null    object
 3   year         0 non-null      object
 4   yymm         0 non-null      object
 5   bibl         200 non-null    object
 6   mediatype    200 non-null    object
 7   docsrc       200 non-null    object
 8   docsrc_name  200 non-null    object
 9   region       200 non-null    object
 10  province     0 non-null      object
 11  ressort2     200 non-null    object
 12  autor        128 non-null    object
 13  mutation     0 non-null      object
 14  deskriptor   0 non-null      object
 15  keys         200 non-null    object
 16  tokens       200 non-null    object
 17  wordcount    0 non-null      object
 18  dupl         0 non-null      object
dtypes: object(19)
memory usage: 2

### Extracting the article fields according to [Token Attributes](https://amc.acdh.oeaw.ac.at/dokumentation/korpusinhalt-attribute/#token)

In [9]:
# Flatten every token row of every <s> element inside every <field> element
# into one record: the field's name followed by the tab-separated attributes.
lines = []
fields = soup.find_all('field')
for field in fields:
    name = field['name']
    for satz in field.find_all('s'):
        # Drop literal '<g/>' markers and collapse runs of newlines so that
        # each remaining line is exactly one token row.
        text = satz.text.replace('<g/>', '')
        text = re.sub(r'\n+', '\n', text)
        # The first and last pieces of the split are discarded.
        # NOTE(review): assumes the text of every <s> starts and ends with a
        # newline, so those two pieces are empty — confirm.
        rows = text.split('\n')[1:-1]
        for row in rows:
            list_attribs = row.split('\t')
            list_attribs.insert(0, name)
            lines.append(list_attribs)


# Column names in the order the values appear in a record: the field name
# ('wert') first, then the 16 tab-separated token attributes.
# 'posbase', 'lemma_lc' and 'posx' are in the documentation but not in the verticals.
df_fields = pd.DataFrame.from_records(lines, columns=['wert',
                                                      'word',
                                                      'lc',
                                                      'ix',
                                                      'posUDS',
                                                      'posTT',
                                                      'ner',
                                                      'iob',
                                                      'ixNP',
                                                      'dep',
                                                      'ixDEP',
                                                      'neg',
                                                      'lemma',
                                                      'pos',
                                                      'posTI',
                                                      'lempos',
                                                      'lemmasource',
                                                     ])

# Pure column reordering; no column is added or dropped.
df_fields = df_fields.reindex(columns=['wert', 'word', 'lc', 'lemma', 'lempos',
                                       'pos', 'posTI','posUDS', 'posTT', 'dep', 'ner',
                                       'ix', 'ixNP', 'ixDEP', 'iob','neg',  'lemmasource'])

# 'ix' and 'ixDEP' are cast to plain int, which cannot hold NaN. They are
# therefore parsed strictly: an unparseable value is reported by to_numeric
# itself rather than being coerced to NaN and failing in the cast afterwards.
df_fields['ix'] = pd.to_numeric(df_fields['ix']).astype(int)
df_fields['ixDEP'] = pd.to_numeric(df_fields['ixDEP']).astype(int)
# 'ixNP' stays float: unparseable values are coerced to NaN and kept as missing.
df_fields['ixNP'] = pd.to_numeric(df_fields['ixNP'], errors='coerce')
df_fields.head(20)

Unnamed: 0,wert,word,lc,lemma,lempos,pos,posTI,posUDS,posTT,dep,ner,ix,ixNP,ixDEP,iob,neg,lemmasource
0,stichwort,Bunt,bunt,-,bunt-j,ADJD.Pos,ADJD,PROPN,NE,ROOT,ORG,0,0.0,0,B,-,tt
1,stichwort,Schweden,schweden,-,Schweden-n,N.Reg.Nom.Pl.Masc,NE,PROPN,NE,ROOT,LOC,0,0.0,0,B,-,tt
2,titel,Konkurrenz,konkurrenz,Konkurrenz,Konkurrenz-n,N.Reg.*.Sg.Fem,NN,NOUN,NN,ROOT,,0,0.0,0,O,-,tt
3,titel,für,für,-,für-i,APPR.Acc,APPR,ADP,APPR,mnr,,1,,0,O,-,tt
4,titel,den,den,-,die-x,ART.Def.Acc.Sg.Masc,ART,DET,ART,nk,,2,1.0,3,O,-,tt
5,titel,Vasa-Lauf,vasa-lauf,-,Vasa-Lauf-n,N.Reg.Acc.Sg.Masc,NN,NOUN,NN,nk,,3,1.0,1,O,-,a-b
6,inhalt,Utl,utl,-,Utl-j,ADJD.Pos,NE,NOUN,NN,ROOT,,0,0.0,0,O,-,u
7,inhalt,.,.,-,.-x,SYM.Pun.Sent,$.,PUNCT,$.,punct,,1,,0,O,-,tt
8,inhalt,:,:,-,:-x,SYM.Pun.Colon,$.,PUNCT,$.,ROOT,,2,,2,O,-,tt
9,inhalt,Langlaufwettbewerb,langlaufwettbewerb,-,Langlaufwettbewerb-n,N.Reg.Nom.Sg.Masc,NN,NOUN,NN,ROOT,,3,0.0,3,O,-,sU_8


In [10]:
# Column dtypes and non-null counts of df_fields after the numeric conversions.
df_fields.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43883 entries, 0 to 43882
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   wert         43883 non-null  object 
 1   word         43883 non-null  object 
 2   lc           43883 non-null  object 
 3   lemma        43883 non-null  object 
 4   lempos       43883 non-null  object 
 5   pos          43883 non-null  object 
 6   posTI        43883 non-null  object 
 7   posUDS       43883 non-null  object 
 8   posTT        43883 non-null  object 
 9   dep          43883 non-null  object 
 10  ner          43883 non-null  object 
 11  ix           43883 non-null  int64  
 12  ixNP         11848 non-null  float64
 13  ixDEP        43883 non-null  int64  
 14  iob          43883 non-null  object 
 15  neg          43883 non-null  object 
 16  lemmasource  43883 non-null  object 
dtypes: float64(1), int64(2), object(14)
memory usage: 5.7+ MB
