In [1]:
import pandas as pd
import requests
import unicodedata
from bs4 import BeautifulSoup as soup
from datetime import datetime

# Directory (relative to the notebook) from which the raw input spreadsheet is read
raw_path = '../raw'

In [20]:
# Column names assigned to the spreadsheet; every column is read as pandas 'string' dtype
spalten = ['signatur', 'nachname', 'vorname', 'geburt', 'tod', 'umfang']
df = pd.read_excel(
    f"{raw_path}/HABV_93_Ehrenamtskartei.xlsx",
    names=spalten,
    dtype=dict.fromkeys(spalten, 'string'),
)
# Alternative source (kept for reference):
#df = pd.read_csv(raw_path + "/ehrenamtskartei.csv", dtype={'signatur':'string','nachname':'string','vorname':'string','geburt':'string','tod':'string','umfang':'string',})


def _strip_text_spalte(spalte):
    """Strip surrounding whitespace from text columns; return other columns unchanged."""
    ist_text = spalte.dtype == "object" or spalte.dtype == "string"
    return spalte.str.strip() if ist_text else spalte


df = df.apply(_strip_text_spalte)

# GND-Suche

In [21]:

# SRU endpoint used for the authority-record search
base_url = 'https://services.dnb.de/sru/authorities?'

# Query-string parameters shared by every request; the per-row 'query' is added later
params = dict(
    recordSchema='MARC21-xml',
    operation='searchRetrieve',
    version='1.1',
    maximumRecords='100',
)

In [22]:

def _tag_text(tag):
    """Return the stripped text of a parsed XML tag, or '' if the tag is missing or has no single string."""
    if tag is None or tag.string is None:
        return ''
    return str(tag.string).strip()


def _zeitangabe(record, art):
    """Return subfield 'a' of the field whose subfield '4' equals ``art`` ('datx' or 'datl'), else ''."""
    marker = record.find('subfield', {'code': '4'}, string=art)
    if marker is None or marker.parent is None:
        return ''
    return _tag_text(marker.parent.find('subfield', {'code': 'a'}))


def _jahr(datum):
    """Return the last four characters of a date value (the year part), or None if the value is missing."""
    if datum is None or pd.isna(datum):
        return None
    text = str(datum).strip()
    return text[-4:] if text else None


def _jahre_stimmen(zeitraum, geburtsjahr, todesjahr):
    """Return True if ``zeitraum`` ('<birth>-<death>') matches both the birth and the death year."""
    if not geburtsjahr or not todesjahr:
        return False
    teile = [teil.strip() for teil in zeitraum.split('-')]
    # A period without '-' (or an empty one) has no death part and can never match.
    return len(teile) >= 2 and teile[0] == geburtsjahr and teile[1] == todesjahr


def gnd_abfrage(row, timeout=30):
    """Search the authority file for persons named like ``row`` and compare their life dates.

    Parameters
    ----------
    row : object with the attributes ``nachname``, ``vorname``, ``geburt`` and ``tod``
        Typically one DataFrame row passed in by ``df.apply(..., axis=1)``.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list of dict or str
        One dict per returned authority record with the keys ``'idn'``
        (control field 001, or None if absent) and ``'datum_match'`` (bool).
        ``datum_match`` is True when the exact date range (datx) equals
        ``"<geburt>-<tod>"`` or when the year range in field 100 $d or in the
        datl time field equals the birth and death years of the row.
        If anything goes wrong (network, HTTP status, parsing) the string
        ``"fehler <repr of exception>"`` is returned instead of a list.
    """
    try:
        # Build a per-call copy so the shared module-level params are not mutated.
        anfrage = dict(params, query=f'PER = "{row.nachname}, {row.vorname}" AND BBG=Tp*')
        response = requests.get(base_url, params=anfrage, timeout=timeout)
        # Report HTTP errors as failures instead of silently parsing an error page as "no hits".
        response.raise_for_status()
        response_xml = soup(response.content, features="xml")

        # The year is the last four characters of the date string.
        geburtsjahr = _jahr(row.geburt)
        todesjahr = _jahr(row.tod)
        if pd.notna(row.geburt) and pd.notna(row.tod):
            exakt = f"{row.geburt}-{row.tod}"
        else:
            exakt = None

        matches = []
        for record in response_xml.find_all('record', {'type': 'Authority'}):
            # Record identifier (control field 001)
            idn = _tag_text(record.find('controlfield', {'tag': '001'})) or None

            # Exact date range (datx)
            datx = _zeitangabe(record, 'datx')

            # Year range in field 100, subfield d
            feld_100 = record.find('datafield', {'tag': '100'})
            if feld_100 is not None:
                datl_100 = _tag_text(feld_100.find('subfield', {'code': 'd'}))
            else:
                datl_100 = ''

            # Year range in the datl time field
            datl_548 = _zeitangabe(record, 'datl')

            datum_match = (
                (exakt is not None and datx == exakt)
                or _jahre_stimmen(datl_100, geburtsjahr, todesjahr)
                or _jahre_stimmen(datl_548, geburtsjahr, todesjahr)
            )
            matches.append({'idn': idn, 'datum_match': datum_match})

        return matches
    except Exception as e:
        return "fehler %r" % e

In [23]:
def gnd_auswertung(matches):
    """Return the IDNs of all records whose life dates matched.

    Parameters
    ----------
    matches : list of dict or str
        Result of ``gnd_abfrage``: a list of dicts with the keys ``'idn'`` and
        ``'datum_match'``, or an error string if the lookup failed.

    Returns
    -------
    list
        The ``'idn'`` values of all entries with a truthy ``'datum_match'``.
        Entries without an IDN are skipped. An empty list is returned when
        ``matches`` is not a list (failed lookup), so callers can always
        treat the result as a list.
    """
    if not isinstance(matches, list):
        return []
    return [
        match['idn']
        for match in matches
        if match['datum_match'] and match['idn'] is not None
    ]

In [24]:
# Run the authority lookup for every row (gnd_abfrage issues one HTTP request per row)
# and store the per-row results in the new column 'matches'
df['matches'] = df.apply(gnd_abfrage, axis=1)

In [25]:
# Evaluate the lookup results and store the IDNs of the date-matching records
# in the new column 'idns'
df['idns'] = df.matches.apply(gnd_auswertung)

In [26]:
def _idn_liste(idns):
    """Normalize an ``idns`` cell to a list of non-empty IDN strings.

    Accepts a real list/tuple as well as the string form of a list
    (e.g. "['118540238']" after a round trip through a text file).
    Anything else (None, NA, ...) yields an empty list.
    """
    if isinstance(idns, str):
        roh = idns.strip().strip('[]').split(',')
        kandidaten = (teil.strip().strip('\'"') for teil in roh)
    elif isinstance(idns, (list, tuple)):
        kandidaten = (str(idn).strip() for idn in idns if idn is not None)
    else:
        return []
    return [idn for idn in kandidaten if idn]


def pica_schreiben(row, now=None):
    """Render one row as a PICA record in text form.

    Parameters
    ----------
    row : object with the attributes ``signatur``, ``nachname``, ``vorname``,
        ``tod``, ``umfang`` and ``idns``
        ``signatur`` must contain a comma; the part after the first comma is
        used as the item number. ``idns`` may be a list of IDNs, the string
        form of such a list, or missing.
    now : datetime, optional
        Timestamp used for the 7001 date. Defaults to the current time, so
        the function no longer depends on a global ``now`` being defined.

    Returns
    -------
    str
        The record text, terminated by a line containing a single tab.

    Raises
    ------
    IndexError
        If ``row.signatur`` contains no comma.
    """
    if now is None:
        now = datetime.now()

    # Part of the shelf mark after the first comma; used in 6710, 7100 and 7109.
    signatur_nr = row.signatur.split(",")[1]

    pica = """0500 Qd
0501 Text$btxt
0502 ohne Hilfsmittel zu benutzen$bn
0503 Blatt$bnb
0600 yy
"""
    # The year is the last four characters of the death date.
    if pd.notna(row.tod):
        pica += f"1100 {row.tod[-4:]}$n[nicht nach {row.tod[-4:]}]\n"
    else:
        pica += "1100 \n"

    pica += f"""1130 TB-papier
1130 !040445224!
1131 !041466098!
1132 a1-analog;f1-text;f2-blatt;v-cont
1500 /1ger
1700 /1XA-DE
3100 !000073369!$BZusammenstellender$4com
4000 Ehrenamtsakte {row.vorname} {row.nachname}
4019 Ehrenamtsakte$Bobja
4060 {row.umfang}
4105 !1061153886!
4700 |BSM|
5100 !000073369!
"""
    # Link the person only when the lookup produced exactly one match;
    # several matches are ambiguous and are left for manual review.
    idns = _idn_liste(getattr(row, 'idns', None))
    if len(idns) == 1:
        pica += f"5101 !{idns[0]}!\n"

    pica += f"""5590 [Objektgattung]
5590 !040009254!
7001 {now.strftime('%d-%m-%y')} : x
4800 !009033645!
6710 !1265505004!$l{signatur_nr}
7100 HA/BV 93,{signatur_nr} @ m
7109 !!DBSM/S!! ; HA/BV 93,{signatur_nr}
\t\n"""
    return pica


In [55]:
# Preview: print the rendered records of the first five rows
now = datetime.now()
vorschau = df[:5]
for _, zeile in vorschau.iterrows():
    datensatz = pica_schreiben(zeile)
    print(datensatz)

AttributeError: 'str' object has no attribute 'str'

In [27]:
now = datetime.now()

# Render every record exactly once, before any file is opened: a failure while
# rendering then leaves the previous "recent" export untouched instead of truncated.
datensaetze = [pica_schreiben(row) for index, row in df.iterrows()]

# NOTE(review): the file names spell "ehrenamstkartei" (sic); kept unchanged because
# other tooling may rely on these exact paths.
# NOTE(review): no explicit encoding is passed to open(), so the platform default is
# used; confirm which encoding the importing system expects.
zeitstempel = now.strftime('%y-%m-%d-%H-%M-%S')
with open(f"../dat/ehrenamstkartei-{zeitstempel}.dat", "w") as f, \
        open("../dat/ehrenamstkartei-recent.dat", 'w') as f2:
    # Timestamped archive copy and the "recent" copy receive identical content.
    f.writelines(datensaetze)
    f2.writelines(datensaetze)

# Random sample of up to five records; capped so small frames do not raise ValueError.
with open("../dat/ehrenamstkartei-recent-sample.dat", 'w') as f:
    for index, row in df.sample(min(5, len(df))).iterrows():
        f.write(pica_schreiben(row))