In [1]:
import pandas as pd
import requests
import unicodedata
from bs4 import BeautifulSoup as soup

In [2]:
# Directory the input CSV is read from and the enriched CSV is written to.
raw_path = "../raw"

In [3]:
# Load the cleaned card index; the bare `df` on the last line renders the frame.
df = pd.read_csv(f'{raw_path}/ehrenamtskartei-clean.csv')
df

Unnamed: 0,nachname,vorname,lebensdaten,umfang,geburt,tod
0,Adrian,Werner,21.04.1915-18.08.2001,,21.04.1915,18.08.2001
1,Anrich,Gerold,24.11.1942-04.10.2013,,24.11.1942,04.10.2013
2,Arndt,Martin,02.10.1917-02.03.2005,,02.10.1917,02.03.2005
3,Assmus,Ursula,05.06.1921-26.02.2017,,05.06.1921,26.02.2017
4,Bader,Thomas,1942-25.03.2014,,1942,25.03.2014
...,...,...,...,...,...,...
499,Zahrnt,Heinz,31.05.1915-01.11.2003,,31.05.1915,01.11.2003
500,Zill,Wolfgang,10.12.1928-16.06.1986,,10.12.1928,16.06.1986
501,Zipp,Marianne,14.03.1920-01.10.1983,,14.03.1920,01.10.1983
502,Zirk,Heinz,21.01.1947-09.08.2012,,21.01.1947,09.08.2012


# GND-Suche

In [4]:

# SRU endpoint of the DNB authorities service that every search is sent to.
base_url = 'https://services.dnb.de/sru/authorities?'

# Request parameters that are identical for every search; the per-person
# 'query' entry is supplied at request time.
params = dict(
    recordSchema='MARC21-xml',
    operation='searchRetrieve',
    version='1.1',
    maximumRecords='100',
)

In [18]:

def _subfield_text(node, code):
    """Return the stripped text of the first ``subfield`` with ``code`` below ``node``.

    Returns '' when ``node`` is None, when no such subfield exists, or when the
    subfield has no single text child.
    """
    if node is None:
        return ''
    hit = node.find('subfield', {'code': code})
    text = hit.string if hit is not None else None
    return text.strip() if text else ''


def _typed_date(record, kind):
    """Return subfield ``a`` of the field whose subfield ``4`` equals ``kind`` ('' if absent)."""
    marker = record.find('subfield', {'code': '4'}, string=kind)
    return _subfield_text(marker.parent if marker is not None else None, 'a')


def _year(value):
    """Return the last four characters of a date string; '' for non-strings such as NaN."""
    return value.strip()[-4:] if isinstance(value, str) else ''


def _year_span(text):
    """Split 'YYYY-YYYY', 'YYYY-' or 'YYYY' into a ``(birth, death)`` tuple of strings."""
    birth, _, death = text.partition('-')
    return birth.strip(), death.strip()


def gnd_abfrage(row):
    """Search the GND for the person in ``row`` and flag candidates whose dates agree.

    Parameters
    ----------
    row : object with the attributes ``nachname``, ``vorname``, ``lebensdaten``,
        ``geburt`` and ``tod`` (for example one DataFrame row passed by
        ``df.apply(..., axis=1)``).

    Returns
    -------
    list of dict
        One ``{'idn': str or None, 'datum_match': bool}`` entry per authority
        record in the response. ``datum_match`` is True when

        * the record's exact date ('datx') equals ``row.lebensdaten``, or
        * the year range in field 100 subfield d, or in the 'datl' field,
          equals the (birth year, death year) taken from ``row.geburt`` and
          ``row.tod``.
    str
        ``"fehler <repr of exception>"`` if evaluating the response fails
        unexpectedly (kept for backward compatibility).

    Notes
    -----
    The shared module-level ``params`` dict is copied, not modified.
    NOTE(review): no parser is passed to BeautifulSoup, so it uses whichever
    parser is installed; consider pinning one explicitly.
    """
    # Build the request parameters on a copy so no stale 'query' leaks into `params`.
    query = {**params, 'query': f'PER = "{row.nachname}, {row.vorname}" AND BBG=Tp*'}
    response = requests.get(base_url, params=query)
    response_xml = soup(response.content)
    try:
        # Compare whole years ('1915'), not a single character of the date string.
        row_span = (_year(row.geburt), _year(row.tod))
        matches = []
        for record in response_xml.find_all('record', {'type': 'Authority'}):
            # Determine the record identifier (control field 001).
            control = record.find('controlfield', {'tag': '001'})
            idn = control.string.strip() if control is not None and control.string else None

            # Exact life dates, when the record carries a 'datx' entry.
            datx = _typed_date(record, 'datx')
            # Year ranges: field 100 subfield d, and the 'datl' entry.
            year_sources = (
                _subfield_text(record.find('datafield', {'tag': '100'}), 'd'),
                _typed_date(record, 'datl'),
            )

            exact_hit = bool(datx) and datx == row.lebensdaten
            # A year match needs a known birth year on our side; otherwise two
            # records with missing data would "agree" on nothing.
            year_hit = bool(row_span[0]) and any(
                _year_span(text) == row_span for text in year_sources
            )
            matches.append({'idn': idn, 'datum_match': bool(exact_hit or year_hit)})

        return matches
    except Exception as e:
        return "fehler %r" % e

In [19]:
# Run the GND lookup once per row (each call issues an HTTP request) and keep
# the returned candidate list in a new 'matches' column.
df['matches'] = df.apply(gnd_abfrage, axis=1)

In [20]:
df

Unnamed: 0,nachname,vorname,lebensdaten,umfang,geburt,tod,matches
0,Adrian,Werner,21.04.1915-18.08.2001,,21.04.1915,18.08.2001,"[{'idn': '1116572036', 'datum_match': False}, ..."
1,Anrich,Gerold,24.11.1942-04.10.2013,,24.11.1942,04.10.2013,"[{'idn': '120243695', 'datum_match': False}]"
2,Arndt,Martin,02.10.1917-02.03.2005,,02.10.1917,02.03.2005,"[{'idn': '1033079022', 'datum_match': False}, ..."
3,Assmus,Ursula,05.06.1921-26.02.2017,,05.06.1921,26.02.2017,"[{'idn': '116371943', 'datum_match': True}]"
4,Bader,Thomas,1942-25.03.2014,,1942,25.03.2014,"[{'idn': '1020322829', 'datum_match': False}, ..."
...,...,...,...,...,...,...,...
499,Zahrnt,Heinz,31.05.1915-01.11.2003,,31.05.1915,01.11.2003,"[{'idn': '11863609X', 'datum_match': True}]"
500,Zill,Wolfgang,10.12.1928-16.06.1986,,10.12.1928,16.06.1986,"[{'idn': '1062130359', 'datum_match': False}]"
501,Zipp,Marianne,14.03.1920-01.10.1983,,14.03.1920,01.10.1983,[]
502,Zirk,Heinz,21.01.1947-09.08.2012,,21.01.1947,09.08.2012,[]


In [24]:
def gnd_auswertung(matches):
    """Collect the ``idn`` of every candidate whose ``datum_match`` flag is set.

    Parameters
    ----------
    matches : list of dict
        Candidates with the keys ``'idn'`` and ``'datum_match'``.

    Returns
    -------
    list
        The ``idn`` values of the flagged candidates, in input order. Any
        input that is not a list or tuple of dicts (for instance the
        ``"fehler ..."`` string stored for a failed lookup, or NaN) yields an
        empty list instead of raising.
    """
    if not isinstance(matches, (list, tuple)):
        return []
    return [
        match.get('idn')
        for match in matches
        if isinstance(match, dict) and match.get('datum_match')
    ]

# Reduce each row's candidate list to the identifiers whose dates matched.
df['idns'] = df['matches'].apply(gnd_auswertung)

In [25]:
df

Unnamed: 0,nachname,vorname,lebensdaten,umfang,geburt,tod,matches,idns
0,Adrian,Werner,21.04.1915-18.08.2001,,21.04.1915,18.08.2001,"[{'idn': '1116572036', 'datum_match': False}, ...",[133012964]
1,Anrich,Gerold,24.11.1942-04.10.2013,,24.11.1942,04.10.2013,"[{'idn': '120243695', 'datum_match': False}]",[]
2,Arndt,Martin,02.10.1917-02.03.2005,,02.10.1917,02.03.2005,"[{'idn': '1033079022', 'datum_match': False}, ...",[]
3,Assmus,Ursula,05.06.1921-26.02.2017,,05.06.1921,26.02.2017,"[{'idn': '116371943', 'datum_match': True}]",[116371943]
4,Bader,Thomas,1942-25.03.2014,,1942,25.03.2014,"[{'idn': '1020322829', 'datum_match': False}, ...",[]
...,...,...,...,...,...,...,...,...
499,Zahrnt,Heinz,31.05.1915-01.11.2003,,31.05.1915,01.11.2003,"[{'idn': '11863609X', 'datum_match': True}]",[11863609X]
500,Zill,Wolfgang,10.12.1928-16.06.1986,,10.12.1928,16.06.1986,"[{'idn': '1062130359', 'datum_match': False}]",[]
501,Zipp,Marianne,14.03.1920-01.10.1983,,14.03.1920,01.10.1983,[],[]
502,Zirk,Heinz,21.01.1947-09.08.2012,,21.01.1947,09.08.2012,[],[]


In [8]:
# Spot check for one known name: show the value stored next to the 'datl'
# marker of the first authority record in the response.
# The shared `params` dict is copied so no stale 'query' is left behind in it.
probe_params = {**params, 'query': 'PER = "Adrian, Werner" AND BBG=Tp*'}
response = requests.get(base_url, params=probe_params)
response_xml = soup(response.content)
first_record = response_xml.find('record', {'type': 'Authority'})
# The marker searched for is the element <subfield code="4">datl</subfield>;
# the value sits in subfield 'a' of the same parent field.
datl_marker = first_record.find('subfield', {'code': '4'}, string="datl")
datl_marker.parent.find('subfield', {'code': 'a'}).string.strip()


'1983-'

In [26]:
# Write the enriched table to the raw-data directory; the row index is left out.
df.to_csv(f'{raw_path}/ehrenamtskartei-gnd.csv', index=False)