# Start

In [178]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [179]:
import pandas as pd
import plotly.express as px
import regex as re
import time
import random

from tqdm.notebook import tqdm

In [180]:
test_links = [
    'https://d-nb.info/gnd/118505602', # Bachmann
    'https://d-nb.info/gnd/118516906', # Büchner
    'https://d-nb.info/gnd/118519859', # Celan
    'https://d-nb.info/gnd/118523392', # Dahn
    'https://d-nb.info/gnd/118527908', # Dürrenmatt
    'https://d-nb.info/gnd/118536109', # Frisch
    'https://d-nb.info/gnd/118585916', # Musil
]

In [181]:
gnd_authors_to_standard_authors = {
    "Aesopus": "Äsop",
    'Apuleius, Madaurensis': 'Apuleius',
    'Ava, Frau' : 'Frau Ava',
    'Dietmar, von Aist': 'Dietmar von Aist',
    'Eckhart, Meister': 'Meister Eckhart',
    'Elisabeth, Nassau-Saarbrücken, Gräfin': 'Elisabeth von Lothringen',
    'Erasmus, Desiderius': 'Erasmus von Rotterdam',
    'Feuerbach, Paul Johann Anselm, Ritter von' : 'Feuerbach, Anselm von',
    'Fouqué, Caroline de La Motte-' : 'Fouqué, Caroline de La Motte',
    'Fouqué, Friedrich de La Motte-' : 'Fouqué, Friedrich de la Motte',
    'Gottfried, von Straßburg' : 'Gottfried von Straßburg',
    'Hahn-Hahn, Ida, Gräfin': 'Ida Hahn-Hahn',
    'Hartmann, von Aue' : 'Hartmann von Aue',
    'Heinrich, der Gleißner' : 'Heinrich der Gleißner',
    'Heinrich, von Meißen' : 'Frauenlob',
    'Heinrich, von Morungen' : 'Heinrich von Morungen',
    'Heinrich, von Veldeke' : 'Heinrich von Veldeke',
    "Homerus" : "Homer",
    'Knigge, Adolph, Freiherr': 'Knigge, Adolph',
    'Konrad, der Pfaffe': 'Pfaffe Konrad',
    'Konrad, von Würzburg': 'Konrad von Würzburg',
    'Lamprecht, der Pfaffe' : 'Pfaffe Lamprecht',
    "Marlitt, E." : "Marlitt, Eugenie",
    "Mechthild, von Magdeburg": "Mechthild von Magdeburg",
    "Otfrid, von Weißenburg": "Otfrid von Weißenburg",
    'Platen, August, Graf von' : 'Platen, August von',
    "Reinbot, von Durne": "Reinbot von Durne",
    "Reinmar, der Alte": "Reinmar der Alte",
    "Seneca, Lucius Annaeus, Philosophus": "Seneca",
    "Schlegel, Dorothea von": "Schlegel, Dorothea",
    "Sophocles": "Sophokles",
    'Thüring, von Ringoltingen' : 'Thüring von Ringoltingen',
    'Ulrich, von Lichtenstein' : 'Ulrich von Liechtenstein',
    "Vergilius Maro, Publius": "Vergil",
    'Walther, von der Vogelweide' : 'Walther von der Vogelweide',
    "Werner, der Gärtner": "Werner der Gärtner",
    "Wolfram, von Eschenbach": "Wolfram von Eschenbach",
}

In [182]:
data_authors = pd.DataFrame()
data_authors.index.name = 'GND'

# Leselisten

### create leselisten_authors from raw

In [183]:
mode = 'gewichtet'

In [184]:
leselisten_titles = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/raw/leselisten_raw.csv", sep = ";")

In [185]:
# Filter
leselisten_titles = leselisten_titles.query("GND.notna()")
leselisten_titles = leselisten_titles[~leselisten_titles['Autor'].str.startswith('$', na = False)]

In [186]:
leselisten_titles.head()

Unnamed: 0,Autor,GND,Geschlecht,Titel,Jahreszahl,Jahreszahl_Statistik,Gattung,Modul_Zeit_vor_17.Jh.,Modul_Sprache_(international),Modul_KJL,...,Salzburg,Stuttgart,Stuttgart 2022,Trier,Tübingen,Wien,Wuppertal,Würzburg,Würzburg_2019,Zürich
28,"Hage, Volker (Hg.)",https://d-nb.info/gnd/115807454,,Lyrik für Leser. Deutsche Gedichte der siebzig...,1981,1981.0,Lyrik,,,,...,,,,,,,,,,
52,"Marsch, Edgar (Hg.)",https://d-nb.info/gnd/13345052X,,Moderne deutsche Naturlyrik,1980,1980.0,Lyrik,,,,...,,,,,,,,,,
107,(Pfaffe Konrad),https://d-nb.info/gnd/118565060,m,Rolandslied,12./13. Jh.,1172.0,,MA,,,...,1.0,,,,,,,,,
108,"Abonji, Melinda Nadj",https://d-nb.info/gnd/129396532,w,Tauben fliegen auf,2010,2010.0,Prosa,,,,...,,,,,,,,,,
109,"Achleitner, Friedrich",https://d-nb.info/gnd/119369125,m,Quadratroman,1973,1973.0,Prosa,,,,...,,,,,,,,,,


In [187]:
leselisten_titles['Autor'] = [x.rstrip() if pd.notna(x) else x for x in leselisten_titles['Autor']]
leselisten_titles['Jahreszahl_Statistik'] = leselisten_titles['Jahreszahl_Statistik'].astype(float)

In [188]:
leselisten_titles = leselisten_titles.rename(columns = {
    'Modul_Zeit_vor_17.Jh.' : 'Modul_Zeit',
    'Modul_Sprache_(international)' : 'Modul_Sprache',
    'Modul: KJL' : 'Modul_KJL',
})

In [189]:
leselisten_dict = {
    'Aachen' : 'Aachen1',
    'Berlin' : 'FU Berlin',
    'Innsbruck' : 'Innsbruck1',
    'Innsbruck 2023' : 'Innsbruck2',
    'Köln Fundamentum' : 'Köln',
    'LA Aachen' : 'Aachen2',
    'Stuttgart' : 'Stuttgart1',
    'Stuttgart 2022' : 'Stuttgart2',
    'Würzburg' : 'Würzburg1',
    'Würzburg_2019' : 'Würzburg2',
}

leselisten_titles = leselisten_titles.rename(columns=leselisten_dict)

In [190]:
exceptions = [
    'Autor', 'GND', 'Geschlecht', 'Titel', 'Jahreszahl', 'Jahreszahl_Statistik', 'Gattung',
    'Modul_Zeit', 'Modul_Sprache', 'Modul_KJL', 'Dekade', 'Jahrhundert', 'sum'
]
leselisten = [x for x in leselisten_titles.columns.tolist() if x not in exceptions]
leselisten_prefix = ['Leselisten_'+x for x in leselisten]

In [191]:
def convert_to_float(frac_str):
    if type(frac_str) == str:
        frac_str = re.sub(",", ".", frac_str)
    try:
        return float(frac_str)
    except ValueError:
        frac_str = re.sub("⁄", "/", frac_str)
        num, denom = frac_str.split('/')
        try:
            leading, num = num.split(' ')
            whole = float(leading)
        except ValueError:
            whole = 0
        frac = float(num) / float(denom)
        return whole - frac if whole < 0 else whole + frac

leselisten_titles[leselisten] = leselisten_titles[leselisten].applymap(convert_to_float)
leselisten_titles[leselisten] = leselisten_titles[leselisten].astype(float)

In [192]:
# alles über 0 auf 1 setzen, falls ungewichtet
if mode == 'ungewichtet':
    leselisten_titles[leselisten] = leselisten_titles[leselisten].notnull().astype('int')

# NaN mit 0 ersetzen
leselisten_titles[leselisten] = leselisten_titles[leselisten].fillna(0)

In [193]:
# mit mehreren Autor:innen pro Text umgehen
for multi_author_index in leselisten_titles.query("GND.str.contains('\+')").index:
  multi_author_series = leselisten_titles.loc[multi_author_index].copy()
  leselisten_titles = leselisten_titles.drop(multi_author_index)

  multi_author_authors = [x.strip() for x in multi_author_series['Autor'].split("/")]
  multi_author_GNDs = [x.strip() for x in multi_author_series['GND'].split("+")]

  for single_author, single_GND in zip(multi_author_authors, multi_author_GNDs):
    single_author_series = multi_author_series.copy()
    single_author_series['Autor'] = single_author
    single_author_series['GND'] = single_GND
    # single_author_series[leselisten] = single_author_series[leselisten]/len(multi_author_authors)
    leselisten_titles = pd.concat([leselisten_titles, pd.DataFrame(single_author_series).T])

In [194]:
leselisten_authors = pd.DataFrame()
leselisten_authors.index = leselisten_titles.drop_duplicates(subset='GND')['GND']
leselisten_authors['Leselisten_Autor'] = leselisten_titles.drop_duplicates(subset='GND')['Autor'].tolist()
leselisten_authors[leselisten_prefix] = leselisten_titles.groupby('GND')[leselisten].sum()

In [195]:
# Jeden Wert durch Gesamtzahl aller Titel in jeweiliger Leseiste teilen
# leselisten_authors.loc[:, leselisten_prefix] = leselisten_authors.loc[:, leselisten_prefix].apply(lambda x: x / x.sum())

In [196]:
leselisten_authors['Leselisten_Summe'] = leselisten_authors[leselisten_prefix].sum(axis=1)

In [197]:
leselisten_authors = leselisten_authors.sort_values(by = 'Leselisten_Autor')

In [198]:
this_test_links = [x for x in test_links if x in leselisten_authors.index]
leselisten_authors.loc[this_test_links][[
    'Leselisten_Autor',
    'Leselisten_Göttingen', 'Leselisten_Würzburg1',
    'Leselisten_Summe'
]]

Unnamed: 0_level_0,Leselisten_Autor,Leselisten_Göttingen,Leselisten_Würzburg1,Leselisten_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",1.0,2.0,78.389635
https://d-nb.info/gnd/118516906,"Büchner, Georg",2.0,3.0,94.503444
https://d-nb.info/gnd/118519859,"Celan, Paul",0.0,3.0,59.005877
https://d-nb.info/gnd/118523392,"Dahn, Felix",0.0,0.0,1.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",1.0,1.0,51.812558
https://d-nb.info/gnd/118536109,"Frisch, Max",1.0,2.0,63.029624
https://d-nb.info/gnd/118585916,"Musil, Robert",1.0,4.0,61.488971


In [199]:
leselisten_authors[leselisten_prefix].sum().head()

Leselisten_Aachen2      24.773429
Leselisten_Aachen1       61.42512
Leselisten_Augsburg         109.0
Leselisten_FU Berlin        260.0
Leselisten_Bochum            98.0
dtype: object

In [200]:
leselisten_authors.shape

(911, 44)

In [201]:
leselisten_authors.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/01_leselisten.csv")

### Import and Merge

In [202]:
leselisten_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/01_leselisten.csv", index_col = [0])

In [203]:
cols_to_join = leselisten_authors.columns.difference(data_authors.columns)
data_authors = data_authors.join(leselisten_authors[cols_to_join], how = 'outer')

In [204]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'Leselisten_Autor',
    'Leselisten_Göttingen', 'Leselisten_Würzburg1',
    'Leselisten_Summe',
]]

Unnamed: 0_level_0,Leselisten_Autor,Leselisten_Göttingen,Leselisten_Würzburg1,Leselisten_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",1.0,2.0,78.389635
https://d-nb.info/gnd/118516906,"Büchner, Georg",2.0,3.0,94.503444
https://d-nb.info/gnd/118519859,"Celan, Paul",0.0,3.0,59.005877
https://d-nb.info/gnd/118523392,"Dahn, Felix",0.0,0.0,1.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",1.0,1.0,51.812558
https://d-nb.info/gnd/118536109,"Frisch, Max",1.0,2.0,63.029624
https://d-nb.info/gnd/118585916,"Musil, Robert",1.0,4.0,61.488971


# Schule

### create schule_authors from raw

In [None]:
schule_titles = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/raw/schule_raw.csv", sep=";")

In [None]:
schule_titles = schule_titles.query("GND.notna()")

In [None]:
schule_titles.head()

Unnamed: 0,ID,Bundesland,Nachname_original,Nachname,GND,Geschlecht,Titel,Erscheinungsjahr,Erscheinungsjahr_k,Lit.preis_Name,Lit.preis_Autor_in_Name,Jug.literatur
0,2508,Baden-Württemberg,Äsop,Äsop,https://d-nb.info/gnd/118647180,männlich,Fabeln,unbekannt,unbekannt,,,nein
1,2447,Sachsen-Anhalt,Äsop,Äsop,https://d-nb.info/gnd/118647180,männlich,Fabeln,unbekannt,unbekannt,,,nein
2,1778,Baden-Württemberg,Achebe,Achebe,https://d-nb.info/gnd/118646680,männlich,Okonkwo oder Das Alte stürzt,1959,1958,,,nein
3,297,Hessen,Ade,Ade,https://d-nb.info/gnd/138053669,weiblich,Toni Erdmann (Film),2016,2017,"(mehrere Filmpreise, siehe Wikipedia)",,ja
4,3,Hamburg,Ade,Ade,https://d-nb.info/gnd/138053669,weiblich,Toni Erdmann (Film),2016,2016,"(mehrere Filmpreise, siehe Wikipedia)",,ja


In [None]:
bundeslaender = schule_titles['Bundesland'].unique()
bundeslaender_prefix = ['Schule_'+x for x in bundeslaender]

In [None]:
# mit mehreren Autor:innen pro Text umgehen
for multi_author_index in schule_titles.query("GND.str.contains('\+')").index:
  multi_author_series = schule_titles.loc[multi_author_index].copy()
  schule_titles = schule_titles.drop(multi_author_index)

  multi_author_authors = [x.strip() for x in multi_author_series['Nachname'].split("/")]
  multi_author_GNDs = [x.strip() for x in multi_author_series['GND'].split("+")]

  for single_author, single_GND in zip(multi_author_authors, multi_author_GNDs):
    single_author_series = multi_author_series.copy()
    single_author_series['Nachname'] = single_author
    single_author_series['GND'] = single_GND
    schule_titles = pd.concat([schule_titles, pd.DataFrame(single_author_series).T])

In [None]:
schule_authors = pd.DataFrame()

for gnd_link in tqdm(schule_titles['GND'].unique()):
  meta_author = schule_titles.query("GND == @gnd_link")
  schule_authors.loc[gnd_link, 'Schule_Autor'] = meta_author['Nachname'].tolist()[0]
  for bundesland in bundeslaender:
    meta_author_bundesland = meta_author.query("Bundesland == @bundesland")
    schule_authors.loc[gnd_link, 'Schule_'+bundesland] = meta_author_bundesland.shape[0]

schule_authors = schule_authors.fillna(0)
schule_authors.index.name = 'GND'

  0%|          | 0/674 [00:00<?, ?it/s]

In [None]:
# Jeden Wert durch Gesamtzahl aller Titel in jeweiligem Bundesland teilen
# schule_authors.loc[:, bundeslaender_prefix] = schule_authors.loc[:, bundeslaender_prefix].apply(lambda x: x / x.sum())

In [None]:
schule_authors['Schule_Summe'] = schule_authors[bundeslaender_prefix].sum(axis=1)

In [None]:
schule_authors = schule_authors.sort_values(by = 'Schule_Autor')

In [None]:
schule_authors.query("Schule_Autor == 'Fried'")

Unnamed: 0_level_0,Schule_Autor,Schule_Baden-Württemberg,Schule_Sachsen-Anhalt,Schule_Hessen,Schule_Hamburg,Schule_Niedersachsen,Schule_Rheinland-Pfalz,Schule_Bayern,Schule_Sachsen,Schule_Saarland,Schule_Mecklenburg-Vorpommern,Schule_Bremen,Schule_Brandenburg,Schule_Nordrhein-Westfalen,Schule_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
https://d-nb.info/gnd/115688595,Fried,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
https://d-nb.info/gnd/118703145,Fried,1.0,2.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,8.0


In [None]:
this_test_links = [x for x in test_links if x in schule_authors.index]
schule_authors.loc[this_test_links][[
    'Schule_Autor',
    'Schule_Niedersachsen', 'Schule_Bayern',
    'Schule_Summe'
]]

Unnamed: 0_level_0,Schule_Autor,Schule_Niedersachsen,Schule_Bayern,Schule_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://d-nb.info/gnd/118505602,Bachmann,2.0,1.0,18.0
https://d-nb.info/gnd/118516906,Büchner,4.0,1.0,28.0
https://d-nb.info/gnd/118519859,Celan,1.0,1.0,5.0
https://d-nb.info/gnd/118527908,Dürrenmatt,5.0,6.0,33.0
https://d-nb.info/gnd/118536109,Frisch,6.0,3.0,28.0
https://d-nb.info/gnd/118585916,Musil,1.0,1.0,9.0


In [None]:
schule_authors.shape

(674, 15)

In [None]:
schule_authors.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/02_schule.csv")

### Import and Merge

In [205]:
schule_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/02_schule.csv", index_col = [0])

In [206]:
cols_to_join = schule_authors.columns.difference(data_authors.columns)
data_authors = data_authors.join(schule_authors[cols_to_join], how = 'outer')

In [207]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'Schule_Autor',
    'Schule_Niedersachsen', 'Schule_Bayern',
    'Schule_Summe',
]]

Unnamed: 0_level_0,Schule_Autor,Schule_Niedersachsen,Schule_Bayern,Schule_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://d-nb.info/gnd/118505602,Bachmann,2.0,1.0,18.0
https://d-nb.info/gnd/118516906,Büchner,4.0,1.0,28.0
https://d-nb.info/gnd/118519859,Celan,1.0,1.0,5.0
https://d-nb.info/gnd/118523392,,,,
https://d-nb.info/gnd/118527908,Dürrenmatt,5.0,6.0,33.0
https://d-nb.info/gnd/118536109,Frisch,6.0,3.0,28.0
https://d-nb.info/gnd/118585916,Musil,1.0,1.0,9.0


In [208]:
data_authors.shape

(1236, 59)

# Killy

### Scrape

In [13]:
!pip3 install google_colab_selenium

Collecting google_colab_selenium
  Downloading google_colab_selenium-1.0.13-py3-none-any.whl (8.1 kB)
Collecting selenium (from google_colab_selenium)
  Downloading selenium-4.21.0-py3-none-any.whl (9.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium->google_colab_selenium)
  Downloading trio-0.25.1-py3-none-any.whl (467 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m467.7/467.7 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium->google_colab_selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium->google_colab_selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium->google_colab_selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from

In [14]:
import google_colab_selenium as gs
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

In [15]:
driver = gs.Chrome()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
# log in via SUB Göttingen
driver.get("https://dbis.ur.de/dbinfo/warpto.php?bib_id=subgo&color=4&titel_id=11553&url=http%3A%2F%2FHAN.SUB.UNI-GOETTINGEN.DE%2Fhan%2Fverfasser-datenbank%2F")

username = driver.find_element(By.ID, "plainuser")
username.send_keys("000721344262")

password = driver.find_element(By.ID, "password")
password.send_keys("karen1na")

password.send_keys(Keys.RETURN)

In [17]:
def get_gnd_link_from_vdbo_page (driver):
  gnd_link = float('NaN')

  if driver.find_elements(By.LINK_TEXT, 'GND'):
    gnd_link = driver.find_element(By.LINK_TEXT, 'GND').get_attribute('href')
    if 'http://' in gnd_link:
      gnd_link = re.sub('http://', 'https://', gnd_link)

  return gnd_link

In [18]:
def get_article_content_from_vdbo_page (driver):
  article_content = driver.find_element(By.ID, "text-container").text
  return article_content

In [19]:
def get_name_from_vdbo_page (driver, mode = 'simple'):
    author = float('NaN')

    if mode == 'simple':
      author = driver.find_element(By.TAG_NAME, "h1").text

    if mode == 'full':
      article_content = get_article_content_from_vdbo_page(driver)
      searchresults = re.findall("\\nNamen\\n(.*?)\\n", article_content)
      if len(searchresults) > 0:
          author = searchresults[0]

    return author

In [20]:
def get_source_from_vdbo_page (driver):
    source = float('NaN')

    article_content = get_article_content_from_vdbo_page(driver)

    if "Verfasserlexikon – Die deutsche Literatur des Mittelalters" in article_content:
      source = "Verfasserlexikon MA"
    elif "Verfasserlexikon – Deutscher Humanismus 1480-1520" in article_content:
        source = "Verfasserlexikon Humanismus 1480–1520"
    elif "Verfasserlexikon – Frühe Neuzeit in Deutschland 1520-1620" in article_content:
      source = "Verfasserlexikon FNZ 1520–1620"
    elif "Verfasserlexikon – Frühe Neuzeit in Deutschland 1620-1720" in article_content:
        source = "Verfasserlexikon FNZ 1520–1620"
    elif "Killy Literaturlexikon – Autoren und Werke" in article_content:
      source = "Killy"

    bandnummer = float('NaN')
    bandnummer_a = re.findall("\nBand\n\\d+", article_content)
    bandnummer_b = re.findall("\nBand \\d+", article_content)
    if len(bandnummer_a) > 0:
      bandnummer = bandnummer_a[0].split("\n")[-1]
    if len(bandnummer_b) > 0:
      bandnummer = bandnummer_b[0].split(" ")[-1]

    if pd.notna(source) and pd.notna(bandnummer):
      source = source + ", Bd. " + bandnummer

    return source

In [29]:
killy = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/06_killy.csv", index_col = [0])
# killy = pd.DataFrame()

In [42]:
for i in tqdm(range(1, 7650)): # max: 7645
    killy_id = "{:0>{}}".format(i, 4)
    if 'Killy_id' in killy.columns.tolist() and 'killy_'+killy_id in killy['Killy_id'].tolist():
        continue

    driver.get(f"https://www-1degruyter-1com-1gzs214jk03c1.han.sub.uni-goettingen.de/database/VDBO/entry/vdbo.killy.{killy_id}/html")

    body_text = driver.find_element(By.TAG_NAME, 'body').text
    if "Your access to the De Gruyter site has been temporarily blocked" in body_text:
        print("Access blocked")
        continue

    if "nicht finden, die Sie aufgerufen haben" in body_text or "could not find the page" in body_text:
        print(f"Seite nicht gefunden (Killy-ID: {killy_id})")
        continue

    time.sleep(random.randint(3, 5))

    gnd_link = get_gnd_link_from_vdbo_page(driver)
    if pd.isna(gnd_link):
        gnd_link = 'GND_Placeholder_Killy_' + killy_id
    article_content = get_article_content_from_vdbo_page(driver)
    article_len = 0 if article_content == '' else len(' '.join(article_content.split("\n")).split(" "))
    author_simple = get_name_from_vdbo_page(driver, mode = 'simple')
    author_full = get_name_from_vdbo_page(driver, mode = 'full')
    source = get_source_from_vdbo_page(driver)

    # killy.at[gnd_link, 'Killy_id'] = 'killy_' + killy_id
    # killy.at[gnd_link, 'Killy_simple_Autor'] = author_simple
    # killy.at[gnd_link, 'Killy_full_Autor'] = author_full
    # killy.at[gnd_link, 'Killy_article'] = article_content
    # killy.at[gnd_link, 'Killy_length_Summe'] = article_len
    # killy.at[gnd_link, 'VDBO_source'] = source

    killy_add = pd.DataFrame({
      'Killy_id' : 'killy_' + killy_id,
      'Killy_simple_Autor' : author_simple,
      'Killy_full_Autor' : author_full,
      'Killy_article' : article_content,
      'Killy_length_Summe' : article_len,
      'VDBO_source' : source},
      index = [gnd_link])
    killy = pd.concat([killy, killy_add])

    print(f"{gnd_link:<34} {author_simple} ({article_len} Wörter Killy) ({source})")

  0%|          | 0/7649 [00:00<?, ?it/s]

https://d-nb.info/gnd/116720964    Hensel, Sophie Friederike (613 Wörter Killy) (Killy, Bd. 5)
Seite nicht gefunden (Killy-ID: 7645)
Seite nicht gefunden (Killy-ID: 7646)
Seite nicht gefunden (Killy-ID: 7647)
Seite nicht gefunden (Killy-ID: 7648)
Seite nicht gefunden (Killy-ID: 7649)


In [141]:
killy_gnd_links_correction = {
    # killy/wrong                         # new/correct
    'https://d-nb.info/gnd/1237852404' : 'https://d-nb.info/gnd/118502255', # Altenberg, Peter
    'https://d-nb.info/gnd/1243925175' : 'https://d-nb.info/gnd/118502786', # Andersch, Alfred (2x gleich?)
    'https://d-nb.info/gnd/1153581523' : 'https://d-nb.info/gnd/118507397', # Bauer, Wolfgang
    'https://d-nb.info/gnd/1099922879' : 'https://d-nb.info/gnd/118509861', # Bernhard, Thomas
    'https://d-nb.info/gnd/1174822937' : 'https://d-nb.info/gnd/11548048X', # Beyer, Marcel
    'https://d-nb.info/gnd/1208457071' : 'https://d-nb.info/gnd/118510649', # Bichsel, Peter
    'https://d-nb.info/gnd/1095799150' : 'https://d-nb.info/gnd/118513435', # Borchardt, Rudolf
    'https://d-nb.info/gnd/133726185'  : 'https://d-nb.info/gnd/118514512', # Brasch, Thomas
    'https://d-nb.info/gnd/1138454273' : 'https://d-nb.info/gnd/118514644', # Braun, Volker
    'https://d-nb.info/gnd/138268665'  : 'https://d-nb.info/gnd/11851587X', # Brückner, Christine
    'https://d-nb.info/gnd/133328562'  : 'https://d-nb.info/gnd/118516906', # Büchner, Georg
    'https://d-nb.info/gnd/7512202-9'  : 'https://d-nb.info/gnd/118516477', # Buber, Martin
    'https://d-nb.info/gnd/1072431262' : 'https://d-nb.info/gnd/118520512', # Chodowiecki, Daniel
    'https://d-nb.info/gnd/1096198584' : 'https://d-nb.info/gnd/118523392', # Dahn, Felix
    'https://d-nb.info/gnd/12155645X'  : 'https://d-nb.info/gnd/121550656', # Damm, Sigrid
    'https://d-nb.info/gnd/1027187870' : 'https://d-nb.info/gnd/119286289', # Elsner, Gisela
    'https://d-nb.info/gnd/174023464'  : 'https://d-nb.info/gnd/118530259', # Ende, Michael
    'https://d-nb.info/gnd/1037518098' : 'https://d-nb.info/gnd/118534793', # Frank, Leonhard
    'https://d-nb.info/gnd/121309037'  : 'https://d-nb.info/gnd/118535455', # Freytag, Gustav
    'https://d-nb.info/gnd/1013841387' : 'https://d-nb.info/gnd/119523604', # Funke, Cornelia
    'https://d-nb.info/gnd/1067158162' : 'https://d-nb.info/gnd/115612815', # Geiger, Arno
    'https://d-nb.info/gnd/1173437932' : 'https://d-nb.info/gnd/118538659', # Gerhardt, Paul
    'https://d-nb.info/gnd/1073358569' : 'https://d-nb.info/gnd/118539604', # Glaeser, Ernst
    'https://d-nb.info/gnd/134391314'  : 'https://d-nb.info/gnd/118542265', # Grimm, Wilhelm
    'https://d-nb.info/gnd/1159367094' : 'https://d-nb.info/gnd/118179527', # Haas, Wolf
    'https://d-nb.info/gnd/124939031'  : 'https://d-nb.info/gnd/118544330', # Hacks, Peter
    'https://d-nb.info/gnd/134840070'  : 'https://d-nb.info/gnd/118701606', # Harig, Ludwig
    'https://d-nb.info/gnd/1214200532' : 'https://d-nb.info/gnd/119549867', # Haslinger, Josef
    'https://d-nb.info/gnd/188380329'  : 'https://d-nb.info/gnd/118840991', # Hein, Christoph
    'https://d-nb.info/gnd/1176824139' : 'https://d-nb.info/gnd/118881280', # Hessel, Franz
    'https://d-nb.info/gnd/7505054-7'  : 'https://d-nb.info/gnd/118706462', # Holl, Elias
    'https://d-nb.info/gnd/1153638819' : 'https://d-nb.info/gnd/119535467', # Hoppe, Felicitas
    'https://d-nb.info/gnd/1205166688' : 'https://d-nb.info/gnd/118557211', # Jean Paul (2x gleich?)
    'https://d-nb.info/gnd/124392585X' : 'https://d-nb.info/gnd/118776592', # Kaleko, Mascha (2x gleich?)
    'https://d-nb.info/gnd/115583920'  : 'https://d-nb.info/gnd/11856109X', # Keller, Gottfried
    'https://d-nb.info/gnd/1243920904' : 'https://d-nb.info/gnd/118561359', # Kempowski, Walter
    'https://d-nb.info/gnd/1230830219' : 'https://d-nb.info/gnd/118562487', # Kirsch, Sarah
    'https://d-nb.info/gnd/7513003-8'  : 'https://d-nb.info/gnd/118562827', # Klee, Paul
    'https://d-nb.info/gnd/1243890681' : 'https://d-nb.info/gnd/118563491', # Kluge, Alexander
    'https://d-nb.info/gnd/1230518142' : 'https://d-nb.info/gnd/118958836', # Königsdorf, Helga
    'https://d-nb.info/gnd/141148470'  : 'https://d-nb.info/gnd/118715917', # Kretzer, Max
    'https://d-nb.info/gnd/1130190293' : 'https://d-nb.info/gnd/11871595X', # Kreuder, Ernst
    'https://d-nb.info/gnd/1146891407' : 'https://d-nb.info/gnd/11899011X', # Kühn, Dieter
    'https://d-nb.info/gnd/1067155724' : 'https://d-nb.info/gnd/118568051', # Kunert, Günter
    'https://d-nb.info/gnd/1243928654' : 'https://d-nb.info/gnd/118568124', # Kunze, Reiner (2x gleich?)
    'https://d-nb.info/gnd/1243922230' : 'https://d-nb.info/gnd/118570285', # Lavant, Christine (2x gleich?)
    'https://d-nb.info/gnd/1140251791' : 'https://d-nb.info/gnd/118571095', # Lehmann, Wilhelm
    'https://d-nb.info/gnd/1024949737' : 'https://d-nb.info/gnd/119560526', # Lehr, Thomas
    'https://d-nb.info/gnd/1047511703' : 'https://d-nb.info/gnd/118832891', # Lichtenstein, Alfred
    'https://d-nb.info/gnd/17408739X'  : 'https://d-nb.info/gnd/122418980', # Maier, Andreas
    'https://d-nb.info/gnd/110222992X' : 'https://d-nb.info/gnd/118578251', # Marti, Kurt
    'https://d-nb.info/gnd/1244171158' : 'https://d-nb.info/gnd/118577158', # Mann, Klaus
    'https://d-nb.info/gnd/1078020566' : 'https://d-nb.info/gnd/118818651', # May, Karl
    'https://d-nb.info/gnd/1219085529' : 'https://d-nb.info/gnd/120020513', # Meinecke, Thomas
    'https://d-nb.info/gnd/14296378X'  : 'https://d-nb.info/gnd/118031198', # Merz, Klaus
    'https://d-nb.info/gnd/1200484592' : 'https://d-nb.info/gnd/11858314X', # Möser, Justus
    'https://d-nb.info/gnd/1025857100' : 'https://d-nb.info/gnd/118585193', # Müller, Robert
    'https://d-nb.info/gnd/117698005X' : 'https://d-nb.info/gnd/118585916', # Musil, Robert
    'https://d-nb.info/gnd/1073434419' : 'https://d-nb.info/gnd/118587331', # Neumann, Robert
    'https://d-nb.info/gnd/130331511'  : 'https://d-nb.info/gnd/118587668', # Nicolai, Friedrich
    'https://d-nb.info/gnd/1043508473' : 'https://d-nb.info/gnd/118590111', # Opitz, Martin
    'https://d-nb.info/gnd/1243920939' : 'https://d-nb.info/gnd/119330490', # Recheis, Käthe (2x gleich?)
    'https://d-nb.info/gnd/1184512086' : 'https://d-nb.info/gnd/118744690', # Reuter, Christian
    'https://d-nb.info/gnd/1050321308' : 'https://d-nb.info/gnd/118599976', # Reuter, Fritz
    'https://d-nb.info/gnd/1144967775' : 'https://d-nb.info/gnd/122400259', # Richter, Julia
    'https://d-nb.info/gnd/1147495351' : 'https://d-nb.info/gnd/118602667', # Rosegger, Peter
    'https://d-nb.info/gnd/7512201-7'  : 'https://d-nb.info/gnd/118602802', # Rosenzweig, Franz
    'https://d-nb.info/gnd/1080243968' : 'https://d-nb.info/gnd/119202824', # Roth, Friederike
    'https://d-nb.info/gnd/116638249'  : 'https://d-nb.info/gnd/118603140', # Roth, Joseph
    'https://d-nb.info/gnd/1139921320' : 'https://d-nb.info/gnd/118603817', # Rückert, Friedrich
    'https://d-nb.info/gnd/173627587'  : 'https://d-nb.info/gnd/118604597', # Sachs, Hans
    'https://d-nb.info/gnd/7502612-0'  : 'https://d-nb.info/gnd/118607782', # Schinkel, Karl Friedrich
    'https://d-nb.info/gnd/1187369802' : 'https://d-nb.info/gnd/11875968X', # Schmidt, Julian
    'https://d-nb.info/gnd/1055490981' : 'https://d-nb.info/gnd/11860922X', # Schmitt, Carl
    'https://d-nb.info/gnd/173845614'  : 'https://d-nb.info/gnd/119279487', # Schneider, Robert
    'https://d-nb.info/gnd/1022708112' : 'https://d-nb.info/gnd/119013517', # Schütz, Stefan
    'https://d-nb.info/gnd/7508478-8'  : 'https://d-nb.info/gnd/118613154', # Semper, Gottfried
    'https://d-nb.info/gnd/124389170X' : 'https://d-nb.info/gnd/118614444', # Simmel, Mario (2x gleich?)
    'https://d-nb.info/gnd/120990415'  : 'https://d-nb.info/gnd/119228408', # Steiner, Jörg
    'https://d-nb.info/gnd/1055487859' : 'https://d-nb.info/gnd/10127226X', # Heinrich von Sax
    'https://d-nb.info/gnd/1124708286' : 'https://d-nb.info/gnd/118758675', # Schneider, Peter
    'https://d-nb.info/gnd/1146806442' : 'https://d-nb.info/gnd/118625063', # Uhland, Ludwig
    'https://d-nb.info/gnd/17014142X'  : 'https://d-nb.info/gnd/118768395', # Viertel, Berthold
    'https://d-nb.info/gnd/124428800'  : 'https://d-nb.info/gnd/118594117', # Wagner, Richard
    'https://d-nb.info/gnd/131977474'  : 'https://d-nb.info/gnd/118628852', # Walser, Martin
    'https://d-nb.info/gnd/135327369'  : 'https://d-nb.info/gnd/118629867', # Wedekind, Frank .
    'https://d-nb.info/gnd/1235001695' : 'https://d-nb.info/gnd/118630369', # Weise, Christian
    'https://d-nb.info/gnd/1102928240' : 'https://d-nb.info/gnd/118630539', # Weiss, Peter
    'https://d-nb.info/gnd/1020534532' : 'https://d-nb.info/gnd/120295245', # Werner, Markus
    'https://d-nb.info/gnd/1235988406' : 'https://d-nb.info/gnd/118632477', # Wieland, Christoph Martin (2x gleich?)
    'https://d-nb.info/gnd/117404144'  : 'https://d-nb.info/gnd/118881140', # Winkler, Josef
    'https://d-nb.info/gnd/128640812'  : 'https://d-nb.info/gnd/118634666', # Wolf, Christa
    'https://d-nb.info/gnd/189566132'  : 'https://d-nb.info/gnd/11811946X', # Zimmer, Dieter
    'https://d-nb.info/gnd/132022958'  : 'https://d-nb.info/gnd/118773186', # Zorn, Fritz
}
killy = killy.rename(index=killy_gnd_links_correction)

In [142]:
killy = killy.sort_values(by='Killy_id')
killy.index.name = 'GND'

In [143]:
killy.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/06_killy.csv")

### Import and Merge

In [209]:
killy = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/06_killy.csv", index_col = [0])

In [210]:
killy_join = killy.copy()

In [211]:
# Seyler, Friederike Sophie + Hensel, Sophie Friederike = gleiche Person
seyler_hensel = killy_join.loc['https://d-nb.info/gnd/116720964']
if seyler_hensel.shape[0] == 2:
  seyler_hensel_unified = seyler_hensel.iloc[0].copy()
  seyler_hensel_unified['Killy_length_Summe'] = seyler_hensel['Killy_length_Summe'].sum()
  seyler_hensel_unified['Killy_simple_Autor'] = ' + '.join(seyler_hensel['Killy_simple_Autor'])
  seyler_hensel_unified['Killy_full_Autor'] = ' + '.join(seyler_hensel['Killy_full_Autor'])

  killy_join = killy_join.drop('https://d-nb.info/gnd/116720964')
  killy_join = pd.concat([killy_join, pd.DataFrame(seyler_hensel_unified).T])

In [212]:
killy_join = killy_join.loc[[x for x in killy_join.index if pd.notna(x) and 'd-nb' in x]].copy()
killy_join = killy_join.drop("Killy_article", axis = 'columns')

In [213]:
cols_to_join = killy_join.columns.difference(data_authors.columns)
data_authors = data_authors.join(killy_join[cols_to_join], how = 'outer')

In [214]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'Killy_simple_Autor',
    'Killy_length_Summe',
]]

Unnamed: 0,Killy_simple_Autor,Killy_length_Summe
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",2472.0
https://d-nb.info/gnd/118516906,"Büchner, Georg",3260.0
https://d-nb.info/gnd/118519859,"Celan, Paul",2998.0
https://d-nb.info/gnd/118523392,"Dahn, Felix",953.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",4299.0
https://d-nb.info/gnd/118536109,"Frisch, Max",3088.0
https://d-nb.info/gnd/118585916,"Musil, Robert",3571.0


# GND

### Scrape

In [284]:
import requests
from bs4 import BeautifulSoup
import re

In [285]:
def get_response_from_gnd (gnd_url):
    return requests.get(gnd_url)

In [286]:
# def get_name_from_gnd_response (response):
#     preferred_name_start = response.text.find('gndo:preferredNameEntityForThePerson')
#     preferred_name_slice = response.text[preferred_name_start:preferred_name_start+300]
#
#     forename = ''
#     surname = ''
#
#     forename_start = preferred_name_slice.find("gndo:forename")
#     if forename_start != -1:
#       forename_slice = preferred_name_slice[(forename_start + 15):(forename_start + 100)]
#       forename = re.findall(""".+?(?=\")""", forename_slice)[0]
#
#     nameaddition_start = preferred_name_slice.find("gndo:nameAddition")
#     if nameaddition_start != -1:
#       nameaddition_slice = preferred_name_slice[(nameaddition_start + 19):(nameaddition_start + 100)]
#       nameaddition = re.findall(""".+?(?=\")""", nameaddition_slice)[0]
#       forename = forename + ' ' + nameaddition
#
#     prefix_start = preferred_name_slice.find("gndo:prefix")
#     if prefix_start != -1:
#       prefix_slice = preferred_name_slice[(prefix_start + 13):(prefix_start + 100)]
#       prefix = re.findall(""".+?(?=\")""", prefix_slice)[0]
#       forename = forename + ' ' + prefix
#
#     surname_start = preferred_name_slice.find("gndo:surname")
#     if surname_start != -1:
#       surname_slice = preferred_name_slice[(surname_start + 14):(surname_start + 100)]
#       surname = re.findall(""".+?(?=\")""", surname_slice)[0]
#
#     personalname_start = preferred_name_slice.find("gndo:personalName")
#     if personalname_start != -1:
#       personalname_slice = preferred_name_slice[(personalname_start + 19):(personalname_start + 100)]
#       personalname = re.findall(""".+?(?=\")""", personalname_slice)[0]
#       surname = personalname
#
#     return [forename, surname]

def get_name_from_gnd_response (response):
  name = float('nan')

  soup = BeautifulSoup(response.text, "html.parser")
  td_elements = soup.findAll('td')

  for i, td_element in enumerate(td_elements):
      if td_element.strong != None and td_element.strong.string == 'Person':
          name = td_elements[i+1].string.strip()

  return name

In [287]:
def get_gender_from_gnd_response (response):
    gender = float('nan')

    gender_pos_start = response.text.find('vocab/gnd/gender#')
    gender_content = response.text[gender_pos_start+17:gender_pos_start+25]

    if 'female' in gender_content:
      gender = 'female'
    elif 'male' in gender_content:
      gender = 'male'
    elif 'notKnown' in gender_content:
      gender = 'notKnown'
    else:
      gender = gender_content

    return gender

# def get_gender_from_gnd_response (response):
#   gender = float('nan')
#
#   soup = BeautifulSoup(response.text, "html.parser")
#   td_elements = soup.findAll('td')
#
#   for i, td_element in enumerate(td_elements):
#       if td_element.strong != None and td_element.strong.string == 'Geschlecht':
#           gender = td_elements[i+1].string.strip()
#
#   return gender

In [288]:
def get_lifetime_from_gnd_response (response):
    lifetime = [float('nan'), float('nan')]

    birth_pos = response.text.find('gndo:dateOfBirth ')
    if birth_pos != -1:
        birth_content = response.text[birth_pos+18:birth_pos+22]
        if birth_content.isnumeric():
            lifetime[0] = int(birth_content)

    death_pos = response.text.find('gndo:dateOfDeath ')
    if death_pos != -1:
        death_content = response.text[death_pos+18:death_pos+22]
        if death_content.isnumeric():
            lifetime[1] = int(death_content)

    return lifetime

# def get_lifetime_from_gnd_response (response):
#   lifetime = float('NaN')
#
#   soup = BeautifulSoup(response.text, "html.parser")
#   td_elements = soup.findAll('td')
#
#   for i, td_element in enumerate(td_elements):
#       if td_element.strong != None and td_element.strong.string == 'Zeit':
#           lifetime = td_elements[i+1].string.strip()
#           lifetime = [int(x) for x in re.findall(r'\d+', lifetime)]
#           if len(lifetime) >= 2:
#             birth = lifetime[0]
#             for year in lifetime[1:]:
#               if year > lifetime[0]:
#                   death = year
#                   break
#             lifetime = [birth, death]
#
#   return lifetime

In [289]:
def get_countries_from_gnd_response (response):
    countries = float('nan')

    soup = BeautifulSoup(response.text, "html.parser")
    td_elements = soup.findAll('td')

    for i, td_element in enumerate(td_elements):
        if td_element.strong != None and td_element.strong.string == 'Land':
            countries = td_elements[i+1].string.strip()
            countries = countries.split('; ')

    return countries

In [290]:
def get_occupations_from_gnd_response (response):
    occupations = float('nan')

    soup = BeautifulSoup(response.text, "html.parser")
    td_elements = soup.findAll('td')

    for i, td_element in enumerate(td_elements):
        if td_element.strong != None and td_element.strong.string == 'Beruf(e)':
            occupation_links = td_elements[i+1].findAll('a')
            occupations = [x.string for x in occupation_links]

            if len(occupations) == 0: # gibt Rubrik Beruf(e), aber dort keine Links, sondern plain text
                occupations = [td_elements[i+1].string.strip()]

    return occupations

In [291]:
# data_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/data_authors.csv", index_col = [0])

gnd = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/00_gnd.csv", index_col = [0])
# gnd = pd.DataFrame()

links_to_scrape = [x for x in data_authors.index if x not in gnd.index and 'd-nb' in x]
# links_to_scrape = data_authors.sample(n=50).index
# links_to_scrape = test_links

In [292]:
for i, gnd_link in enumerate(tqdm(links_to_scrape)):
  if i%20 == 0:
    time.sleep(random.randint(8, 12))

  response_a = get_response_from_gnd(gnd_link)
  response_b = get_response_from_gnd(gnd_link + '/about/lds')

  # try:
  #    forename, surname = get_name_from_gnd_response(response_b)
  #    if surname == '':
  #      print(f"{gnd_link} Fehler (Name)")
  #      author_name = float('NaN')
  #    elif forename != '':
  #      author_name = surname + ', ' + forename
  #    else:
  #      author_name = surname
  #    print(f"{gnd_link} {author_name}")
  #    gnd.at[gnd_link, 'GND_Autor'] = author_name
  #  except:
  #    print(f"{gnd_link} (Fehler: Name)")

  try:
    gnd.at[gnd_link, 'GND_Autor'] = get_name_from_gnd_response(response_a)
    print(f"{gnd_link} {gnd.at[gnd_link, 'GND_Autor']}")
  except:
    print(f"{gnd_link} (Fehler: Autor)")

  try:
    gnd.at[gnd_link, 'GND_Gender'] = get_gender_from_gnd_response(response_b)
  except:
    print(f"{gnd_link} (Fehler: Gender)")

  try:
    gnd.at[gnd_link, 'GND_Geburtsjahr'] = get_lifetime_from_gnd_response(response_b)[0]
    gnd.at[gnd_link, 'GND_Sterbejahr'] = get_lifetime_from_gnd_response(response_b)[1]
  except:
    print(f"{gnd_link} (Fehler: Geburtsjahr/Sterbejahr)")

  try:
    gnd.at[gnd_link, 'GND_Laender'] = ' + '.join(sorted(get_countries_from_gnd_response(response_a)))
  except:
    print(f"{gnd_link} (Fehler: Länder)")

  try:
    gnd.at[gnd_link, 'GND_Berufe'] = ' + '.join(sorted(get_occupations_from_gnd_response(response_a)))
  except:
    print(f"{gnd_link} (Fehler: Berufe)")

  0%|          | 0/1 [00:00<?, ?it/s]

https://d-nb.info/gnd/118520512 Chodowiecki, Daniel


In [293]:
gnd.index.name = 'GND'
gnd = gnd.sort_values(by = 'GND_Autor')

gnd['GND_Autor'] = [re.sub('ä', 'ä', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('á', 'á', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('Č', 'Č', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('ć', 'ć', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('é', 'é', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('ë', 'ë', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('Ō', 'Ō', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('Ö', 'Ö', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('ö', 'ö', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('ó', 'ó', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('ō', 'ō', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('š', 'š', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('ü', 'ü', x) if pd.notna(x) else x for x in gnd['GND_Autor']]
gnd['GND_Autor'] = [re.sub('ž', 'z', x) if pd.notna(x) else x for x in gnd['GND_Autor']]

In [294]:
laender = [str(x).split(" + ") for x in gnd['GND_Laender']]
laender = [item for sublist in laender for item in sublist]
print(pd.Series(laender).value_counts())

laender_deutsch = ['Deutschland', 'Österreich', 'Schweiz']
GND_deutsch = []
for x in gnd['GND_Laender']:
  if pd.isna(x):
    GND_deutsch.append(x)
  elif any(land in str(x) for land in laender_deutsch):
    GND_deutsch.append(True)
  else:
    GND_deutsch.append(False)
gnd['GND_deutsch'] = GND_deutsch

Deutschland (XA-DE)      6261
Österreich (XA-AT)       1045
Schweiz (XA-CH)           728
Frankreich (XA-FR)        316
USA (XD-US)               296
                         ... 
Rwanda (XC-RW)              1
Paraguay (XD-PY)            1
Thailand (XB-TH)            1
Saudi-Arabien (XB-SA)       1
Jamaika (XD-JM)             1
Name: count, Length: 101, dtype: int64


In [295]:
berufe = [str(x).split(" + ") for x in gnd['GND_Berufe']]
berufe = [item for sublist in berufe for item in sublist]
print(pd.Series(berufe).value_counts())

berufe_schriftsteller = ['Schriftsteller', 'Lyriker', 'Dramatiker', 'Librettist']
GND_Schriftsteller = []
for x in gnd['GND_Berufe']:
  if pd.isna(x):
    GND_Schriftsteller.append(x)
  elif any(beruf in str(x) for beruf in berufe_schriftsteller):
    GND_Schriftsteller.append(True)
  else:
    GND_Schriftsteller.append(False)
gnd['GND_Schriftsteller'] = GND_Schriftsteller

Schriftsteller                  5321
Schriftstellerin                 958
Dramatiker                       918
Übersetzer                       640
Lyriker                          634
                                ... 
Betriebswirtin                     1
Kürschner                          1
Polsterer                          1
Kinder- und Jugendpsychiater       1
Theaterkritikerin                  1
Name: count, Length: 997, dtype: int64


In [296]:
gnd.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/00_gnd.csv")

### Import and Merge

In [215]:
gnd = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/00_gnd.csv", index_col = [0])

In [216]:
cols_to_join = gnd.columns.difference(data_authors.columns)
data_authors = data_authors.join(gnd[cols_to_join], how = 'left')

In [217]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'GND_Autor', 'GND_Gender', 'GND_Geburtsjahr', 'GND_Sterbejahr',
    'GND_Laender', 'GND_Berufe', 'GND_deutsch'
]]

Unnamed: 0,GND_Autor,GND_Gender,GND_Geburtsjahr,GND_Sterbejahr,GND_Laender,GND_Berufe,GND_deutsch
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",female,1926.0,1973.0,Italien (XA-IT) + Österreich (XA-AT),Librettistin + Musikerin + Schriftstellerin,True
https://d-nb.info/gnd/118516906,"Büchner, Georg",male,1813.0,1837.0,Deutschland (XA-DE) + Frankreich (XA-FR) + Sch...,Arzt + Dramatiker + Dramatiker + Schriftstelle...,True
https://d-nb.info/gnd/118519859,"Celan, Paul",male,1920.0,1970.0,Frankreich (XA-FR) + Jüdischer Kulturkreis (Re...,Lektor + Lyriker + Schriftsteller + Übersetzer,True
https://d-nb.info/gnd/118523392,"Dahn, Felix",male,1834.0,1912.0,Deutschland (XA-DE),Dramatiker + Erzähler + Historiker + Jurist + ...,True
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",male,1921.0,1990.0,Deutschland (XA-DE) + Schweiz (XA-CH),Dramatiker + Dramaturg + Grafiker + Librettist...,True
https://d-nb.info/gnd/118536109,"Frisch, Max",male,1911.0,1991.0,Deutschland (XA-DE) + Italien (XA-IT) + Schwei...,Architekt + Dramatiker + Drehbuchautor + Journ...,True
https://d-nb.info/gnd/118585916,"Musil, Robert",male,1880.0,1942.0,Deutschland (XA-DE) + Italien (XA-IT) + Schwei...,Herausgeber + Kritiker + Redakteur + Reserveof...,True


In [218]:
# Gibt es Autor:innen mit unterschiedlichen GND-Links, die den gleichen/ähnlichen GND-Namen haben?
pd.set_option('display.width', 1000)
multi_authors = data_authors['GND_Autor'].value_counts()[data_authors['GND_Autor'].value_counts() > 1].index
allowed = [
    'Mayer, Johann Friedrich', 'Müller, Heinrich',
    'Pistorius, Johann', 'Camerius, Joachim', 'Sommer, Johannes',
    'Forster, Georg', 'Füssli, Johann Heinrich', 'Albinus, JOhann Georg',
    'Hildebrandt, Dieter', 'Praetorius, Johannes', 'Clajus, Johannes',
    'Spanheim, Friedrich', 'Hamann, Johann Georg', 'Gwalther, Rudolf',
    'Gwalther, Rudolf', 'Fürer von Haimendorf, Christoph',
    'Camerarius, Joachim', 'Hermann, Wolfgang', 'Ruland, Martin',
    'Frey, Jacob', 'Fabricius, Johann', 'Schneider, Michael',
    'Albinus, Johann Georg', 'Wagner, Richard', 'Beer, Johann Christoph'
]
multi_authors = [x for x in multi_authors if x not in allowed]
for multi_author in multi_authors:
  multi_author_df = data_authors.query("GND_Autor == @multi_author")
  multi_author_df = multi_author_df[['GND_Autor', 'Leselisten_Autor', 'Schule_Autor', 'Killy_simple_Autor']]
  print(multi_author_df)
  print("\n")

In [219]:
von_authors_to_standard_authors_keys = [x for x in data_authors['GND_Autor'] if ', von' in x]
von_authors_to_standard_authors_values = [re.sub(", von", " von", x) for x in von_authors_to_standard_authors_keys]
von_authors_to_standard_authors = dict(zip(von_authors_to_standard_authors_keys, von_authors_to_standard_authors_values))
gnd_authors_to_standard_authors = gnd_authors_to_standard_authors | von_authors_to_standard_authors

# Wikipedia

### Scrape

In [302]:
!pip install pywikibot



In [303]:
# Code (leicht angepasst) nach: https://github.com/temporal-communities/wiki-metrix

# Illmer, V. J., Soethaert, B., Welz, L., Fischer, F., & Jäschke, R. (2024, Februar 21).
# Literatur im Wikiversum – Eine praktische Annäherung über API-Abfragen und Wikipedia-Metriken.
# DHd 2024 Quo Vadis DH (DHd2024), Passau, Deutschland. https://doi.org/10.5281/zenodo.10698426

pywikibot_config = r"""# -*- coding: utf-8  -*-


mylang = 'de'
family = 'wikipedia'
usernames['wikipedia']['de'] = 'test'"""

with open('user-config.py', 'w', encoding="utf-8") as f:
    f.write(pywikibot_config)

import pywikibot
import requests
import datetime
import urllib.parse

def get_page_stats(page: pywikibot.Page):
    """
    Get page stats for a given page.
    """

    # Handle redirects
    # page = handle_redirect(page)

    page_content = page.get(force=True)
    length_in_bytes = len(page_content.encode("utf-8"))
    page_revisions = list(page.revisions(reverse=True))

    data = {
        "title": page.title(),
        "url": page.full_url(),
        "length": length_in_bytes,
        "n_contributors": len(page.contributors()),
        "n_revisions": len(page_revisions),
        "n_extlinks": len(list(page.extlinks())),
        "n_langlinks": len(page.langlinks()),
        "n_links": len(list(page.linkedPages())),
        "n_linkshere": len(
            list(page.linkedPages(namespaces=[0], follow_redirects=False))
        ),  # Article namespace only (0)
        "n_categories": len(list(page.categories())),
        "pageviews_365d": get_pageviews(page, days=365),
        "pageviews_730d": get_pageviews(page, days=730),
        "pageviews_1825d": get_pageviews(page, days=1825),
        "first_revision": page_revisions[0].timestamp,
    }

    MW_API_LIMIT = 500
    # Give warning if any value is at the limit
    for key, value in data.items():
        if value == MW_API_LIMIT:
            print(f"Warning: {key} at limit {MW_API_LIMIT}.")

    return data

# Use Wikimedia Pageviews REST API to get pageviews
def get_pageviews(page: pywikibot.Page, days=365):
    lang = page.site.code
    site = page.site.family.name

    # Wikimedia REST API
    # https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageviews
    # https://wikimedia.org/api/rest_v1/
    end_date = datetime.date(2024, 5, 1) # datetime.date.today() - datetime.timedelta(days=2)  # Two days ago
    start_date = end_date - datetime.timedelta(days=days)  # end_date minus [days] ago

    agent_type = "user"  # user, bot, spider, all-agents
    title_uri = urllib.parse.quote(
        page.title(underscore=True, with_section=False), safe=""
    )  # URI-encoded title, no safe characters
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{lang}.{site}/all-access/{agent_type}/{title_uri}/monthly/{start_date.strftime('%Y%m%d')}/{end_date.strftime('%Y%m%d')}"

    user_agent = f"wiki-metrix (https://github.com/temporal-communities/wiki-metrix) requests/{requests.__version__}"
    response = requests.get(url, headers={"User-Agent": user_agent})

    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code} {response.reason}")

    data = response.json()
    pageviews_sum = sum(filter(None, [item["views"] for item in data["items"]]))

    return pageviews_sum

def add_page_stats_to_df (df, wikipedia_article_column = 'Wikipedia_Artikel'):
    site = pywikibot.Site('de', 'wikipedia')  # The site we want to run our bot on
    wikiresults_joined_df = pd.DataFrame()

    for article in tqdm(df[wikipedia_article_column]):
        page = pywikibot.Page(site, article)
        wikiresults = get_page_stats(page)

        wikiresults_df = pd.DataFrame(pd.Series(wikiresults)).T
        wikiresults_joined_df = pd.concat([wikiresults_joined_df, wikiresults_df])

    wikiresults_joined_df = wikiresults_joined_df.reset_index(drop=True)
    df = df.join(wikiresults_joined_df)

    return df

In [304]:
site = pywikibot.Site('de', 'wikipedia')

In [305]:
gnd_authors_to_wiki_authors = {
    'Alciato, Andreas': 'Andrea Alciato',
    'Bachtin, Michail Michajlovič' : 'Michail Michailowitsch Bachtin',
    'Blankenburg, Christian Friedrich von': 'Christian Friedrich von Blanckenburg',
    'Cao, Xueqin': 'Cao Xueqin',
    'De Cesco, Federica': 'Federica de Cesco',
    'De Man, Paul': 'Paul de Man',
    'Fiedler, Leslie A.': 'Leslie Fiedler',
    'Gao, Xingjian': 'Gao Xingjian',
    'Gliḳl bas Judah Leib': 'Glikl bas Judah Leib',
    "Gogolʹ, Nikolaj Vasilʹevič": 'Nikolai Wassiljewitsch Gogol',
    'Iolande, von Vianden': 'Yolanda von Vianden',
    'Kittler, Friedrich A.': 'Friedrich Kittler',
    'Lao, She': 'Lao She',
    'Maḥfūẓ, Naǧīb': 'Nagib Mahfuz',
    'Nabokov, Vladimir Vladimirovič': 'Vladimir Nabokov',
    "O'Dell, Scott": 'Scott O’Dell',
    "Orléans, Elisabeth Charlotte d'": "Élisabeth Charlotte d’Orléans",
    "Solženicyn, Aleksandr Isaevič": "Alexander Issajewitsch Solschenizyn",
    "Strugackij, Boris": "Arkadi und Boris Strugazki",
    "Šerstjanoj, Valerij": "Valeri Scherstjanoi",
    "Werner, der Gärtner": "Wernher der Gartenaere",
    "Winter, Léon de": "Leon de Winter",
    "ʿOz, Amos": "Amos Oz"
}

In [310]:
# data_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/data_authors.csv", index_col = [0])

wiki = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/04_wiki.csv", index_col = [0])
# wiki = pd.DataFrame()

links_to_scrape = [x for x in data_authors.query("GND_Autor.notna()").sort_values(by='GND_Autor').index if x not in wiki.index]
# links_to_scrape = [data_authors.query("GND_Autor == @x").index[0] for x in list(gnd_authors_to_standard_authors.keys()) + list(gnd_authors_to_wiki_authors.keys())]
# links_to_scrape = [data_authors.query("GND_Autor == @x").index[0] for x in von_authors_to_standard_authors]
# links_to_scrape = test_links

In [311]:
for gnd_link in tqdm(links_to_scrape):
  author = data_authors.loc[gnd_link, 'GND_Autor']

  if pd.isna(author):
    continue
  elif author in gnd_authors_to_wiki_authors.keys():
    author_search = gnd_authors_to_wiki_authors[author]
  elif author in gnd_authors_to_standard_authors.keys():
    author_search = gnd_authors_to_standard_authors[author]
  else:
    author_search = author
  author_search = ' '.join(str(author_search).split(', ')[::-1])

  page = pywikibot.Page(site, author_search)
  if page.isRedirectPage():
    page = page.getRedirectTarget()

  try:
    wiki_author = get_page_stats(page)
    wiki_author['GND_Autor'] = author
    wiki_author['Wiki_Status'] = 'page_found'
    wiki_author = pd.DataFrame(pd.Series(wiki_author)).T
    wiki_author.index = [gnd_link]
  except:
    wiki_author = pd.DataFrame(index = [gnd_link])
    wiki_author['GND_Autor'] = author
    wiki_author['Wiki_Status'] = 'page_not_found'

  if wiki_author['Wiki_Status'].tolist()[0] == 'page_not_found':
    print(f"{gnd_link:<34} {author} (Seite nicht gefunden)")
  else:
    print(f"{gnd_link:<34} {author} ({wiki_author['pageviews_1825d'].tolist()[0]} pageviews)")

  wiki_author.columns = [x if 'GND_' in x or 'Wiki_' in x else 'Wiki_'+x for x in wiki_author.columns]

  wiki = wiki.drop(gnd_link, errors='ignore')
  wiki = pd.concat([wiki, wiki_author])

  0%|          | 0/1 [00:00<?, ?it/s]

https://d-nb.info/gnd/118520512    Chodowiecki, Daniel (56445 pageviews)


In [312]:
wiki = wiki.sort_values(by = 'GND_Autor')
wiki.index.name = 'GND'

exceptions = ['Wiki_title', 'Wiki_url', 'Wiki_first_revision', 'Wiki_Status']
fill_columns = [x for x in wiki if x not in exceptions]
wiki[fill_columns] = wiki[fill_columns].fillna(0)

In [313]:
wiki.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/04_wiki.csv")

### Import and Merge

In [220]:
wiki = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/04_wiki.csv", index_col = [0])

In [221]:
cols_to_join = wiki.columns.difference(data_authors.columns)
data_authors = data_authors.join(wiki[cols_to_join], how = 'left')

In [222]:
data_authors['Wiki_pageviews_Summe'] = data_authors['Wiki_pageviews_1825d']
data_authors['Wiki_length_Summe'] = data_authors['Wiki_length']

In [223]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'Wiki_title',
    'Wiki_pageviews_Summe', 'Wiki_length_Summe'
]]

Unnamed: 0,Wiki_title,Wiki_pageviews_Summe,Wiki_length_Summe
https://d-nb.info/gnd/118505602,Ingeborg Bachmann,990368.0,66902.0
https://d-nb.info/gnd/118516906,Georg Büchner,900637.0,42336.0
https://d-nb.info/gnd/118519859,Paul Celan,553278.0,53030.0
https://d-nb.info/gnd/118523392,Felix Dahn,68339.0,29834.0
https://d-nb.info/gnd/118527908,Friedrich Dürrenmatt,1225920.0,56558.0
https://d-nb.info/gnd/118536109,Max Frisch,902991.0,113706.0
https://d-nb.info/gnd/118585916,Robert Musil,363942.0,67374.0


# BDSL

### Scrape

In [147]:
!pip3 install google_colab_selenium



In [148]:
import google_colab_selenium as gs
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

In [149]:
driver = gs.Chrome()

<IPython.core.display.Javascript object>

In [150]:
# log in via SUB Göttingen
driver.get("http://han.sub.uni-goettingen.de/han/BDSL")

username = driver.find_element(By.ID, "plainuser")
username.send_keys("000721344262")

password = driver.find_element(By.ID, "password")
password.send_keys("karen1na")

password.send_keys(Keys.RETURN)

In [151]:
gnd_authors_to_bdsl_authors = {
    'Adler, H. G.' : 'Adler, Hans Günther',
    'Berg, O.F.' : 'Berg, Ottokar Franz',
    'Kittler, Friedrich A.': 'Friedrich Kittler',
    'Richter, E. A.' : 'Richter, Erich A.',
}

In [152]:
# data_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/data_authors.csv", index_col = [0])

bdsl = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/03_bdsl.csv", index_col = [0])
# bdsl = pd.DataFrame()

links_to_scrape = [x for x in data_authors.query("GND_Autor.notna()").sort_values(by='GND_Autor').index if x not in bdsl.index]
# links_to_scrape = [data_authors.query("GND_Autor == @x").index[0] for x in gnd_authors_to_bdsl_authors.keys()]
# links_to_scrape = test_links

In [153]:
for gnd_link in tqdm(links_to_scrape):
  author = data_authors.loc[gnd_link, 'GND_Autor']

  if pd.isna(author):
    continue
  elif author in gnd_authors_to_bdsl_authors.keys():
    author_search = gnd_authors_to_bdsl_authors[author]
  elif author in gnd_authors_to_standard_authors.keys():
    author_search = gnd_authors_to_standard_authors[author]
  else:
    author_search = author

  time.sleep(random.randint(0, 1))

  # go to search
  driver.find_element(By.LINK_TEXT, "Suche").click()

  # change dropdown to 'Behandelte Person' # oder doch lieber 'Freitext'??
  dropdown_menu = driver.find_element(By.NAME, "DD1")
  select = Select(dropdown_menu)
  select.select_by_value("4")

  # enter author name and search
  input_field = driver.find_element(By.NAME, "SF1")
  input_field.send_keys(author_search)
  input_field.send_keys(Keys.RETURN)

  # get number of results
  page_source = driver.page_source
  hits_str = re.findall("\d* Titel gefunden", page_source)

  if len(hits_str) == 0 or hits_str[0].startswith(' Titel'):
    hits_int = 0
  else:
    hits_int = int(hits_str[0].split(" ")[0])

  # add results to bdsl_DataFrame
  print(f"{gnd_link:<35} {author:<30} ({hits_int} BDSL-Treffer)")
  bdsl.at[gnd_link, 'GND_Autor'] = author
  bdsl.at[gnd_link, 'BDSL_Summe'] = hits_int

  0%|          | 0/696 [00:00<?, ?it/s]

https://d-nb.info/gnd/115478140     Tröster, Johannes              (0 BDSL-Treffer)
https://d-nb.info/gnd/117431133     Tschabuschnigg, Adolf von      (17 BDSL-Treffer)
https://d-nb.info/gnd/117431427     Tscharner, Vincenz Bernhard von (0 BDSL-Treffer)
https://d-nb.info/gnd/1243924519    Tscherning, Andreas            (9 BDSL-Treffer)
https://d-nb.info/gnd/104294728     Tschesch, Johann Theodor von   (0 BDSL-Treffer)
https://d-nb.info/gnd/101147465     Tschink, Cajetan               (0 BDSL-Treffer)
https://d-nb.info/gnd/118624369     Tschirnhaus, Ehrenfried Walther von (0 BDSL-Treffer)
https://d-nb.info/gnd/118624326     Tschudi, Aegidius              (7 BDSL-Treffer)
https://d-nb.info/gnd/117432482     Tschudi, Fridolin              (0 BDSL-Treffer)
https://d-nb.info/gnd/117432504     Tschudi, Friedrich von         (0 BDSL-Treffer)
https://d-nb.info/gnd/115897843X    Tschudi, Johann Heinrich       (0 BDSL-Treffer)
https://d-nb.info/gnd/10434993X     Tucher, Andreas                (0

In [157]:
# Autor:innen, die wegen des Suchnamens zu viele Treffer erzielen, auf 0 setzen
bdsl_reset_authors = ['Albrecht', 'Friedrich, Friedrich', 'Heinrich']

for author in bdsl_reset_authors:
  if author in bdsl['GND_Autor'].tolist():
    bdsl.at[bdsl.query("GND_Autor == @author").index[0], 'BDSL_Summe'] = 0

In [158]:
bdsl['BDSL_Summe'] = bdsl['BDSL_Summe'].fillna(0)
bdsl.index.name = 'GND'

In [159]:
bdsl.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/03_bdsl.csv")

### Import and Merge

In [224]:
bdsl = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/03_bdsl.csv", index_col = [0])

In [225]:
cols_to_join = bdsl.columns.difference(data_authors.columns)
data_authors = data_authors.join(bdsl[cols_to_join], how = 'left')

In [226]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'GND_Autor', 'BDSL_Summe'
]]

Unnamed: 0,GND_Autor,BDSL_Summe
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",2218.0
https://d-nb.info/gnd/118516906,"Büchner, Georg",1775.0
https://d-nb.info/gnd/118519859,"Celan, Paul",2951.0
https://d-nb.info/gnd/118523392,"Dahn, Felix",29.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",1075.0
https://d-nb.info/gnd/118536109,"Frisch, Max",1098.0
https://d-nb.info/gnd/118585916,"Musil, Robert",2622.0


# Reclam

### Scrape

Vgl. https://d-nb.info/010784632

In [305]:
# import requests
# from bs4 import BeautifulSoup
# import re

In [306]:
# def get_response_from_gnd (gnd_url):
#     return requests.get(gnd_url)

In [307]:
# def get_reclam_hits_for_author_name (author):
#     driver.get("https://portal.dnb.de/opac.htm")

#     soup = BeautifulSoup(response.text, "html.parser")
#     target_span = soup.find('span', id='searchResultShortListPageInfo')
#     if target_span:
#       reclam_hits = target_span.get_text()
#       reclam_hits = reclam_hits.split(" von ")
#       reclam_hits = int(reclam_hits[1])

#    return reclam_hits

In [308]:
!pip3 install google_colab_selenium



In [309]:
import google_colab_selenium as gs
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

In [310]:
driver = gs.Chrome()

<IPython.core.display.Javascript object>

In [311]:
driver.get("https://portal.dnb.de/opac.htm")
queryfield = driver.find_element(By.ID, "simpleSearchQueryInputInitial")
queryfield.send_keys("Test")
queryfield.send_keys(Keys.RETURN)

In [312]:
def get_dnb_printview (searchstring):
    driver.get("https://portal.dnb.de/opac.htm")

    expertensuche = driver.find_element(By.ID, "cqlModeCheckbox")
    expertensuche.click()

    queryfield = driver.find_element(By.ID, "simpleSearchQueryInputWithExistingQuery")
    queryfield.clear()
    queryfield.send_keys(searchstring)

    searchbutton = driver.find_element(By.ID, "simpleSearchButton")
    searchbutton.click()

    if driver.find_elements(By.ID, "linkToPrintPreviewShortList"):
      printview_link = driver.find_element(By.ID, "linkToPrintPreviewShortList")
      printview_link.click()
      return driver.page_source
    else:
      return ''

    return driver.page_source

In [313]:
def get_hits_from_dnb_printview (dnb_printview):
    reclam_hits = 0

    searchresult = re.findall(" von insgesamt \\d*", dnb_printview)
    if len(searchresult) > 0:
        searchresult = searchresult[0].split(" ")
        reclam_hits = int(searchresult[-1])

    return reclam_hits

In [314]:
def get_reclam_ubs_from_dnb_printview (dnb_printview):
    reclam_ubs = []

    searchresults_a = re.findall("Reclams Universal-Bibliothek ; Nr. \\d+", dnb_printview)
    searchresults_b = re.findall("Reclams Universal-Bibliothek ; Bd. \\d+", dnb_printview)
    searchresults = searchresults_a + searchresults_b
    if len(searchresults) > 0:
        for searchresult in searchresults:
            searchresult = searchresult.split(" ")
            reclam_ubs.append(int(searchresult[-1]))

    reclam_ubs = sorted(list(set(reclam_ubs)))

    return reclam_ubs

In [315]:
# data_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/data_authors.csv", index_col = [0])

reclam = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/05_reclam.csv", index_col = [0])
# reclam = pd.DataFrame()

links_to_scrape = [x for x in data_authors.query("GND_Autor.notna()").sort_values(by='GND_Autor').index if x not in reclam.index]
# links_to_scrape = data_authors.sample(n=50).index
# links_to_scrape = test_links

In [None]:
for i, gnd_link in enumerate(tqdm(links_to_scrape)):

  author = data_authors.loc[gnd_link, 'GND_Autor']
  if pd.isna(author):
    continue
  else:
    author_search = author

  try:
    reclam.at[gnd_link, 'GND_Autor'] = author

    # searchurl = f"https://portal.dnb.de/opac/simpleSearch?query=partOf%3D010784632+AND+sgt%3DB+AND+atr%3D%22{author_search}%22&cqlMode=true"
    # response = get_response_from_gnd(searchurl)
    # reclam_hits_tokens = get_reclam_hits_from_gnd_response(response)
    # reclam.at[gnd_link, 'Reclam_tokens_Summe'] = reclam_hits_tokens
    # print(f"{gnd_link} {author} ({reclam_hits_tokens} Reclam-Treffer)")

    searchstring = f"partOf=010784632 AND atr={author_search}" # AND sgt=B für nur Belletristik?
    dnb_printview = get_dnb_printview(searchstring)
    reclam_hits_tokens = get_hits_from_dnb_printview(dnb_printview)
    reclam_ubs = get_reclam_ubs_from_dnb_printview(dnb_printview)
    reclam.at[gnd_link, 'Reclam_tokens_Summe'] = reclam_hits_tokens
    reclam.at[gnd_link, 'Reclam_types_Summe'] = len(reclam_ubs)
    reclam.at[gnd_link, 'Reclam_UBs'] = ' + '.join([str(x) for x in reclam_ubs])
    print(f"{gnd_link:<35} {author:<30} ({reclam_hits_tokens} Reclam-Treffer)")

    time.sleep(random.randint(0, 3))

  except:
    print(f"{gnd_link:<35} {author:<30} (Fehler)")

  0%|          | 0/1331 [00:00<?, ?it/s]

https://d-nb.info/gnd/117366714     Schwarz, Georg                 (0 Reclam-Treffer)
https://d-nb.info/gnd/118611941     Schwarz, Hans                  (0 Reclam-Treffer)
https://d-nb.info/gnd/117367893     Schwarz, Sophie                (0 Reclam-Treffer)
https://d-nb.info/gnd/118973940     Schwarz-Gardos, Alice          (0 Reclam-Treffer)
https://d-nb.info/gnd/11893838X     Schwarze, Hans Dieter          (0 Reclam-Treffer)
https://d-nb.info/gnd/11941144X     Schwarzenberg, Friedrich von   (0 Reclam-Treffer)
https://d-nb.info/gnd/118795759     Schwarzenberg, Johann von      (0 Reclam-Treffer)
https://d-nb.info/gnd/174300468     Schwarzkogler, Rudolf          (0 Reclam-Treffer)
https://d-nb.info/gnd/117388335     Schwarzkopf, Nikolaus          (0 Reclam-Treffer)
https://d-nb.info/gnd/1243877707    Schwarzschild, Leopold         (0 Reclam-Treffer)
https://d-nb.info/gnd/108011801     Schwarzwälder Prediger         (0 Reclam-Treffer)
https://d-nb.info/gnd/11954489X     Schwedhelm, Karl  

In [None]:
# Autor:innen, die wegen des Suchnamens zu viele Treffer erzielen, auf 0 setzen
reclam_reset_authors = ['Albrecht', 'Friedrich, Friedrich', 'Heinrich']

for author in reclam_reset_authors:
  if author in reclam['GND_Autor'].tolist():
    reclam.at[reclam.query("GND_Autor == @author").index[0], 'Reclam_tokens_Summe'] = 0
    reclam.at[reclam.query("GND_Autor == @author").index[0], 'Reclam_types_Summe'] = 0
    reclam.at[reclam.query("GND_Autor == @author").index[0], 'Reclam_UBs'] = float('NaN')

In [None]:
reclam = reclam.sort_values(by = 'GND_Autor')
reclam.index.name = 'GND'

In [None]:
reclam.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/05_reclam.csv")

### Import and Merge

In [227]:
reclam = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/05_reclam.csv", index_col = [0])

In [228]:
cols_to_join = reclam.columns.difference(data_authors.columns)
data_authors = data_authors.join(reclam[cols_to_join], how = 'left')

In [229]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'GND_Autor',
    'Reclam_tokens_Summe', 'Reclam_types_Summe', 'Reclam_UBs'
]]

Unnamed: 0,GND_Autor,Reclam_tokens_Summe,Reclam_types_Summe,Reclam_UBs
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",8.0,1.0,8008
https://d-nb.info/gnd/118516906,"Büchner, Georg",65.0,14.0,20 + 6060 + 7733 + 7955 + 8210 + 9347 + 9486 +...
https://d-nb.info/gnd/118519859,"Celan, Paul",0.0,0.0,
https://d-nb.info/gnd/118523392,"Dahn, Felix",0.0,0.0,
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",2.0,1.0,347
https://d-nb.info/gnd/118536109,"Frisch, Max",5.0,2.0,1131 + 8306
https://d-nb.info/gnd/118585916,"Musil, Robert",12.0,8.0,1146 + 8526 + 18789 + 18797 + 18990 + 18991 + ...


# Check and Export

In [230]:
exceptions = [
    '_Autor',
    '_title', '_url', '_first_revision', '_Status',
    '_Berufe', '_Geburtsjahr', '_Sterbejahr', '_Gender', '_Laender', '_deutsch', '_Schriftsteller',
    '_id', '_article', '_source',
    '_UBs',
]
fill_columns = [x for x in data_authors if not any(y in x for y in exceptions)]
data_authors[fill_columns] = data_authors[fill_columns].fillna(0)

data_authors.index.name = 'GND'
data_authors = data_authors.sort_values(by = 'GND_Autor')

In [231]:
data_authors.shape

(7666, 93)

In [232]:
data_authors.loc[test_links][[
    'GND_Autor', 'Leselisten_Autor', 'Schule_Autor',
    'GND_Gender', 'GND_Laender',
    'Leselisten_Summe', 'Schule_Summe', 'BDSL_Summe', 'Wiki_pageviews_Summe', 'Reclam_tokens_Summe', 'Killy_length_Summe'
]]

Unnamed: 0_level_0,GND_Autor,Leselisten_Autor,Schule_Autor,GND_Gender,GND_Laender,Leselisten_Summe,Schule_Summe,BDSL_Summe,Wiki_pageviews_Summe,Reclam_tokens_Summe,Killy_length_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg","Bachmann, Ingeborg",Bachmann,female,Italien (XA-IT) + Österreich (XA-AT),78.389635,18.0,2218.0,990368.0,8.0,2472.0
https://d-nb.info/gnd/118516906,"Büchner, Georg","Büchner, Georg",Büchner,male,Deutschland (XA-DE) + Frankreich (XA-FR) + Sch...,94.503444,28.0,1775.0,900637.0,65.0,3260.0
https://d-nb.info/gnd/118519859,"Celan, Paul","Celan, Paul",Celan,male,Frankreich (XA-FR) + Jüdischer Kulturkreis (Re...,59.005877,5.0,2951.0,553278.0,0.0,2998.0
https://d-nb.info/gnd/118523392,"Dahn, Felix","Dahn, Felix",,male,Deutschland (XA-DE),1.0,0.0,29.0,68339.0,0.0,953.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich","Dürrenmatt, Friedrich",Dürrenmatt,male,Deutschland (XA-DE) + Schweiz (XA-CH),51.812558,33.0,1075.0,1225920.0,2.0,4299.0
https://d-nb.info/gnd/118536109,"Frisch, Max","Frisch, Max",Frisch,male,Deutschland (XA-DE) + Italien (XA-IT) + Schwei...,63.029624,28.0,1098.0,902991.0,5.0,3088.0
https://d-nb.info/gnd/118585916,"Musil, Robert","Musil, Robert",Musil,male,Deutschland (XA-DE) + Italien (XA-IT) + Schwei...,61.488971,9.0,2622.0,363942.0,12.0,3571.0


In [233]:
data_authors.query("Wiki_pageviews_Summe.isna()")[[
    'GND_Autor', 'Wiki_pageviews_Summe', 'Wiki_length_Summe'
]]

Unnamed: 0_level_0,GND_Autor,Wiki_pageviews_Summe,Wiki_length_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [234]:
data_authors['GND_Gender'].value_counts()

GND_Gender
male        5801
notKnown     970
female       895
Name: count, dtype: int64

In [235]:
data_authors.query("GND_Gender == 'notKnown'")['GND_Autor']

GND
https://d-nb.info/gnd/124438482                    Acidalius, Valens
https://d-nb.info/gnd/1110188102                   Ackermann, Werner
https://d-nb.info/gnd/122079566     Acxtelmeier, Stanislaus Reinhard
https://d-nb.info/gnd/12005860X                       Adam, Melchior
https://d-nb.info/gnd/123438675                 Adami, Johann Samuel
                                                  ...               
https://d-nb.info/gnd/1011517035                        Zumpf, Peter
https://d-nb.info/gnd/1055385088                     Zwick, Johannes
https://d-nb.info/gnd/1056159588                      Zwingäuer, Der
https://d-nb.info/gnd/1108787355    Österreichischer Bibelübersetzer
https://d-nb.info/gnd/119879727                       Špán, Vavřinec
Name: GND_Autor, Length: 970, dtype: object

In [236]:
data_authors.query("GND_Gender == '<http://'")['GND_Autor']

Series([], Name: GND_Autor, dtype: object)

In [238]:
data_authors.index.value_counts()

GND
https://d-nb.info/gnd/118500015    1
https://d-nb.info/gnd/116250836    1
https://d-nb.info/gnd/104124172    1
https://d-nb.info/gnd/124472249    1
https://d-nb.info/gnd/116264624    1
                                  ..
https://d-nb.info/gnd/124140289    1
https://d-nb.info/gnd/118927884    1
https://d-nb.info/gnd/118548379    1
https://d-nb.info/gnd/119318679    1
https://d-nb.info/gnd/118855379    1
Name: count, Length: 7666, dtype: int64

In [237]:
data_authors.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/data_authors.csv")