# Start

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import plotly.express as px
import regex as re
import time
import random

from tqdm.notebook import tqdm

In [3]:
test_links = [
    'https://d-nb.info/gnd/118505602', # Bachmann
    'https://d-nb.info/gnd/118516906', # Büchner
    'https://d-nb.info/gnd/118519859', # Celan
    'https://d-nb.info/gnd/118523392', # Dahn
    'https://d-nb.info/gnd/118527908', # Dürrenmatt
    'https://d-nb.info/gnd/118536109', # Frisch
    'https://d-nb.info/gnd/118585916', # Musil
]

In [4]:
data_authors = pd.DataFrame()
data_authors.index.name = 'GND'

# Uni-Leselisten

### create unileselisten from unileselisten_titles

In [None]:
mode = 'gewichtet'

In [None]:
unileselisten_titles = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/01_unileselisten_titles.csv", sep = ";")

In [None]:
# Filter
unileselisten_titles = unileselisten_titles.query("GND.notna()")
unileselisten_titles = unileselisten_titles[~unileselisten_titles['Autor'].str.startswith('$', na = False)]

In [None]:
unileselisten_titles.head()

Unnamed: 0,Autor,GND,Geschlecht,Titel,Jahreszahl,Jahreszahl_Statistik,Gattung,Modul_Zeit_vor_17.Jh.,Modul_Sprache_(international),Modul_KJL,...,Salzburg,Stuttgart,Stuttgart 2022,Trier,Tübingen,Wien,Wuppertal,Würzburg,Würzburg_2019,Zürich
28,"Hage, Volker (Hg.)",https://d-nb.info/gnd/115807454,,Lyrik für Leser. Deutsche Gedichte der siebzig...,1981,1981.0,Lyrik,,,,...,,,,,,,,,,
52,"Marsch, Edgar (Hg.)",https://d-nb.info/gnd/13345052X,,Moderne deutsche Naturlyrik,1980,1980.0,Lyrik,,,,...,,,,,,,,,,
107,(Pfaffe Konrad),https://d-nb.info/gnd/118565060,m,Rolandslied,12./13. Jh.,1172.0,,MA,,,...,1.0,,,,,,,,,
108,"Abonji, Melinda Nadj",https://d-nb.info/gnd/129396532,w,Tauben fliegen auf,2010,2010.0,Prosa,,,,...,,,,,,,,,,
109,"Achleitner, Friedrich",https://d-nb.info/gnd/119369125,m,Quadratroman,1973,1973.0,Prosa,,,,...,,,,,,,,,,


In [None]:
unileselisten_titles['Autor'] = [x.rstrip() if pd.notna(x) else x for x in unileselisten_titles['Autor']]
unileselisten_titles['Jahreszahl_Statistik'] = unileselisten_titles['Jahreszahl_Statistik'].astype(float)

In [None]:
unileselisten_titles = unileselisten_titles.rename(columns = {
    'Modul_Zeit_vor_17.Jh.' : 'Modul_Zeit',
    'Modul_Sprache_(international)' : 'Modul_Sprache',
    'Modul: KJL' : 'Modul_KJL',
})

In [None]:
unileselisten_dict = {
    'Aachen' : 'Aachen1',
    'Berlin' : 'FU Berlin',
    'Innsbruck' : 'Innsbruck1',
    'Innsbruck 2023' : 'Innsbruck2',
    'Köln Fundamentum' : 'Köln',
    'LA Aachen' : 'Aachen2',
    'Stuttgart' : 'Stuttgart1',
    'Stuttgart 2022' : 'Stuttgart2',
    'Würzburg' : 'Würzburg1',
    'Würzburg_2019' : 'Würzburg2',
}

unileselisten_titles = unileselisten_titles.rename(columns=unileselisten_dict)

In [None]:
exceptions = [
    'Autor', 'GND', 'Geschlecht', 'Titel', 'Jahreszahl', 'Jahreszahl_Statistik', 'Gattung',
    'Modul_Zeit', 'Modul_Sprache', 'Modul_KJL', 'Dekade', 'Jahrhundert', 'sum'
]
unileselisten = [x for x in unileselisten_titles.columns.tolist() if x not in exceptions]
unileselisten_prefix = ['UniLeselisten_'+x for x in unileselisten]

In [None]:
def convert_to_float(frac_str):
    if type(frac_str) == str:
        frac_str = re.sub(",", ".", frac_str)
    try:
        return float(frac_str)
    except ValueError:
        frac_str = re.sub("⁄", "/", frac_str)
        num, denom = frac_str.split('/')
        try:
            leading, num = num.split(' ')
            whole = float(leading)
        except ValueError:
            whole = 0
        frac = float(num) / float(denom)
        return whole - frac if whole < 0 else whole + frac

unileselisten_titles[unileselisten] = unileselisten_titles[unileselisten].applymap(convert_to_float)
unileselisten_titles[unileselisten] = unileselisten_titles[unileselisten].astype(float)

In [None]:
# alles über 0 auf 1 setzen, falls ungewichtet
if mode == 'ungewichtet':
    unileselisten_titles[unileselisten] = unileselisten_titles[unileselisten].notnull().astype('int')

# NaN mit 0 ersetzen
unileselisten_titles[unileselisten] = unileselisten_titles[unileselisten].fillna(0)

In [None]:
# mit mehreren Autor:innen pro Text umgehen
for multi_author_index in unileselisten_titles.query("GND.str.contains('\+')").index:
  multi_author_series = unileselisten_titles.loc[multi_author_index].copy()
  unileselisten_titles = unileselisten_titles.drop(multi_author_index)

  multi_author_authors = [x.strip() for x in multi_author_series['Autor'].split("/")]
  multi_author_GNDs = [x.strip() for x in multi_author_series['GND'].split("+")]

  for single_author, single_GND in zip(multi_author_authors, multi_author_GNDs):
    single_author_series = multi_author_series.copy()
    single_author_series['Autor'] = single_author
    single_author_series['GND'] = single_GND
    # single_author_series[unileselisten] = single_author_series[unileselisten]/len(multi_author_authors)
    unileselisten_titles = pd.concat([unileselisten_titles, pd.DataFrame(single_author_series).T])

In [None]:
unileselisten_authors = pd.DataFrame()
unileselisten_authors.index = unileselisten_titles.drop_duplicates(subset='GND')['GND']
unileselisten_authors['UniLeselisten_Autor'] = unileselisten_titles.drop_duplicates(subset='GND')['Autor'].tolist()
unileselisten_authors[unileselisten_prefix] = unileselisten_titles.groupby('GND')[unileselisten].sum()

In [None]:
# Jeden Wert durch Gesamtzahl aller Titel in jeweiliger Leseiste teilen
unileselisten_authors.loc[:, unileselisten_prefix] = unileselisten_authors.loc[:, unileselisten_prefix].apply(lambda x: x / x.sum())

In [None]:
unileselisten_authors['UniLeselisten_Summe'] = unileselisten_authors[unileselisten_prefix].sum(axis=1)

In [None]:
unileselisten_authors = unileselisten_authors.sort_values(by = 'UniLeselisten_Autor')

In [None]:
this_test_links = [x for x in test_links if x in unileselisten_authors.index]
unileselisten_authors.loc[this_test_links][[
    'UniLeselisten_Autor',
    'UniLeselisten_Göttingen', 'UniLeselisten_Würzburg1',
    'UniLeselisten_Summe'
]]

Unnamed: 0_level_0,UniLeselisten_Autor,UniLeselisten_Göttingen,UniLeselisten_Würzburg1,UniLeselisten_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",0.015625,0.007663,0.426219
https://d-nb.info/gnd/118516906,"Büchner, Georg",0.03125,0.011494,0.649326
https://d-nb.info/gnd/118519859,"Celan, Paul",0.0,0.011494,0.285438
https://d-nb.info/gnd/118523392,"Dahn, Felix",0.0,0.0,0.001267
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",0.015625,0.003831,0.310911
https://d-nb.info/gnd/118536109,"Frisch, Max",0.015625,0.007663,0.332953
https://d-nb.info/gnd/118585916,"Musil, Robert",0.015625,0.015326,0.367633


In [None]:
unileselisten_authors[unileselisten_prefix].sum().head()

UniLeselisten_Aachen2      1.0
UniLeselisten_Aachen1      1.0
UniLeselisten_Augsburg     1.0
UniLeselisten_FU Berlin    1.0
UniLeselisten_Bochum       1.0
dtype: object

In [None]:
unileselisten_authors.shape

(911, 44)

In [None]:
unileselisten_authors.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/01_unileselisten.csv")

### Import and Merge

In [5]:
unileselisten_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/01_unileselisten.csv", index_col = [0])

In [6]:
cols_to_join = unileselisten_authors.columns.difference(data_authors.columns)
data_authors = data_authors.join(unileselisten_authors[cols_to_join], how = 'outer')

In [7]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'UniLeselisten_Autor',
    'UniLeselisten_Göttingen', 'UniLeselisten_Würzburg1',
    'UniLeselisten_Summe',
]]

Unnamed: 0_level_0,UniLeselisten_Autor,UniLeselisten_Göttingen,UniLeselisten_Würzburg1,UniLeselisten_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",0.015625,0.007663,0.426219
https://d-nb.info/gnd/118516906,"Büchner, Georg",0.03125,0.011494,0.649326
https://d-nb.info/gnd/118519859,"Celan, Paul",0.0,0.011494,0.285438
https://d-nb.info/gnd/118523392,"Dahn, Felix",0.0,0.0,0.001267
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",0.015625,0.003831,0.310911
https://d-nb.info/gnd/118536109,"Frisch, Max",0.015625,0.007663,0.332953
https://d-nb.info/gnd/118585916,"Musil, Robert",0.015625,0.015326,0.367633


# Uni-Lehrveranstaltungen

### create lehrveranstaltungen_combined from lehrveranstaltungen

In [None]:
lehrveranstaltungen = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/10_lehrveranstaltungen.csv", sep = ";", index_col=[0])

In [None]:
from ast import literal_eval

def save_literal_eval (input_string):
    try:
        return literal_eval(input_string)
    except:
        return input_string

lehrveranstaltungen["NER_GNDs_Inhalt"] = lehrveranstaltungen["NER_GNDs_Inhalt"].apply(save_literal_eval)
lehrveranstaltungen["NER_GNDs_Titel"] = lehrveranstaltungen["NER_GNDs_Titel"].apply(save_literal_eval)

In [None]:
lehrveranstaltungen_gnd_correction = {
    # wrong         # correct
    '118517902'  : '118517880',  # Busch, Wilhelm
    '116551968'  : '118515586',  # Brockes, Barthold Heinrich
    '118887211'  : '118887203',  # De Man, Paul
    '1023238497' : '118542192',  # Grillparzer, Franz
    '1012609324' : '118936123',  # Lenz, Michael
    '135147255'  : '118572121',  # Lessing, Gotthold Ephraim
}

def correct_gnd_ids(lst):
    if isinstance(lst, list):
        return [lehrveranstaltungen_gnd_correction.get(item, item) for item in lst]
    return lst

lehrveranstaltungen["NER_GNDs_Inhalt"] = lehrveranstaltungen["NER_GNDs_Inhalt"].apply(correct_gnd_ids)
lehrveranstaltungen["NER_GNDs_Titel"] = lehrveranstaltungen["NER_GNDs_Titel"].apply(correct_gnd_ids)

In [None]:
def merge_values(val1, val2):
    if type(val1) == float and type(val2) == float:
        return float('NaN')
    elif type(val1) == float:
        return val2
    elif type(val2) == float:
        return val1
    else:
        return list(set(val1) | set(val2))

lehrveranstaltungen['NER_GNDs_InhaltTitel'] = lehrveranstaltungen.apply(lambda row: merge_values(row['NER_GNDs_Inhalt'], row['NER_GNDs_Titel']), axis=1)

In [None]:
lehrveranstaltungen_authors = pd.DataFrame()

for university in lehrveranstaltungen['Universität'].unique():
  meta_university = lehrveranstaltungen.query("Universität == @university")

  university_gnd_ids = meta_university.query("NER_GNDs_InhaltTitel.notna()")["NER_GNDs_InhaltTitel"]
  university_gnd_ids = [item for sublist in university_gnd_ids.tolist() for item in sublist]
  university_value_counts = pd.Series(university_gnd_ids).value_counts()
  university_value_counts = university_value_counts / meta_university.shape[0] # durch Zahl der Lehrveranstaltungen teilen
  university_df = university_value_counts.to_frame(name=f"Lehrveranstaltungen_{university}")

  lehrveranstaltungen_authors = lehrveranstaltungen_authors.join(university_df, how='outer')

In [None]:
lehrveranstaltungen_authors = lehrveranstaltungen_authors.fillna(0)
lehrveranstaltungen_authors.index = [f"https://d-nb.info/gnd/{x}" for x in lehrveranstaltungen_authors.index]

In [None]:
lehrveranstaltungen_authors['UniLehrveranstaltungen_Summe'] = lehrveranstaltungen_authors.sum(axis=1)

In [None]:
this_test_links = [x for x in test_links if x in lehrveranstaltungen_authors.index]
lehrveranstaltungen_authors.loc[this_test_links][[
    'UniLehrveranstaltungen_Summe'
]]

Unnamed: 0,UniLehrveranstaltungen_Summe
https://d-nb.info/gnd/118505602,0.117337
https://d-nb.info/gnd/118516906,0.05244
https://d-nb.info/gnd/118519859,0.074729
https://d-nb.info/gnd/118527908,0.044193
https://d-nb.info/gnd/118536109,0.023975
https://d-nb.info/gnd/118585916,0.115489


In [None]:
lehrveranstaltungen_authors.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/10_lehrveranstaltungen_combined.csv")

### Import and Merge

In [8]:
lehrveranstaltungen_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/10_lehrveranstaltungen_combined.csv", index_col = [0])

In [9]:
cols_to_join = lehrveranstaltungen_authors.columns.difference(data_authors.columns)
data_authors = data_authors.join(lehrveranstaltungen_authors[cols_to_join], how = 'outer')

In [10]:
this_test_links = [x for x in test_links if x in lehrveranstaltungen_authors.index]
lehrveranstaltungen_authors.loc[this_test_links][[
    'UniLehrveranstaltungen_Summe'
]]

Unnamed: 0,UniLehrveranstaltungen_Summe
https://d-nb.info/gnd/118505602,0.117337
https://d-nb.info/gnd/118516906,0.05244
https://d-nb.info/gnd/118519859,0.074729
https://d-nb.info/gnd/118527908,0.044193
https://d-nb.info/gnd/118536109,0.023975
https://d-nb.info/gnd/118585916,0.115489


# Schul-Leselisten

### create schulleselisten from schulleselisten_entries

In [None]:
schulleselisten_entries = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/02_schulleselisten_entries.csv", sep=";")

In [None]:
schulleselisten_entries = schulleselisten_entries.query("GND.notna()")

In [None]:
schulleselisten_entries.head()

Unnamed: 0,ID,Bundesland,Nachname_original,Nachname,GND,Geschlecht,Titel,Erscheinungsjahr,Erscheinungsjahr_k,Lit.preis_Name,Lit.preis_Autor_in_Name,Jug.literatur
0,2508,Baden-Württemberg,Äsop,Äsop,https://d-nb.info/gnd/118647180,männlich,Fabeln,unbekannt,unbekannt,,,nein
1,2447,Sachsen-Anhalt,Äsop,Äsop,https://d-nb.info/gnd/118647180,männlich,Fabeln,unbekannt,unbekannt,,,nein
2,1778,Baden-Württemberg,Achebe,Achebe,https://d-nb.info/gnd/118646680,männlich,Okonkwo oder Das Alte stürzt,1959,1958,,,nein
3,297,Hessen,Ade,Ade,https://d-nb.info/gnd/138053669,weiblich,Toni Erdmann (Film),2016,2017,"(mehrere Filmpreise, siehe Wikipedia)",,ja
4,3,Hamburg,Ade,Ade,https://d-nb.info/gnd/138053669,weiblich,Toni Erdmann (Film),2016,2016,"(mehrere Filmpreise, siehe Wikipedia)",,ja


In [None]:
bundeslaender = schulleselisten_entries['Bundesland'].unique()
bundeslaender_prefix = ['SchulLeselisten_'+x for x in bundeslaender]

In [None]:
# mit mehreren Autor:innen pro Text umgehen
for multi_author_index in schulleselisten_entries.query("GND.str.contains('\+')").index:
  multi_author_series = schulleselisten_entries.loc[multi_author_index].copy()
  schulleselisten_entries = schulleselisten_entries.drop(multi_author_index)

  multi_author_authors = [x.strip() for x in multi_author_series['Nachname'].split("/")]
  multi_author_GNDs = [x.strip() for x in multi_author_series['GND'].split("+")]

  for single_author, single_GND in zip(multi_author_authors, multi_author_GNDs):
    single_author_series = multi_author_series.copy()
    single_author_series['Nachname'] = single_author
    single_author_series['GND'] = single_GND
    schulleselisten_entries = pd.concat([schulleselisten_entries, pd.DataFrame(single_author_series).T])

In [None]:
schulleselisten_authors = pd.DataFrame()

for gnd_link in tqdm(schulleselisten_entries['GND'].unique()):
  meta_author = schulleselisten_entries.query("GND == @gnd_link")
  schulleselisten_authors.loc[gnd_link, 'SchulLeselisten_Autor'] = meta_author['Nachname'].tolist()[0]
  for bundesland in bundeslaender:
    meta_author_bundesland = meta_author.query("Bundesland == @bundesland")
    schulleselisten_authors.loc[gnd_link, 'SchulLeselisten_'+bundesland] = meta_author_bundesland.shape[0]

schulleselisten_authors = schulleselisten_authors.fillna(0)
schulleselisten_authors.index.name = 'GND'

  0%|          | 0/674 [00:00<?, ?it/s]

In [None]:
# Jeden Wert durch Gesamtzahl aller Titel in jeweiligem Bundesland teilen
schulleselisten_authors.loc[:, bundeslaender_prefix] = schulleselisten_authors.loc[:, bundeslaender_prefix].apply(lambda x: x / x.sum())

In [None]:
schulleselisten_authors['SchulLeselisten_Summe'] = schulleselisten_authors[bundeslaender_prefix].sum(axis=1)

In [None]:
schulleselisten_authors = schulleselisten_authors.sort_values(by = 'SchulLeselisten_Autor')

In [None]:
schulleselisten_authors.query("SchulLeselisten_Autor == 'Fried'")

Unnamed: 0_level_0,SchulLeselisten_Autor,SchulLeselisten_Baden-Württemberg,SchulLeselisten_Sachsen-Anhalt,SchulLeselisten_Hessen,SchulLeselisten_Hamburg,SchulLeselisten_Niedersachsen,SchulLeselisten_Rheinland-Pfalz,SchulLeselisten_Bayern,SchulLeselisten_Sachsen,SchulLeselisten_Saarland,SchulLeselisten_Mecklenburg-Vorpommern,SchulLeselisten_Bremen,SchulLeselisten_Brandenburg,SchulLeselisten_Nordrhein-Westfalen,SchulLeselisten_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
https://d-nb.info/gnd/115688595,Fried,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011628,0.0,0.0,0.0,0.0,0.0,0.011628
https://d-nb.info/gnd/118703145,Fried,0.00271,0.005305,0.0,0.0,0.003584,0.0,0.003968,0.0,0.0,0.0125,0.033333,0.0,0.0,0.061401


In [None]:
this_test_links = [x for x in test_links if x in schulleselisten_authors.index]
schulleselisten_authors.loc[this_test_links][[
    'SchulLeselisten_Autor',
    'SchulLeselisten_Niedersachsen', 'SchulLeselisten_Bayern',
    'SchulLeselisten_Summe'
]]

Unnamed: 0_level_0,SchulLeselisten_Autor,SchulLeselisten_Niedersachsen,SchulLeselisten_Bayern,SchulLeselisten_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://d-nb.info/gnd/118505602,Bachmann,0.003584,0.003968,0.063482
https://d-nb.info/gnd/118516906,Büchner,0.007168,0.003968,0.197066
https://d-nb.info/gnd/118519859,Celan,0.001792,0.003968,0.015218
https://d-nb.info/gnd/118527908,Dürrenmatt,0.008961,0.02381,0.164674
https://d-nb.info/gnd/118536109,Frisch,0.010753,0.011905,0.118126
https://d-nb.info/gnd/118585916,Musil,0.001792,0.003968,0.065484


In [None]:
schulleselisten_authors.shape

(674, 15)

In [None]:
schulleselisten_authors.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/02_schulleselisten.csv")

### Import and Merge

In [11]:
schulleselisten_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/02_schulleselisten.csv", index_col = [0])

In [12]:
cols_to_join = schulleselisten_authors.columns.difference(data_authors.columns)
data_authors = data_authors.join(schulleselisten_authors[cols_to_join], how = 'outer')

In [13]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'SchulLeselisten_Autor',
    'SchulLeselisten_Niedersachsen', 'SchulLeselisten_Bayern',
    'SchulLeselisten_Summe',
]]

Unnamed: 0,SchulLeselisten_Autor,SchulLeselisten_Niedersachsen,SchulLeselisten_Bayern,SchulLeselisten_Summe
https://d-nb.info/gnd/118505602,Bachmann,0.003584,0.003968,0.063482
https://d-nb.info/gnd/118516906,Büchner,0.007168,0.003968,0.197066
https://d-nb.info/gnd/118519859,Celan,0.001792,0.003968,0.015218
https://d-nb.info/gnd/118523392,,,,
https://d-nb.info/gnd/118527908,Dürrenmatt,0.008961,0.02381,0.164674
https://d-nb.info/gnd/118536109,Frisch,0.010753,0.011905,0.118126
https://d-nb.info/gnd/118585916,Musil,0.001792,0.003968,0.065484


In [14]:
data_authors.shape

(4836, 69)

# Abi

### Import and Merge

In [15]:
abi = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/07_abi.csv", sep=";", index_col = [0])

In [16]:
cols_to_join = abi.columns.difference(data_authors.columns)
data_authors = data_authors.join(abi[cols_to_join], how = 'outer')

In [17]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'Abi_Autor',
    'Abi_Summe',
]]

Unnamed: 0,Abi_Autor,Abi_Summe
https://d-nb.info/gnd/118505602,,
https://d-nb.info/gnd/118516906,G. Büchner,33.0
https://d-nb.info/gnd/118519859,,
https://d-nb.info/gnd/118523392,,
https://d-nb.info/gnd/118527908,F. Dürrenmatt,22.0
https://d-nb.info/gnd/118536109,M. Frisch,7.0
https://d-nb.info/gnd/118585916,R. Musil,3.0


# Lexika

### Scrape Killy

In [None]:
!pip3 install google_colab_selenium

Collecting google_colab_selenium
  Downloading google_colab_selenium-1.0.13-py3-none-any.whl (8.1 kB)
Collecting selenium (from google_colab_selenium)
  Downloading selenium-4.21.0-py3-none-any.whl (9.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium->google_colab_selenium)
  Downloading trio-0.25.1-py3-none-any.whl (467 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m467.7/467.7 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium->google_colab_selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium->google_colab_selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium->google_colab_selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from

In [None]:
import google_colab_selenium as gs
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

In [None]:
driver = gs.Chrome()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# log in via SUB Göttingen
driver.get("https://dbis.ur.de/dbinfo/warpto.php?bib_id=subgo&color=4&titel_id=11553&url=http%3A%2F%2FHAN.SUB.UNI-GOETTINGEN.DE%2Fhan%2Fverfasser-datenbank%2F")

username = driver.find_element(By.ID, "plainuser")
username.send_keys("000721344262")

password = driver.find_element(By.ID, "password")
password.send_keys("karen1na")

password.send_keys(Keys.RETURN)

In [None]:
def get_gnd_link_from_vdbo_page (driver):
  gnd_link = float('NaN')

  if driver.find_elements(By.LINK_TEXT, 'GND'):
    gnd_link = driver.find_element(By.LINK_TEXT, 'GND').get_attribute('href')
    if 'http://' in gnd_link:
      gnd_link = re.sub('http://', 'https://', gnd_link)

  return gnd_link

In [None]:
def get_article_content_from_vdbo_page (driver):
  article_content = driver.find_element(By.ID, "text-container").text
  return article_content

In [None]:
def get_name_from_vdbo_page (driver, mode = 'simple'):
    author = float('NaN')

    if mode == 'simple':
      author = driver.find_element(By.TAG_NAME, "h1").text

    if mode == 'full':
      article_content = get_article_content_from_vdbo_page(driver)
      searchresults = re.findall("\\nNamen\\n(.*?)\\n", article_content)
      if len(searchresults) > 0:
          author = searchresults[0]

    return author

In [None]:
def get_source_from_vdbo_page (driver):
    source = float('NaN')

    article_content = get_article_content_from_vdbo_page(driver)

    if "Verfasserlexikon – Die deutsche Literatur des Mittelalters" in article_content:
      source = "Verfasserlexikon MA"
    elif "Verfasserlexikon – Deutscher Humanismus 1480-1520" in article_content:
        source = "Verfasserlexikon Humanismus 1480–1520"
    elif "Verfasserlexikon – Frühe Neuzeit in Deutschland 1520-1620" in article_content:
      source = "Verfasserlexikon FNZ 1520–1620"
    elif "Verfasserlexikon – Frühe Neuzeit in Deutschland 1620-1720" in article_content:
        source = "Verfasserlexikon FNZ 1520–1620"
    elif "Killy Literaturlexikon – Autoren und Werke" in article_content:
      source = "Killy"

    bandnummer = float('NaN')
    bandnummer_a = re.findall("\nBand\n\\d+", article_content)
    bandnummer_b = re.findall("\nBand \\d+", article_content)
    if len(bandnummer_a) > 0:
      bandnummer = bandnummer_a[0].split("\n")[-1]
    if len(bandnummer_b) > 0:
      bandnummer = bandnummer_b[0].split(" ")[-1]

    if pd.notna(source) and pd.notna(bandnummer):
      source = source + ", Bd. " + bandnummer

    return source

In [None]:
killy = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/06_killy.csv", index_col = [0])
# killy = pd.DataFrame()

In [None]:
for i in tqdm(range(1, 7650)): # max: 7645
    killy_id = "{:0>{}}".format(i, 4)
    if 'Killy_id' in killy.columns.tolist() and 'killy_'+killy_id in killy['Killy_id'].tolist():
        continue

    driver.get(f"https://www-1degruyter-1com-1gzs214jk03c1.han.sub.uni-goettingen.de/database/VDBO/entry/vdbo.killy.{killy_id}/html")

    body_text = driver.find_element(By.TAG_NAME, 'body').text
    if "Your access to the De Gruyter site has been temporarily blocked" in body_text:
        print("Access blocked")
        continue

    if "nicht finden, die Sie aufgerufen haben" in body_text or "could not find the page" in body_text:
        print(f"Seite nicht gefunden (Killy-ID: {killy_id})")
        continue

    time.sleep(random.randint(3, 5))

    gnd_link = get_gnd_link_from_vdbo_page(driver)
    if pd.isna(gnd_link):
        gnd_link = 'GND_Placeholder_Killy_' + killy_id
    article_content = get_article_content_from_vdbo_page(driver)
    article_len = 0 if article_content == '' else len(' '.join(article_content.split("\n")).split(" "))
    author_simple = get_name_from_vdbo_page(driver, mode = 'simple')
    author_full = get_name_from_vdbo_page(driver, mode = 'full')
    source = get_source_from_vdbo_page(driver)

    # killy.at[gnd_link, 'Killy_id'] = 'killy_' + killy_id
    # killy.at[gnd_link, 'Killy_simple_Autor'] = author_simple
    # killy.at[gnd_link, 'Killy_full_Autor'] = author_full
    # killy.at[gnd_link, 'Killy_article'] = article_content
    # killy.at[gnd_link, 'Killy_length_Summe'] = article_len
    # killy.at[gnd_link, 'VDBO_source'] = source

    killy_add = pd.DataFrame({
      'Killy_id' : 'killy_' + killy_id,
      'Killy_simple_Autor' : author_simple,
      'Killy_full_Autor' : author_full,
      'Killy_article' : article_content,
      'Killy_length_Summe' : article_len,
      'VDBO_source' : source},
      index = [gnd_link])
    killy = pd.concat([killy, killy_add])

    print(f"{gnd_link:<34} {author_simple} ({article_len} Wörter Killy) ({source})")

  0%|          | 0/7649 [00:00<?, ?it/s]

https://d-nb.info/gnd/116720964    Hensel, Sophie Friederike (613 Wörter Killy) (Killy, Bd. 5)
Seite nicht gefunden (Killy-ID: 7645)
Seite nicht gefunden (Killy-ID: 7646)
Seite nicht gefunden (Killy-ID: 7647)
Seite nicht gefunden (Killy-ID: 7648)
Seite nicht gefunden (Killy-ID: 7649)


In [None]:
killy_gnd_links_correction = {
    # killy/wrong                         # new/correct
    'https://d-nb.info/gnd/1237852404' : 'https://d-nb.info/gnd/118502255', # Altenberg, Peter
    'https://d-nb.info/gnd/1243925175' : 'https://d-nb.info/gnd/118502786', # Andersch, Alfred (2x gleich?)
    'https://d-nb.info/gnd/1153581523' : 'https://d-nb.info/gnd/118507397', # Bauer, Wolfgang
    'https://d-nb.info/gnd/1099922879' : 'https://d-nb.info/gnd/118509861', # Bernhard, Thomas
    'https://d-nb.info/gnd/1174822937' : 'https://d-nb.info/gnd/11548048X', # Beyer, Marcel
    'https://d-nb.info/gnd/1208457071' : 'https://d-nb.info/gnd/118510649', # Bichsel, Peter
    'https://d-nb.info/gnd/1095799150' : 'https://d-nb.info/gnd/118513435', # Borchardt, Rudolf
    'https://d-nb.info/gnd/133726185'  : 'https://d-nb.info/gnd/118514512', # Brasch, Thomas
    'https://d-nb.info/gnd/1138454273' : 'https://d-nb.info/gnd/118514644', # Braun, Volker
    'https://d-nb.info/gnd/138268665'  : 'https://d-nb.info/gnd/11851587X', # Brückner, Christine
    'https://d-nb.info/gnd/133328562'  : 'https://d-nb.info/gnd/118516906', # Büchner, Georg
    'https://d-nb.info/gnd/7512202-9'  : 'https://d-nb.info/gnd/118516477', # Buber, Martin
    'https://d-nb.info/gnd/1072431262' : 'https://d-nb.info/gnd/118520512', # Chodowiecki, Daniel
    'https://d-nb.info/gnd/1096198584' : 'https://d-nb.info/gnd/118523392', # Dahn, Felix
    'https://d-nb.info/gnd/12155645X'  : 'https://d-nb.info/gnd/121550656', # Damm, Sigrid
    'https://d-nb.info/gnd/1027187870' : 'https://d-nb.info/gnd/119286289', # Elsner, Gisela
    'https://d-nb.info/gnd/174023464'  : 'https://d-nb.info/gnd/118530259', # Ende, Michael
    'https://d-nb.info/gnd/1102938378' : 'https://d-nb.info/gnd/119177773', # Endler, Adolf
    'https://d-nb.info/gnd/1037518098' : 'https://d-nb.info/gnd/118534793', # Frank, Leonhard
    'https://d-nb.info/gnd/121309037'  : 'https://d-nb.info/gnd/118535455', # Freytag, Gustav
    'https://d-nb.info/gnd/1013841387' : 'https://d-nb.info/gnd/119523604', # Funke, Cornelia
    'https://d-nb.info/gnd/1067158162' : 'https://d-nb.info/gnd/115612815', # Geiger, Arno
    'https://d-nb.info/gnd/1173437932' : 'https://d-nb.info/gnd/118538659', # Gerhardt, Paul
    'https://d-nb.info/gnd/1073358569' : 'https://d-nb.info/gnd/118539604', # Glaeser, Ernst
    'https://d-nb.info/gnd/134391314'  : 'https://d-nb.info/gnd/118542265', # Grimm, Wilhelm
    'https://d-nb.info/gnd/1159367094' : 'https://d-nb.info/gnd/118179527', # Haas, Wolf
    'https://d-nb.info/gnd/124939031'  : 'https://d-nb.info/gnd/118544330', # Hacks, Peter
    'https://d-nb.info/gnd/134840070'  : 'https://d-nb.info/gnd/118701606', # Harig, Ludwig
    'https://d-nb.info/gnd/1214200532' : 'https://d-nb.info/gnd/119549867', # Haslinger, Josef
    'https://d-nb.info/gnd/188380329'  : 'https://d-nb.info/gnd/118840991', # Hein, Christoph
    'https://d-nb.info/gnd/1176824139' : 'https://d-nb.info/gnd/118881280', # Hessel, Franz
    'https://d-nb.info/gnd/128686723'  : 'https://d-nb.info/gnd/118551108', # Hille, Peter
    'https://d-nb.info/gnd/7505054-7'  : 'https://d-nb.info/gnd/118706462', # Holl, Elias
    'https://d-nb.info/gnd/1153638819' : 'https://d-nb.info/gnd/119535467', # Hoppe, Felicitas
    'https://d-nb.info/gnd/1205166688' : 'https://d-nb.info/gnd/118557211', # Jean Paul (2x gleich?)
    'https://d-nb.info/gnd/124392585X' : 'https://d-nb.info/gnd/118776592', # Kaleko, Mascha (2x gleich?)
    'https://d-nb.info/gnd/115583920'  : 'https://d-nb.info/gnd/11856109X', # Keller, Gottfried
    'https://d-nb.info/gnd/1243920904' : 'https://d-nb.info/gnd/118561359', # Kempowski, Walter
    'https://d-nb.info/gnd/1230830219' : 'https://d-nb.info/gnd/118562487', # Kirsch, Sarah
    'https://d-nb.info/gnd/7513003-8'  : 'https://d-nb.info/gnd/118562827', # Klee, Paul
    'https://d-nb.info/gnd/1243890681' : 'https://d-nb.info/gnd/118563491', # Kluge, Alexander
    'https://d-nb.info/gnd/1230518142' : 'https://d-nb.info/gnd/118958836', # Königsdorf, Helga
    'https://d-nb.info/gnd/1158911440' : 'https://d-nb.info/gnd/137741189', # Krattner, Franz (2x gleich?)
    'https://d-nb.info/gnd/141148470'  : 'https://d-nb.info/gnd/118715917', # Kretzer, Max
    'https://d-nb.info/gnd/1130190293' : 'https://d-nb.info/gnd/11871595X', # Kreuder, Ernst
    'https://d-nb.info/gnd/1146891407' : 'https://d-nb.info/gnd/11899011X', # Kühn, Dieter
    'https://d-nb.info/gnd/1067155724' : 'https://d-nb.info/gnd/118568051', # Kunert, Günter
    'https://d-nb.info/gnd/1243928654' : 'https://d-nb.info/gnd/118568124', # Kunze, Reiner (2x gleich?)
    'https://d-nb.info/gnd/1243922230' : 'https://d-nb.info/gnd/118570285', # Lavant, Christine (2x gleich?)
    'https://d-nb.info/gnd/1140251791' : 'https://d-nb.info/gnd/118571095', # Lehmann, Wilhelm
    'https://d-nb.info/gnd/1024949737' : 'https://d-nb.info/gnd/119560526', # Lehr, Thomas
    'https://d-nb.info/gnd/1047511703' : 'https://d-nb.info/gnd/118832891', # Lichtenstein, Alfred
    'https://d-nb.info/gnd/17408739X'  : 'https://d-nb.info/gnd/122418980', # Maier, Andreas
    'https://d-nb.info/gnd/110222992X' : 'https://d-nb.info/gnd/118578251', # Marti, Kurt
    'https://d-nb.info/gnd/1244171158' : 'https://d-nb.info/gnd/118577158', # Mann, Klaus
    'https://d-nb.info/gnd/116855258'  : 'https://d-nb.info/gnd/118732048', # Maurer, Georg
    'https://d-nb.info/gnd/1078020566' : 'https://d-nb.info/gnd/118818651', # May, Karl
    'https://d-nb.info/gnd/1219085529' : 'https://d-nb.info/gnd/120020513', # Meinecke, Thomas
    'https://d-nb.info/gnd/14296378X'  : 'https://d-nb.info/gnd/118031198', # Merz, Klaus
    'https://d-nb.info/gnd/1200484592' : 'https://d-nb.info/gnd/11858314X', # Möser, Justus
    'https://d-nb.info/gnd/1025857100' : 'https://d-nb.info/gnd/118585193', # Müller, Robert
    'https://d-nb.info/gnd/1158811195' : 'https://d-nb.info/gnd/118585878', # Murnau, Friedrich Wilhelm
    'https://d-nb.info/gnd/117698005X' : 'https://d-nb.info/gnd/118585916', # Musil, Robert
    'https://d-nb.info/gnd/1073434419' : 'https://d-nb.info/gnd/118587331', # Neumann, Robert
    'https://d-nb.info/gnd/130331511'  : 'https://d-nb.info/gnd/118587668', # Nicolai, Friedrich
    'https://d-nb.info/gnd/1043508473' : 'https://d-nb.info/gnd/118590111', # Opitz, Martin
    'https://d-nb.info/gnd/1019727411' : 'https://d-nb.info/gnd/118594605', # Pirckheimer, Willibald
    'https://d-nb.info/gnd/1243920939' : 'https://d-nb.info/gnd/119330490', # Recheis, Käthe (2x gleich?)
    'https://d-nb.info/gnd/1184512086' : 'https://d-nb.info/gnd/118744690', # Reuter, Christian
    'https://d-nb.info/gnd/1050321308' : 'https://d-nb.info/gnd/118599976', # Reuter, Fritz
    'https://d-nb.info/gnd/1144967775' : 'https://d-nb.info/gnd/122400259', # Richter, Julia
    'https://d-nb.info/gnd/1147495351' : 'https://d-nb.info/gnd/118602667', # Rosegger, Peter
    'https://d-nb.info/gnd/7512201-7'  : 'https://d-nb.info/gnd/118602802', # Rosenzweig, Franz
    'https://d-nb.info/gnd/1080243968' : 'https://d-nb.info/gnd/119202824', # Roth, Friederike
    'https://d-nb.info/gnd/116638249'  : 'https://d-nb.info/gnd/118603140', # Roth, Joseph
    'https://d-nb.info/gnd/1139921320' : 'https://d-nb.info/gnd/118603817', # Rückert, Friedrich
    'https://d-nb.info/gnd/173627587'  : 'https://d-nb.info/gnd/118604597', # Sachs, Hans
    'https://d-nb.info/gnd/7502612-0'  : 'https://d-nb.info/gnd/118607782', # Schinkel, Karl Friedrich
    'https://d-nb.info/gnd/1187369802' : 'https://d-nb.info/gnd/11875968X', # Schmidt, Julian
    'https://d-nb.info/gnd/1055490981' : 'https://d-nb.info/gnd/11860922X', # Schmitt, Carl
    'https://d-nb.info/gnd/173845614'  : 'https://d-nb.info/gnd/119279487', # Schneider, Robert
    'https://d-nb.info/gnd/1022708112' : 'https://d-nb.info/gnd/119013517', # Schütz, Stefan
    'https://d-nb.info/gnd/7508478-8'  : 'https://d-nb.info/gnd/118613154', # Semper, Gottfried
    'https://d-nb.info/gnd/124389170X' : 'https://d-nb.info/gnd/118614444', # Simmel, Mario (2x gleich?)
    'https://d-nb.info/gnd/120990415'  : 'https://d-nb.info/gnd/119228408', # Steiner, Jörg
    'https://d-nb.info/gnd/1055487859' : 'https://d-nb.info/gnd/10127226X', # Heinrich von Sax
    'https://d-nb.info/gnd/1124708286' : 'https://d-nb.info/gnd/118758675', # Schneider, Peter
    'https://d-nb.info/gnd/1146806442' : 'https://d-nb.info/gnd/118625063', # Uhland, Ludwig
    'https://d-nb.info/gnd/17014142X'  : 'https://d-nb.info/gnd/118768395', # Viertel, Berthold
    'https://d-nb.info/gnd/124428800'  : 'https://d-nb.info/gnd/118594117', # Wagner, Richard
    'https://d-nb.info/gnd/131977474'  : 'https://d-nb.info/gnd/118628852', # Walser, Martin
    'https://d-nb.info/gnd/135327369'  : 'https://d-nb.info/gnd/118629867', # Wedekind, Frank .
    'https://d-nb.info/gnd/1235001695' : 'https://d-nb.info/gnd/118630369', # Weise, Christian
    'https://d-nb.info/gnd/1102928240' : 'https://d-nb.info/gnd/118630539', # Weiss, Peter
    'https://d-nb.info/gnd/1020534532' : 'https://d-nb.info/gnd/120295245', # Werner, Markus
    'https://d-nb.info/gnd/1074731123' : 'https://d-nb.info/gnd/118632264', # Wickert, Erwin
    'https://d-nb.info/gnd/1235988406' : 'https://d-nb.info/gnd/118632477', # Wieland, Christoph Martin (2x gleich?)
    'https://d-nb.info/gnd/117404144'  : 'https://d-nb.info/gnd/118881140', # Winkler, Josef
    'https://d-nb.info/gnd/128640812'  : 'https://d-nb.info/gnd/118634666', # Wolf, Christa
    'https://d-nb.info/gnd/189566132'  : 'https://d-nb.info/gnd/11811946X', # Zimmer, Dieter
    'https://d-nb.info/gnd/132022958'  : 'https://d-nb.info/gnd/118773186', # Zorn, Fritz
    'https://d-nb.info/gnd/174167393'  : 'https://d-nb.info/gnd/118580949', # Menzel, Wolfgang
    'https://d-nb.info/gnd/1055256237' : 'https://d-nb.info/gnd/142625027', # Schmidt, Erich
}
killy = killy.rename(index=killy_gnd_links_correction)

In [None]:
killy = killy.sort_values(by='Killy_id')
killy.index.name = 'GND'

In [None]:
killy.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/06_killy.csv")

### Import and Merge

In [18]:
killy = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/06_killy.csv", index_col = [0])

In [19]:
killy_join = killy.copy()

In [20]:
# Seyler, Friederike Sophie + Hensel, Sophie Friederike = gleiche Person
seyler_hensel = killy_join.loc['https://d-nb.info/gnd/116720964']
if seyler_hensel.shape[0] == 2:
  seyler_hensel_unified = seyler_hensel.iloc[0].copy()
  seyler_hensel_unified['Killy_length_Summe'] = seyler_hensel['Killy_length_Summe'].sum()
  seyler_hensel_unified['Killy_simple_Autor'] = ' + '.join(seyler_hensel['Killy_simple_Autor'])
  seyler_hensel_unified['Killy_full_Autor'] = ' + '.join(seyler_hensel['Killy_full_Autor'])

  killy_join = killy_join.drop('https://d-nb.info/gnd/116720964')
  killy_join = pd.concat([killy_join, pd.DataFrame(seyler_hensel_unified).T])

In [21]:
killy_join = killy_join.loc[[x for x in killy_join.index if pd.notna(x) and 'd-nb' in x]].copy()
killy_join = killy_join.drop("Killy_article", axis = 'columns')

In [22]:
lexika = killy_join.copy()
lexika['Lexika_Summe'] = lexika['Killy_length_Summe']

In [23]:
cols_to_join = lexika.columns.difference(data_authors.columns)
data_authors = data_authors.join(lexika[cols_to_join], how = 'outer')

In [24]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'Killy_simple_Autor',
    'Killy_length_Summe',
    'Lexika_Summe'
]]

Unnamed: 0,Killy_simple_Autor,Killy_length_Summe,Lexika_Summe
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",2472.0,2472.0
https://d-nb.info/gnd/118516906,"Büchner, Georg",3260.0,3260.0
https://d-nb.info/gnd/118519859,"Celan, Paul",2998.0,2998.0
https://d-nb.info/gnd/118523392,"Dahn, Felix",953.0,953.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",4299.0,4299.0
https://d-nb.info/gnd/118536109,"Frisch, Max",3088.0,3088.0
https://d-nb.info/gnd/118585916,"Musil, Robert",3571.0,3571.0


# Verlagsreihen

### Scrape Reclam

In [None]:
import requests
from bs4 import BeautifulSoup
import re

In [None]:
def get_response_from_gnd (gnd_url):
    return requests.get(gnd_url)

In [None]:
def get_gnd_link_from_gnd_response (response):
    gnd_link = float('nan')

    soup = BeautifulSoup(response.text, "html.parser")
    td_elements = soup.findAll('td')

    for i, td_element in enumerate(td_elements):
        if td_element.strong != None and td_element.strong.string == 'Link zu diesem Datensatz':
            gnd_link = td_elements[i+1].text.strip()

    return gnd_link

In [None]:
def get_ub_from_gnd_response (response):
    ub = []

    soup = BeautifulSoup(response.text, "html.parser")
    td_elements = soup.findAll('td')

    for i, td_element in enumerate(td_elements):
        if td_element.strong != None and td_element.strong.string == 'Beziehungen':
            ub = td_elements[i+1].text.strip()
            ub = re.search(r'Reclam.*?(\d+)', ub)
            if ub:
              ub = ub.group(1)
            else:
              ub = '?'

    return ub

In [None]:
def get_title_from_gnd_response (response):
    title = float('nan')

    soup = BeautifulSoup(response.text, "html.parser")
    td_elements = soup.findAll('td')

    for i, td_element in enumerate(td_elements):
        if td_element.strong != None and td_element.strong.string == 'Titel':
            title = td_elements[i+1].text.strip()

    return title

In [None]:
def get_time_from_gnd_response (response):
    time = float('NaN')

    soup = BeautifulSoup(response.text, "html.parser")
    td_elements = soup.findAll('td')

    for i, td_element in enumerate(td_elements):
        if td_element.strong != None and td_element.strong.string == 'Zeitliche Einordnung':
            time = td_elements[i+1].text.strip()
            time = re.findall("\\d+", time)
            if time:
              time = time[0]

    return time

In [None]:
def get_personen_from_gnd_response (response):
    personen = []

    soup = BeautifulSoup(response.text, "html.parser")
    td_elements = soup.findAll('td')

    for i, td_element in enumerate(td_elements):
        if td_element.strong != None and td_element.strong.string == 'Person(en)':
            personen = td_elements[i+1].text.strip()
            personen = personen.split(")")
            personen = [x+")" for x in personen[:-1]]

    return personen

In [None]:
def get_personen_gnd_links_from_gnd_response (response, mode = 'all'):
    personen_links = []

    soup = BeautifulSoup(response.text, "html.parser")
    td_elements = soup.findAll('td')

    for i, td_element in enumerate(td_elements):
        if td_element.strong != None and td_element.strong.string == 'Person(en)':
            td_element = td_elements[i+1]
            a_elements = td_element.findAll('a')
            for a_element in a_elements:
                if mode == 'verfasser' and '(Verfasser)' not in str(a_element):
                    continue
                personen_link = re.findall('(?<=idn%3D)\\d+.?(?=">)', str(a_element))
                if personen_link:
                    personen_link = "https://d-nb.info/gnd/"+personen_link[0]
                    personen_links.append(personen_link)

    return personen_links

In [None]:
def get_sachgruppen_from_gnd_response (response):
    sachgruppen = []

    soup = BeautifulSoup(response.text, "html.parser")
    td_elements = soup.findAll('td')

    for i, td_element in enumerate(td_elements):
        if td_element.strong != None and td_element.strong.string == 'Sachgruppe(n)':
            sachgruppen = td_elements[i+1].text.strip()
            sachgruppen = sachgruppen.split(" ; ")

    return sachgruppen

In [None]:
reclam_titles = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/05_reclam_titles.csv", index_col = [0])
# reclam_titles = pd.DataFrame()

In [None]:
for i in tqdm(range(0,14340)): # max: 14336 (31. Mai 2024)
    search_id = "{:0>{}}".format(i, 4)
    if 'Searchresult_ID' in reclam_titles.columns and 'Searchresult_' + search_id in reclam_titles['Searchresult_ID'].tolist():
      continue

    time.sleep(random.randint(0, 3))

    reclam_book = pd.DataFrame()
    try:
      # Nach "partOf=010784632", Ergebnisse zeitaufsteigend sortieren, Ergebnisse durchklicken
      # https://portal.dnb.de/opac/simpleSearch?query=partOf%3D010784632&cqlMode=true&sortOrderIndex=jhr_asc
      response = requests.get(f"https://portal.dnb.de/opac/showFullRecord?currentResultId=partOf%3D010784632+sortBy+jhr%2Fsort.ascending%26any&currentPosition={i}")

      searchresult_gnd = get_gnd_link_from_gnd_response(response)
      searchresult_ub = get_ub_from_gnd_response(response)
      searchresult_jahr = get_time_from_gnd_response(response)
      searchresult_title = get_title_from_gnd_response(response)
      personen = get_personen_from_gnd_response(response)
      personen_gnd_links = get_personen_gnd_links_from_gnd_response(response)
      verfasser = re.sub("\\(Verfasser\\)", "", ' + '.join([x for x in personen if '(Verfasser)' in x]))
      verfasser_gnd_links = get_personen_gnd_links_from_gnd_response(response, mode = 'verfasser')
      sachgruppen = get_sachgruppen_from_gnd_response(response)

      reclam_book.at[i, 'Searchresult_ID'] = 'Searchresult_' + search_id
      reclam_book.at[i, 'Searchresult_GND'] = searchresult_gnd
      reclam_book.at[i, 'UB'] = 'UB ' + searchresult_ub
      reclam_book.at[i, 'Jahr'] = searchresult_jahr
      reclam_book.at[i, 'Titel'] = searchresult_title
      reclam_book.at[i, 'Personen'] = ' + '.join(personen)
      reclam_book.at[i, 'Personen_GND'] = ' + '.join(personen_gnd_links)
      reclam_book.at[i, 'Verfasser'] = verfasser
      reclam_book.at[i, 'Verfasser_GND'] = ' + '.join(verfasser_gnd_links)
      reclam_book.at[i, 'Sachgruppen'] = ' + '.join(sachgruppen)
      reclam_titles = pd.concat([reclam_titles, reclam_book])
      print(f"{search_id} {searchresult_gnd:<30} UB {searchresult_ub:<10} {verfasser:<35} {' + '.join(verfasser_gnd_links)}")

    except:
      print(f"Fehler: Search {i}")

  0%|          | 0/14340 [00:00<?, ?it/s]

Fehler: Search 14336
Fehler: Search 14337
Fehler: Search 14338
Fehler: Search 14339


In [None]:
reclam_titles = reclam_titles.sort_index()

In [None]:
reclam_titles.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/05_reclam_titles.csv")

### create reclam from reclam_titles

In [None]:
reclam_titles = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/05_reclam_titles.csv", index_col = [0])

In [None]:
reclam_titles_filtered = (
    reclam_titles
    .query("Jahr >= 2000")
    .query("Sachgruppen.str.contains('Belletristik', na=False)")
)

In [None]:
reclam = pd.DataFrame()

In [None]:
reclam_all_authors_gnds = reclam_titles_filtered['Verfasser_GND'].copy()
reclam_all_authors_gnds = [x for x in reclam_all_authors_gnds if pd.notna(x) and len(x) > 1]
reclam_all_authors_gnds = [x.split(" + ") for x in reclam_all_authors_gnds]
reclam_all_authors_gnds = [item for sublist in reclam_all_authors_gnds for item in sublist]
reclam_all_authors_gnds = set(reclam_all_authors_gnds)

In [None]:
for reclam_author_gnd in tqdm(reclam_all_authors_gnds):
  reclam_title_author = reclam_titles_filtered.query("Verfasser_GND.str.contains(@reclam_author_gnd, na = False)")
  reclam.at[reclam_author_gnd, 'Reclam_Autor'] = reclam_title_author['Verfasser'].value_counts().index[0]
  reclam.at[reclam_author_gnd, 'Reclam_tokens_Summe'] = reclam_title_author.shape[0]
  reclam.at[reclam_author_gnd, 'Reclam_types_Summe'] = reclam_title_author.drop_duplicates(subset='UB').shape[0]
  reclam.at[reclam_author_gnd, 'Reclam_UBs'] = ' + '.join(reclam_title_author['UB'])

  0%|          | 0/530 [00:00<?, ?it/s]

In [None]:
reclam.index.name = 'GND'
reclam = reclam.sort_values(by = 'Reclam_Autor')

In [None]:
reclam.nlargest(5, "Reclam_types_Summe").head()

Unnamed: 0_level_0,Reclam_Autor,Reclam_tokens_Summe,Reclam_types_Summe,Reclam_UBs
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
https://d-nb.info/gnd/118540238,"Goethe, Johann Wolfgang von",61.0,45.0,UB 83 + UB 11157 + UB 67 + UB 61 + UB 71 + UB ...
https://d-nb.info/gnd/118613723,"Shakespeare, William",65.0,43.0,UB 1707 + UB 9220 + UB 17 + UB 31 + UB 5 + UB ...
https://d-nb.info/gnd/118552465,"Hoffmann, E. T. A.",39.0,26.0,UB 153 + UB 192 + UB 5623 + UB 25 + UB 230 + U...
https://d-nb.info/gnd/118607626,"Schiller, Friedrich",44.0,24.0,UB 12 + UB 8891 + UB 64 + UB 33 + UB 18061 + U...
https://d-nb.info/gnd/118534262,"Fontane, Theodor",29.0,23.0,UB 9487 + UB 8577 + UB 6961 + UB 20063 + UB 76...


In [None]:
reclam.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/05_reclam.csv")

### Import and Merge

In [25]:
reclam = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/05_reclam.csv", index_col = [0])

In [26]:
verlagsreihen = reclam.copy()
verlagsreihen['Verlagsreihen_types_Summe'] = verlagsreihen['Reclam_types_Summe']
verlagsreihen['Verlagsreihen_tokens_Summe'] = verlagsreihen['Reclam_tokens_Summe']

In [27]:
cols_to_join = verlagsreihen.columns.difference(data_authors.columns)
data_authors = data_authors.join(verlagsreihen[cols_to_join], how = 'outer')

In [28]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'Reclam_Autor',
    'Reclam_tokens_Summe', 'Reclam_types_Summe', 'Reclam_UBs',
    'Verlagsreihen_types_Summe'
]]

Unnamed: 0,Reclam_Autor,Reclam_tokens_Summe,Reclam_types_Summe,Reclam_UBs,Verlagsreihen_types_Summe
https://d-nb.info/gnd/118505602,,,,,
https://d-nb.info/gnd/118516906,"Büchner, Georg",17.0,11.0,UB 7733 + UB 7733 + UB 7955 + UB 6060 + UB 182...,11.0
https://d-nb.info/gnd/118519859,,,,,
https://d-nb.info/gnd/118523392,,,,,
https://d-nb.info/gnd/118527908,,,,,
https://d-nb.info/gnd/118536109,,,,,
https://d-nb.info/gnd/118585916,"Musil, Robert",7.0,6.0,UB 18990 + UB 18797 + UB 18789 + UB 18991 + UB...,6.0


# Literaturgeschichten

### Import and Merge

In [29]:
beutin = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/literaturgeschichten/final/beutin.csv", index_col=[0], sep=";")
beutin = beutin.dropna(axis=0, how='any', subset=['GND-Nummer'])
beutin.index = 'https://d-nb.info/gnd/' + beutin['GND-Nummer']
beutin = beutin[['Eintrag', 'Seiten', 'Anzahl_Seiten_neu']]
beutin = beutin.rename(columns={'Anzahl_Seiten_neu' : 'Anzahl_Seiten_abs'})
beutin['Anzahl_Seiten_rel'] = beutin['Anzahl_Seiten_abs']/beutin['Anzahl_Seiten_abs'].sum()

brenner = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/literaturgeschichten/final/brenner.csv", index_col=[0], sep=";")
brenner = brenner.dropna(axis=0, how='any', subset=['GND-Nummer'])
brenner.index = 'https://d-nb.info/gnd/' + brenner['GND-Nummer']
brenner = brenner[['Eintrag', 'Seiten', 'Anzahl_Seiten_neu']]
brenner = brenner.rename(columns={'Anzahl_Seiten_neu' : 'Anzahl_Seiten_abs'})
brenner['Anzahl_Seiten_rel'] = brenner['Anzahl_Seiten_abs']/brenner['Anzahl_Seiten_abs'].sum()

In [30]:
litgesch = beutin.join(brenner, how='outer', lsuffix='_Beutin', rsuffix='_Brenner')
litgesch['Litgesch_Seiten_Summe'] = (litgesch['Anzahl_Seiten_rel_Beutin']+litgesch['Anzahl_Seiten_rel_Brenner'])/2

In [31]:
litgesch.head()

Unnamed: 0_level_0,Eintrag_Beutin,Seiten_Beutin,Anzahl_Seiten_abs_Beutin,Anzahl_Seiten_rel_Beutin,Eintrag_Brenner,Seiten_Brenner,Anzahl_Seiten_abs_Brenner,Anzahl_Seiten_rel_Brenner,Litgesch_Seiten_Summe
GND-Nummer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
https://d-nb.info/gnd/100019382,"Avancini, Nicolaus von",133,1.0,0.000254,,,,,
https://d-nb.info/gnd/100085296,,,,,"Clavie`re, Etienne 1735 1793",87.0,1.0,0.000579,
https://d-nb.info/gnd/100170307,"La Calprenède, Gautier Coste de","118, 145",2.0,0.000508,,,,,
https://d-nb.info/gnd/100308023,"Meißner, August Gottlieb",288,1.0,0.000254,,,,,
https://d-nb.info/gnd/100352952,,,,,"Haugwitz, August Adolf von 1645 1707",42.0,1.0,0.000579,


In [32]:
cols_to_join = litgesch.columns.difference(data_authors.columns)
data_authors = data_authors.join(litgesch[cols_to_join], how = 'outer')

In [34]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'Eintrag_Beutin', 'Eintrag_Brenner', 'UniLeselisten_Autor',
    'Anzahl_Seiten_abs_Beutin', 'Anzahl_Seiten_abs_Brenner',
]]

Unnamed: 0,Eintrag_Beutin,Eintrag_Brenner,UniLeselisten_Autor,Anzahl_Seiten_abs_Beutin,Anzahl_Seiten_abs_Brenner
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg","Bachmann, Ingeborg 1926 1973","Bachmann, Ingeborg",15.0,2.0
https://d-nb.info/gnd/118516906,"Büchner, Georg","Büchner, Georg 1813 1837","Büchner, Georg",30.0,5.0
https://d-nb.info/gnd/118519859,"Celan, Paul","Celan, Paul eigentl. P. Anczel 1920 1970","Celan, Paul",6.0,1.0
https://d-nb.info/gnd/118523392,"Dahn, Felix","Dahn, Felix 1834 1912","Dahn, Felix",2.0,1.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich","Dürrenmatt, Friedrich 1921 1990","Dürrenmatt, Friedrich",5.0,2.0
https://d-nb.info/gnd/118536109,"Frisch, Max","Frisch, Max 1911 1991","Frisch, Max",12.0,2.0
https://d-nb.info/gnd/118585916,"Musil, Robert","Musil, Robert Edler von 1880 1942","Musil, Robert",8.0,3.0


# GND

### Scrape

In [None]:
import requests
from bs4 import BeautifulSoup
import re

In [None]:
def get_json_from_gnd_url (gnd_url):
    gnd_id = re.findall("[0-9]+X?", gnd_url)

    if len(gnd_id) is not 1:
        print(f"wrong url : {gnd_url}")
        return float('NaN')
    else:
        lobid_url = "https://lobid.org/gnd/"+gnd_id[0]+".json"
        return requests.get(lobid_url).json()

  if len(gnd_id) is not 1:


In [None]:
# data_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/data_authors.csv", index_col = [0])

gnd = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/00_gnd.csv", index_col = [0])
# gnd = pd.DataFrame()

links_to_scrape = [x for x in data_authors.index if x not in gnd.index and 'd-nb' in x]
# links_to_scrape = data_authors.sample(n=10).index
# links_to_scrape = test_links

In [None]:
for i, gnd_link in enumerate(tqdm(links_to_scrape)):
  try:
    response = get_json_from_gnd_url(gnd_link)
    time.sleep(random.randint(0, 1)/2)
  except:
    status_report = f"{gnd_link:<35} (wrong link)"

  try:
    gnd.at[gnd_link, 'GND_Autor'] = response['preferredName']
    status_report = f"{gnd_link:<35} {gnd.at[gnd_link, 'GND_Autor']}"
  except:
    status_report = f"{gnd_link:<35} (Fehler: Autor)"

  try:
    gnd.at[gnd_link, 'GND_Gender'] = response['gender'][0]['label']
  except:
    status_report = status_report + " (Fehler: Gender)"

  try:
    if 'dateOfBirth' in response:
      gnd.at[gnd_link, 'GND_Geburtsjahr'] = int(response['dateOfBirth'][0][:4])
    if 'dateOfDeath' in response:
      gnd.at[gnd_link, 'GND_Sterbejahr'] = int(response['dateOfDeath'][0][:4])
  except:
    status_report = status_report + " (Fehler: Geburtsjahr/Sterbejahr)"

  try:
    gnd.at[gnd_link, 'GND_Laender'] = ' + '.join(sorted([x['label'] for x in response['geographicAreaCode']]))
  except:
    status_report = status_report + " (Fehler: Länder)"

  try:
    gnd.at[gnd_link, 'GND_Berufe'] = ' + '.join(sorted([x['label'] for x in response['professionOrOccupation']]))
  except:
    status_report = status_report + " (Fehler: Berufe)"

  print(status_report)

  if i > 20 and i%50 == 0:
    gnd.index.name = 'GND'
    gnd = gnd.sort_values(by = 'GND_Autor')
    gnd.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/00_gnd.csv")

  0%|          | 0/400 [00:00<?, ?it/s]

https://d-nb.info/gnd/100085296     Clavière, Étienne
https://d-nb.info/gnd/100170307     La Calprenède, Gautier de Coste de
https://d-nb.info/gnd/1011155311    Krey, Franz
https://d-nb.info/gnd/1011656140    Frauendorfer, Helmuth
https://d-nb.info/gnd/101245009     Riedenburg, Burggraf von
https://d-nb.info/gnd/101245041     Regensburg, Burggraf von
https://d-nb.info/gnd/1012809900    Weber, Hasko
https://d-nb.info/gnd/101416016     Friedrich Christian II., Schleswig-Holstein-Sonderburg-Augustenburg, Herzog (Fehler: Berufe)
https://d-nb.info/gnd/1018480358    Röhler, Hans-Joachim
https://d-nb.info/gnd/1036523853    Chain, Mark
https://d-nb.info/gnd/1037425286    Mazzi, Lisa
https://d-nb.info/gnd/104164999     Widmann, Georg Rudolf
https://d-nb.info/gnd/104234644     Gauß, Carl Friedrich
https://d-nb.info/gnd/1044300825    Melle, Fritz Hendrick
https://d-nb.info/gnd/104734612     Szepansky, Gerda
https://d-nb.info/gnd/105051355X    Wünsche, Günter
https://d-nb.info/gnd/105404101     Eg

In [None]:
gnd.index.name = 'GND'
gnd = gnd.sort_values(by = 'GND_Autor')

In [None]:
laender = [str(x).split(" + ") for x in gnd['GND_Laender']]
laender = [item for sublist in laender for item in sublist]
print(pd.Series(laender).value_counts())

laender_deutsch = ['Deutschland', 'Österreich', 'Schweiz']
GND_deutsch = []
for x in gnd['GND_Laender']:
  if pd.isna(x):
    GND_deutsch.append(x)
  elif any(land in str(x) for land in laender_deutsch):
    GND_deutsch.append(True)
  else:
    GND_deutsch.append(False)
gnd['GND_deutsch'] = GND_deutsch

Deutschland (XA-DE)       8132
Österreich (XA-AT)        1364
Schweiz (XA-CH)            879
USA (XD-US)                684
Frankreich (XA-FR)         594
                          ... 
Kirgisien (XB-KG)            1
Eritrea (XC-ER)              1
Elfenbeinküste (XC-CI)       1
Südafrika                    1
Jamaika (XD-JM)              1
Name: count, Length: 172, dtype: int64


In [None]:
berufe = [str(x).split(" + ") for x in gnd['GND_Berufe']]
berufe = [item for sublist in berufe for item in sublist]
print(pd.Series(berufe).value_counts())

berufe_schriftsteller = ['Schriftsteller', 'Lyriker', 'Dramatiker', 'Librettist', 'Exilschriftsteller']
GND_Schriftsteller = []
for x in gnd['GND_Berufe']:
  if pd.isna(x):
    GND_Schriftsteller.append(x)
  elif any(beruf in str(x) for beruf in berufe_schriftsteller):
    GND_Schriftsteller.append(True)
  else:
    GND_Schriftsteller.append(False)
gnd['GND_Schriftsteller'] = GND_Schriftsteller

Schriftsteller      6360
Schriftstellerin    1278
Dramatiker          1046
Hochschullehrer      960
Übersetzer           778
                    ... 
Website                1
Staatsoberhaupt        1
Gitarristin            1
Künstlerfamilie        1
Chemieingenieur        1
Name: count, Length: 1218, dtype: int64


In [None]:
gnd.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/00_gnd.csv")

### Import and Merge

In [35]:
gnd = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/00_gnd.csv", index_col = [0])

In [36]:
cols_to_join = gnd.columns.difference(data_authors.columns)
data_authors = data_authors.join(gnd[cols_to_join], how = 'left')

In [37]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'GND_Autor', 'GND_Gender', 'GND_Geburtsjahr', 'GND_Sterbejahr',
    'GND_Laender', 'GND_Berufe', 'GND_deutsch'
]]

Unnamed: 0,GND_Autor,GND_Gender,GND_Geburtsjahr,GND_Sterbejahr,GND_Laender,GND_Berufe,GND_deutsch
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",female,1926.0,1973.0,Italien (XA-IT) + Österreich (XA-AT),Librettistin + Musikerin + Schriftstellerin,True
https://d-nb.info/gnd/118516906,"Büchner, Georg",male,1813.0,1837.0,Deutschland (XA-DE) + Frankreich (XA-FR) + Sch...,Arzt + Dramatiker + Dramatiker + Schriftstelle...,True
https://d-nb.info/gnd/118519859,"Celan, Paul",male,1920.0,1970.0,Frankreich (XA-FR) + Jüdischer Kulturkreis (Re...,Lektor + Lyriker + Schriftsteller + Übersetzer,True
https://d-nb.info/gnd/118523392,"Dahn, Felix",male,1834.0,1912.0,Deutschland (XA-DE),Dramatiker + Erzähler + Historiker + Jurist + ...,True
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",male,1921.0,1990.0,Deutschland (XA-DE) + Schweiz (XA-CH),Dramatiker + Dramaturg + Grafiker + Librettist...,True
https://d-nb.info/gnd/118536109,"Frisch, Max",male,1911.0,1991.0,Deutschland (XA-DE) + Italien (XA-IT) + Schwei...,Architekt + Dramatiker + Drehbuchautor + Journ...,True
https://d-nb.info/gnd/118585916,"Musil, Robert",male,1880.0,1942.0,Deutschland (XA-DE) + Italien (XA-IT) + Schwei...,Herausgeber + Kritiker + Redakteur + Reserveof...,True


In [38]:
# Gibt es Autor:innen mit unterschiedlichen GND-Links, die den gleichen/ähnlichen GND-Namen haben?
pd.set_option('display.width', 1000)
multi_authors = data_authors['GND_Autor'].value_counts()[data_authors['GND_Autor'].value_counts() > 1].index
allowed = [
    'Mayer, Johann Friedrich', 'Müller, Heinrich',
    'Pistorius, Johann', 'Camerius, Joachim', 'Sommer, Johannes',
    'Forster, Georg', 'Füssli, Johann Heinrich', 'Albinus, JOhann Georg',
    'Hildebrandt, Dieter', 'Praetorius, Johannes', 'Clajus, Johannes',
    'Spanheim, Friedrich', 'Hamann, Johann Georg', 'Gwalther, Rudolf',
    'Gwalther, Rudolf', 'Fürer von Haimendorf, Christoph',
    'Camerarius, Joachim', 'Hermann, Wolfgang', 'Ruland, Martin',
    'Frey, Jacob', 'Fabricius, Johann', 'Schneider, Michael',
    'Albinus, Johann Georg', 'Wagner, Richard', 'Beer, Johann Christoph',
    'Dumas, Alexandre', 'Roth, Gerhard'
]
multi_authors = [x for x in multi_authors if x not in allowed]
for multi_author in multi_authors:
  multi_author_df = data_authors.query("GND_Autor == @multi_author")
  multi_author_df = multi_author_df[['GND_Autor', 'UniLeselisten_Autor', 'SchulLeselisten_Autor', 'Killy_simple_Autor', 'Reclam_Autor', 'Eintrag_Brenner', 'Eintrag_Beutin']]
  print(multi_author_df)
  print("\n")

                                         GND_Autor UniLeselisten_Autor SchulLeselisten_Autor Killy_simple_Autor Reclam_Autor             Eintrag_Brenner    Eintrag_Beutin
https://d-nb.info/gnd/1051836611  Menzel, Wolfgang                 NaN                   NaN                NaN          NaN                         NaN               NaN
https://d-nb.info/gnd/118580949   Menzel, Wolfgang                 NaN                   NaN                NaN          NaN  Menzel, Wolfgang 1798 1873  Menzel, Wolfgang
https://d-nb.info/gnd/174167393   Menzel, Wolfgang                 NaN                   NaN   Menzel, Wolfgang          NaN                         NaN               NaN


                                       GND_Autor UniLeselisten_Autor SchulLeselisten_Autor Killy_simple_Autor Reclam_Autor           Eintrag_Brenner Eintrag_Beutin
https://d-nb.info/gnd/1055256237  Schmidt, Erich                 NaN                   NaN     Schmidt, Erich          NaN                       NaN  

In [39]:
missing_authors = data_authors.query("GND_Autor.isna()")
print(missing_authors[['GND_Autor']])

data_authors = data_authors.drop(missing_authors.index)

                                 GND_Autor
https://d-nb.info/gnd/1092051929       NaN


In [40]:
gnd_authors_to_standard_authors = {
    "Aesopus": "Äsop",
    'Apuleius, Madaurensis': 'Apuleius',
    'Ava, Frau' : 'Frau Ava',
    'Dietmar, von Aist': 'Dietmar von Aist',
    'Eckhart, Meister': 'Meister Eckhart',
    'Elisabeth, Nassau-Saarbrücken, Gräfin': 'Elisabeth von Lothringen',
    'Erasmus, Desiderius': 'Erasmus von Rotterdam',
    'Feuerbach, Paul Johann Anselm, Ritter von' : 'Feuerbach, Anselm von',
    'Fouqué, Caroline de La Motte-' : 'Fouqué, Caroline de La Motte',
    'Fouqué, Friedrich de La Motte-' : 'Fouqué, Friedrich de la Motte',
    'Gottfried, von Straßburg' : 'Gottfried von Straßburg',
    'Hahn-Hahn, Ida, Gräfin': 'Ida Hahn-Hahn',
    'Hartmann, von Aue' : 'Hartmann von Aue',
    'Heinrich, der Gleißner' : 'Heinrich der Gleißner',
    'Heinrich, von Meißen' : 'Frauenlob',
    'Heinrich, von Morungen' : 'Heinrich von Morungen',
    'Heinrich, von Veldeke' : 'Heinrich von Veldeke',
    "Homerus" : "Homer",
    'Knigge, Adolph, Freiherr': 'Knigge, Adolph',
    'Konrad, der Pfaffe': 'Pfaffe Konrad',
    'Konrad, von Würzburg': 'Konrad von Würzburg',
    'Lamprecht, der Pfaffe' : 'Pfaffe Lamprecht',
    "Marlitt, E." : "Marlitt, Eugenie",
    "Mechthild, von Magdeburg": "Mechthild von Magdeburg",
    "Otfrid, von Weißenburg": "Otfrid von Weißenburg",
    'Platen, August, Graf von' : 'Platen, August von',
    "Reinbot, von Durne": "Reinbot von Durne",
    "Reinmar, der Alte": "Reinmar der Alte",
    "Seneca, Lucius Annaeus, Philosophus": "Seneca",
    "Schlegel, Dorothea von": "Schlegel, Dorothea",
    "Sophocles": "Sophokles",
    'Thüring, von Ringoltingen' : 'Thüring von Ringoltingen',
    'Ulrich, von Lichtenstein' : 'Ulrich von Liechtenstein',
    "Vergilius Maro, Publius": "Vergil",
    'Walther, von der Vogelweide' : 'Walther von der Vogelweide',
    "Werner, der Gärtner": "Werner der Gärtner",
    "Wolfram, von Eschenbach": "Wolfram von Eschenbach",
}

In [41]:
von_authors_to_standard_authors_keys = [x for x in data_authors['GND_Autor'] if ', von' in x]
von_authors_to_standard_authors_values = [re.sub(", von", " von", x) for x in von_authors_to_standard_authors_keys]
von_authors_to_standard_authors = dict(zip(von_authors_to_standard_authors_keys, von_authors_to_standard_authors_values))

gnd_authors_to_standard_authors = gnd_authors_to_standard_authors | von_authors_to_standard_authors

# Wikipedia

### Scrape

In [None]:
!pip3 install pywikibot
!pip3 install SPARQLWrapper



In [None]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

def transform_gnd_link_to_wiki_link(gnd_link):
    gnd_id = gnd_link.split("/")[-1]

    endpoint_url = "https://query.wikidata.org/sparql"
    query = f"""SELECT DISTINCT ?item ?itemLabel ?sitelink WHERE {{
        VALUES ?gndId {{
          "{gnd_id}"
        }}

        ?item wdt:P227 ?gndId.

        OPTIONAL {{
          ?sitelink schema:about ?item;
            schema:isPartOf <https://de.wikipedia.org/>.
        }}

        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "de". }}
      }}"""

    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    try:
      wiki_link = results['results']['bindings'][0]['sitelink']['value']
    except:
      wiki_link = float('NaN')

    return wiki_link

In [None]:
# Code (leicht angepasst) nach: https://github.com/temporal-communities/wiki-metrix

# Illmer, V. J., Soethaert, B., Welz, L., Fischer, F., & Jäschke, R. (2024, Februar 21).
# Literatur im Wikiversum – Eine praktische Annäherung über API-Abfragen und Wikipedia-Metriken.
# DHd 2024 Quo Vadis DH (DHd2024), Passau, Deutschland. https://doi.org/10.5281/zenodo.10698426

pywikibot_config = r"""# -*- coding: utf-8  -*-


mylang = 'de'
family = 'wikipedia'
usernames['wikipedia']['de'] = 'test'"""

with open('user-config.py', 'w', encoding="utf-8") as f:
    f.write(pywikibot_config)

import pywikibot
import requests
import datetime
import urllib.parse

def get_page_stats(page: pywikibot.Page):
    """
    Get page stats for a given page.
    """

    # Handle redirects
    # page = handle_redirect(page)

    page_content = page.get(force=True)
    length_in_bytes = len(page_content.encode("utf-8"))
    page_revisions = list(page.revisions(reverse=True))

    data = {
        "title": page.title(),
        "url": page.full_url(),
        "length": length_in_bytes,
        "n_contributors": len(page.contributors()),
        "n_revisions": len(page_revisions),
        "n_extlinks": len(list(page.extlinks())),
        "n_langlinks": len(page.langlinks()),
        "n_links": len(list(page.linkedPages())),
        "n_linkshere": len(
            list(page.linkedPages(namespaces=[0], follow_redirects=False))
        ),  # Article namespace only (0)
        "n_categories": len(list(page.categories())),
        "pageviews_365d": get_pageviews(page, days=365),
        "pageviews_730d": get_pageviews(page, days=730),
        "pageviews_1825d": get_pageviews(page, days=1825),
        "first_revision": page_revisions[0].timestamp,
    }

    MW_API_LIMIT = 500
    # Give warning if any value is at the limit
    for key, value in data.items():
        if value == MW_API_LIMIT:
            print(f"Warning: {key} at limit {MW_API_LIMIT}.")

    return data

# Use Wikimedia Pageviews REST API to get pageviews
def get_pageviews(page: pywikibot.Page, days=365):
    lang = page.site.code
    site = page.site.family.name

    # Wikimedia REST API
    # https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageviews
    # https://wikimedia.org/api/rest_v1/
    end_date = datetime.date(2024, 5, 1) # datetime.date.today() - datetime.timedelta(days=2)  # Two days ago
    start_date = end_date - datetime.timedelta(days=days)  # end_date minus [days] ago

    agent_type = "user"  # user, bot, spider, all-agents
    title_uri = urllib.parse.quote(
        page.title(underscore=True, with_section=False), safe=""
    )  # URI-encoded title, no safe characters
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{lang}.{site}/all-access/{agent_type}/{title_uri}/monthly/{start_date.strftime('%Y%m%d')}/{end_date.strftime('%Y%m%d')}"

    user_agent = f"wiki-metrix (https://github.com/temporal-communities/wiki-metrix) requests/{requests.__version__}"
    response = requests.get(url, headers={"User-Agent": user_agent})

    # if response.status_code != 200:
    #     print(f"Error: {response.status_code} {response.reason}")

    data = response.json()
    pageviews_sum = sum(filter(None, [item["views"] for item in data["items"]]))

    return pageviews_sum

def add_page_stats_to_df (df, wikipedia_article_column = 'Wikipedia_Artikel'):
    site = pywikibot.Site('de', 'wikipedia')  # The site we want to run our bot on
    wikiresults_joined_df = pd.DataFrame()

    for article in tqdm(df[wikipedia_article_column]):
        page = pywikibot.Page(site, article)
        wikiresults = get_page_stats(page)

        wikiresults_df = pd.DataFrame(pd.Series(wikiresults)).T
        wikiresults_joined_df = pd.concat([wikiresults_joined_df, wikiresults_df])

    wikiresults_joined_df = wikiresults_joined_df.reset_index(drop=True)
    df = df.join(wikiresults_joined_df)

    return df

In [None]:
site = pywikibot.Site('de', 'wikipedia')

In [None]:
# data_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/data_authors.csv", index_col = [0])

wiki = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/04_wiki.csv", index_col = [0])
# wiki = pd.DataFrame()

links_to_scrape = [x for x in data_authors.sort_values(by='GND_Autor').index if x not in wiki.index]
# links_to_scrape = [x for x in wiki.query("Wiki_Status != 'page_found'").index if x in data_authors.index]
# links_to_scrape = test_links + ['https://d-nb.info/gnd/123603668']

In [None]:
for i, gnd_link in enumerate(tqdm(links_to_scrape)):
  author = data_authors.loc[gnd_link, 'GND_Autor']
  if type(author)!=str:
    author = author.tolist()[0]
  wiki_link = transform_gnd_link_to_wiki_link(gnd_link)

  if pd.notna(wiki_link):
    wiki_pagename = wiki_link.split("/")[-1]
    page = pywikibot.Page(site, wiki_pagename)
    if page.isRedirectPage():
      page = page.getRedirectTarget()
    try: # wiki_link vorhanden, get_page_stats funktioniert
      wiki_author = get_page_stats(page)
      wiki_author['GND_Autor'] = author
      wiki_author['Wiki_Status'] = 'page_found'
      wiki_author = pd.DataFrame(pd.Series(wiki_author)).T
      wiki_author.index = [gnd_link]
    except: # wiki_link vorhanden, get_page_stats funktioniert nicht
      wiki_author = pd.DataFrame(index = [gnd_link])
      wiki_author['GND_Autor'] = author
      wiki_author['Wiki_Status'] = 'page_found_but_pagestats_error'
  else: # wiki_link nicht vorhanden
    wiki_author = pd.DataFrame(index = [gnd_link])
    wiki_author['GND_Autor'] = author
    wiki_author['Wiki_Status'] = 'page_not_found'

  if wiki_author['Wiki_Status'].tolist()[0] == 'page_found':
    print(f"{gnd_link:<33} {author:<30} {wiki_link:<65} ({wiki_author['pageviews_1825d'].tolist()[0]:<7} pageviews)")
  if wiki_author['Wiki_Status'].tolist()[0] == 'page_found_but_pagestats_error':
    print(f"{gnd_link:<33} {author:<30} {wiki_link:<65} page_found_but_pagestats_error")
  if wiki_author['Wiki_Status'].tolist()[0] == 'page_not_found':
    print(f"{gnd_link:<33} {author:<30} page_not_found")

  wiki_author.columns = [x if 'GND_' in x or 'Wiki_' in x else 'Wiki_'+x for x in wiki_author.columns]

  wiki = wiki.drop(gnd_link, errors='ignore')
  wiki = pd.concat([wiki, wiki_author])

  if i > 20 and i%50 == 0:
    wiki = wiki.sort_values(by = 'GND_Autor')
    wiki.index.name = 'GND'
    exceptions = ['Wiki_title', 'Wiki_url', 'Wiki_first_revision', 'Wiki_Status']
    fill_columns = [x for x in wiki if x not in exceptions]
    wiki[fill_columns] = wiki[fill_columns].fillna(0)
    wiki.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/04_wiki.csv")

  0%|          | 0/400 [00:00<?, ?it/s]

https://d-nb.info/gnd/118500619   Addison, Joseph                https://de.wikipedia.org/wiki/Joseph_Addison                      (12057   pageviews)
https://d-nb.info/gnd/118501402   Albers, Hans                   https://de.wikipedia.org/wiki/Hans_Albers                         (1187689 pageviews)
https://d-nb.info/gnd/1175257354  Albinus, Christian             page_not_found
https://d-nb.info/gnd/118501593   Albrant, der Meister           https://de.wikipedia.org/wiki/Albrant                             (2848    pageviews)
https://d-nb.info/gnd/119111349   Albrecht III., Bayern-München, Herzog https://de.wikipedia.org/wiki/Albrecht_III._(Bayern)              (53117   pageviews)
https://d-nb.info/gnd/118501771   Alemán, Mateo                  https://de.wikipedia.org/wiki/Mateo_Alem%C3%A1n                   (4044    pageviews)
https://d-nb.info/gnd/118648071   Alexis, Willibald              https://de.wikipedia.org/wiki/Willibald_Alexis                    (29553   pageviews)
https:/

  link._site = pywikibot.Site(lang, source.family.name)


https://d-nb.info/gnd/11851444X   Brandt, Willy                  https://de.wikipedia.org/wiki/Willy_Brandt                        (2862905 pageviews)
https://d-nb.info/gnd/120993201   Brasch, Peter                  https://de.wikipedia.org/wiki/Peter_Brasch                        (154793  pageviews)
https://d-nb.info/gnd/116484179   Brenck-Kalischer, Bess         https://de.wikipedia.org/wiki/Bess_Brenck-Kalischer               (1634    pageviews)
https://d-nb.info/gnd/118515500   Brion, Friederike              https://de.wikipedia.org/wiki/Friederike_Brion                    (97825   pageviews)
https://d-nb.info/gnd/118516213   Brunner, Thomas                https://de.wikipedia.org/wiki/Thomas_Brunner_(Dramatiker)         (973     pageviews)


  wiki[fill_columns] = wiki[fill_columns].fillna(0)


https://d-nb.info/gnd/118516418   Bry, Theodor de                https://de.wikipedia.org/wiki/Theodor_de_Bry                      (32543   pageviews)
https://d-nb.info/gnd/116740906   Brück, Max von                 page_not_found
https://d-nb.info/gnd/118515969   Brüning, Heinrich              https://de.wikipedia.org/wiki/Heinrich_Br%C3%BCning               (457519  pageviews)
https://d-nb.info/gnd/118667823   Buback, Siegfried              https://de.wikipedia.org/wiki/Siegfried_Buback                    (336641  pageviews)
https://d-nb.info/gnd/118517325   Buhmann, Inga                  page_not_found
https://d-nb.info/gnd/118517708   Burke, Edmund                  https://de.wikipedia.org/wiki/Edmund_Burke                        (109324  pageviews)
https://d-nb.info/gnd/118517821   Burte, Hermann                 https://de.wikipedia.org/wiki/Hermann_Burte                       (12092   pageviews)
https://d-nb.info/gnd/118518534   Calvin, Jean                   https://de.wikipedia

  wiki[fill_columns] = wiki[fill_columns].fillna(0)


https://d-nb.info/gnd/118682407   Espronceda, José de            https://de.wikipedia.org/wiki/Jos%C3%A9_de_Espronceda             (3820    pageviews)
https://d-nb.info/gnd/118899619   Esterházy, Péter               https://de.wikipedia.org/wiki/P%C3%A9ter_Esterh%C3%A1zy           (30783   pageviews)
https://d-nb.info/gnd/114191655X  Falk, Victor von               page_not_found
https://d-nb.info/gnd/13181091X   Fenoglio, Marisa               page_not_found
https://d-nb.info/gnd/118880349   Ferdinand IV., Heiliges Römisches Reich, König https://de.wikipedia.org/wiki/Ferdinand_IV._(HRR)                 (40497   pageviews)
https://d-nb.info/gnd/118532723   Feuchtwanger, Marta            https://de.wikipedia.org/wiki/Marta_Feuchtwanger                  (31327   pageviews)
https://d-nb.info/gnd/121137996   Finkelstein, Norman G.         https://de.wikipedia.org/wiki/Norman_Finkelstein                  (140331  pageviews)
https://d-nb.info/gnd/118890263   Fischer, Caroline Auguste      http

  wiki[fill_columns] = wiki[fill_columns].fillna(0)


https://d-nb.info/gnd/118545906   Hardenberg, Karl August von    https://de.wikipedia.org/wiki/Karl_August_von_Hardenberg          (147764  pageviews)
https://d-nb.info/gnd/118545981   Harlan, Veit                   https://de.wikipedia.org/wiki/Veit_Harlan                         (241783  pageviews)
https://d-nb.info/gnd/118546201   Hartlieb, Johannes             https://de.wikipedia.org/wiki/Johannes_Hartlieb                   (12507   pageviews)
https://d-nb.info/gnd/118773356   Hartzenbusch, Juan Eugenio     https://de.wikipedia.org/wiki/Juan_Eugenio_Hartzenbusch           (1316    pageviews)
https://d-nb.info/gnd/139199780   Hauer, Karl                    https://de.wikipedia.org/wiki/Karl_Hauer                          (2500    pageviews)
https://d-nb.info/gnd/119034743   Haupt, Moriz                   https://de.wikipedia.org/wiki/Moriz_Haupt                         (6297    pageviews)
https://d-nb.info/gnd/118546929   Hauptmann, Elisabeth           https://de.wikipedia.org/wiki

  wiki[fill_columns] = wiki[fill_columns].fillna(0)


https://d-nb.info/gnd/12848957X   Klimov, Ėlem G.                https://de.wikipedia.org/wiki/Elem_Germanowitsch_Klimow           (23787   pageviews)
https://d-nb.info/gnd/122368630   Klüssendorf, Angelika          https://de.wikipedia.org/wiki/Angelika_Kl%C3%BCssendorf           (65423   pageviews)
https://d-nb.info/gnd/124137725   Knorr, Peter                   https://de.wikipedia.org/wiki/Peter_Knorr                         (38061   pageviews)
https://d-nb.info/gnd/118822438   Koelbl, Herlinde               https://de.wikipedia.org/wiki/Herlinde_Koelbl                     (71060   pageviews)
https://d-nb.info/gnd/118564366   Koenig, Friedrich              https://de.wikipedia.org/wiki/Friedrich_Koenig                    (20627   pageviews)
https://d-nb.info/gnd/118564943   Kollwitz, Käthe                https://de.wikipedia.org/wiki/K%C3%A4the_Kollwitz                 (754766  pageviews)
https://d-nb.info/gnd/120374943   Krawczyk, Stephan              https://de.wikipedia.org/wiki

  wiki[fill_columns] = wiki[fill_columns].fillna(0)


https://d-nb.info/gnd/135697042   Müller, Christine              page_not_found
https://d-nb.info/gnd/1069335347  Münchmeyer, Heinrich Gotthold  https://de.wikipedia.org/wiki/Heinrich_Gotthold_M%C3%BCnchmeyer   (3005    pageviews)
https://d-nb.info/gnd/118586408   Napoleon I., Frankreich, Kaiser https://de.wikipedia.org/wiki/Napoleon_Bonaparte                  (5862078 pageviews)
https://d-nb.info/gnd/119108291   Naumann, Hans                  https://de.wikipedia.org/wiki/Hans_Naumann_(Medi%C3%A4vist)       (7573    pageviews)
https://d-nb.info/gnd/119331438   Neumann, Friedrich             https://de.wikipedia.org/wiki/Friedrich_Neumann_(Germanist)       (6114    pageviews)
https://d-nb.info/gnd/119081172   Neumann, Gert                  https://de.wikipedia.org/wiki/Gert_Neumann                        (8726    pageviews)
https://d-nb.info/gnd/118587544   Newton, Isaac                  https://de.wikipedia.org/wiki/Isaac_Newton                        (1409036 pageviews)
https://d-nb.

  wiki[fill_columns] = wiki[fill_columns].fillna(0)


https://d-nb.info/gnd/124541372   Runckel, Dorothee Henriette von page_not_found
https://d-nb.info/gnd/11547546X   Runge, Doris                   https://de.wikipedia.org/wiki/Doris_Runge                         (11024   pageviews)
https://d-nb.info/gnd/119368617   Rust, Bernhard                 https://de.wikipedia.org/wiki/Bernhard_Rust                       (95705   pageviews)
https://d-nb.info/gnd/119053500   Ruttmann, Walter               https://de.wikipedia.org/wiki/Walter_Ruttmann                     (25633   pageviews)
https://d-nb.info/gnd/1018480358  Röhler, Hans-Joachim           page_not_found
https://d-nb.info/gnd/118603949   Rühmann, Heinz                 https://de.wikipedia.org/wiki/Heinz_R%C3%BChmann                  (1734392 pageviews)
https://d-nb.info/gnd/128935944   Samson, Horst                  https://de.wikipedia.org/wiki/Horst_Samson                        (5715    pageviews)
https://d-nb.info/gnd/118605348   Sand, George                   https://de.wikipedi

  wiki[fill_columns] = wiki[fill_columns].fillna(0)


https://d-nb.info/gnd/118620746   Tannhäuser                     https://de.wikipedia.org/wiki/Tannh%C3%A4user_(Dichter)           (70493   pageviews)
https://d-nb.info/gnd/11864310X   Tatianus, Syrus                https://de.wikipedia.org/wiki/Tatian                              (15859   pageviews)
https://d-nb.info/gnd/110360869   Taufiq, Suleman                https://de.wikipedia.org/wiki/Suleman_Taufiq                      (5120    pageviews)
https://d-nb.info/gnd/118621211   Tell, Wilhelm, Fiktive Gestalt https://de.wikipedia.org/wiki/Wilhelm_Tell                        (838449  pageviews)
https://d-nb.info/gnd/118621483   Thackeray, William Makepeace   https://de.wikipedia.org/wiki/William_Makepeace_Thackeray         (62133   pageviews)
https://d-nb.info/gnd/131472283   Thalheimer, Michael            https://de.wikipedia.org/wiki/Michael_Thalheimer                  (26191   pageviews)
https://d-nb.info/gnd/118621769   Theocritus                     https://de.wikipedia.org/wiki

In [None]:
wiki = wiki.sort_values(by = 'GND_Autor')
wiki.index.name = 'GND'

In [None]:
exceptions = ['Wiki_title', 'Wiki_url', 'Wiki_first_revision', 'Wiki_Status']
fill_columns = [x for x in wiki if x not in exceptions]
wiki[fill_columns] = wiki[fill_columns].fillna(0)

  wiki[fill_columns] = wiki[fill_columns].fillna(0)


In [None]:
wiki.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/04_wiki.csv")

### Import and Merge

In [None]:
wiki = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/04_wiki.csv", index_col = [0])

In [None]:
cols_to_join = wiki.columns.difference(data_authors.columns)
data_authors = data_authors.join(wiki[cols_to_join], how = 'left')

In [None]:
data_authors['Wiki_pageviews_Summe'] = data_authors['Wiki_pageviews_1825d']
data_authors['Wiki_length_Summe'] = data_authors['Wiki_length']

In [None]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'Wiki_title',
    'Wiki_pageviews_Summe', 'Wiki_length_Summe'
]]

Unnamed: 0,Wiki_title,Wiki_pageviews_Summe,Wiki_length_Summe
https://d-nb.info/gnd/118505602,Ingeborg Bachmann,990368.0,66902.0
https://d-nb.info/gnd/118516906,Georg Büchner,900637.0,42336.0
https://d-nb.info/gnd/118519859,Paul Celan,553278.0,53030.0
https://d-nb.info/gnd/118523392,Felix Dahn,68339.0,29834.0
https://d-nb.info/gnd/118527908,Friedrich Dürrenmatt,1225920.0,56558.0
https://d-nb.info/gnd/118536109,Max Frisch,902991.0,113706.0
https://d-nb.info/gnd/118585916,Robert Musil,363942.0,67374.0


# BDSL

### Scrape

In [None]:
!pip3 install google_colab_selenium

Collecting google_colab_selenium
  Downloading google_colab_selenium-1.0.14-py3-none-any.whl.metadata (2.7 kB)
Collecting selenium (from google_colab_selenium)
  Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium->google_colab_selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium->google_colab_selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium->google_colab_selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium->google_colab_selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium->google_colab_selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading google_colab_selenium-1.0.14-py3-none-any.whl (8.2 kB)


In [None]:
import google_colab_selenium as gs
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

In [None]:
driver = gs.Chrome()

<IPython.core.display.Javascript object>

In [None]:
# log in via SUB Göttingen
driver.get("http://han.sub.uni-goettingen.de/han/BDSL")

username = driver.find_element(By.ID, "plainuser")
username.send_keys("000721344262")

password = driver.find_element(By.ID, "password")
password.send_keys("karen1na")

password.send_keys(Keys.RETURN)

In [None]:
def get_bdsl_hits_for_author(author, mode = "4", year_start = "1985"):
  # go to search
  driver.find_element(By.LINK_TEXT, "Suche").click()

  # change mode (4 = Behandelte Person, 11 = Freitext)
  dropdown_menu = driver.find_element(By.NAME, "DD1")
  select = Select(dropdown_menu)
  select.select_by_value(mode)

  # change Jahr von (1985, 20000, etc.)
  dropdown_menu = driver.find_element(By.NAME, "JV")
  select = Select(dropdown_menu)
  select.select_by_value(year_start)

  # enter author name and search
  input_field = driver.find_element(By.NAME, "SF1")
  input_field.send_keys(author)
  input_field.send_keys(Keys.RETURN)

  # get number of results
  page_source = driver.page_source
  hits_str = re.findall("\d* Titel gefunden", page_source)

  if len(hits_str) == 0 or hits_str[0].startswith(' Titel'):
    hits_int = 0
  else:
    hits_int = int(hits_str[0].split(" ")[0])

  return hits_int

In [None]:
gnd_authors_to_bdsl_authors = {
    'Adler, H. G.' : 'Adler, Hans Günther',
    'Berg, O.F.' : 'Berg, Ottokar Franz',
    'Jung, C. G.' : 'Carl Gustav Jung',
    'Kittler, Friedrich A.': 'Friedrich Kittler',
    'Richter, E. A.' : 'Richter, Erich A.',
}

In [None]:
# data_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/data_authors.csv", index_col = [0])

bdsl = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/03_bdsl.csv", index_col = [0])
# bdsl = pd.DataFrame()

links_to_scrape = [x for x in data_authors.query("GND_Autor.notna()").sort_values(by='GND_Autor').index if x not in bdsl.query("BDSL_Freitext_ab2000_Summe.notna()").index]
# links_to_scrape = [data_authors.query("GND_Autor == @x").index[0] for x in gnd_authors_to_bdsl_authors.keys()]
# links_to_scrape = test_links
# links_to_scrape = [x for x in bdsl.sort_values(by='GND_Autor').query("BDSL_Freitext_Summe.isna()").index]
# links_to_scrape = links_to_scrape + [data_authors.query("GND_Autor == @x").index[0] for x in gnd_authors_to_bdsl_authors.keys()]

In [None]:
for i, gnd_link in enumerate(tqdm(links_to_scrape)):
  author = data_authors.loc[gnd_link, 'GND_Autor']
  if type(author)!=str:
    author = author.tolist()[0]

  if pd.isna(author):
    continue
  elif author in gnd_authors_to_bdsl_authors.keys():
    author_search = gnd_authors_to_bdsl_authors[author]
  elif author in gnd_authors_to_standard_authors.keys():
    author_search = gnd_authors_to_standard_authors[author]
  else:
    author_search = author

  time.sleep(random.randint(0, 1))

  hits_freitext = get_bdsl_hits_for_author(author_search, mode="11", year_start = "2000")
  hits_behandelteperson = get_bdsl_hits_for_author(author_search, mode="4", year_start = "2000")

  bdsl.at[gnd_link, 'GND_Autor'] = author
  bdsl.at[gnd_link, 'BDSL_Freitext_ab2000_Summe'] = hits_freitext
  bdsl.at[gnd_link, 'BDSL_BehandeltePerson_ab2000_Summe'] = hits_behandelteperson

  print(f"{gnd_link:<35} {author:<45} {hits_freitext:<5} Treffer Freitext    {hits_behandelteperson:<5} Treffer Behandelte Person")

  if i >20 and i%50 == 0:
    bdsl = bdsl.sort_values(by='GND_Autor')
    bdsl.index.name = 'GND'
    bdsl.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/03_bdsl.csv")

  0%|          | 0/415 [00:00<?, ?it/s]

https://d-nb.info/gnd/118500619     Addison, Joseph                               5     Treffer Freitext    0     Treffer Behandelte Person
https://d-nb.info/gnd/118501402     Albers, Hans                                  0     Treffer Freitext    0     Treffer Behandelte Person
https://d-nb.info/gnd/1175257354    Albinus, Christian                            0     Treffer Freitext    0     Treffer Behandelte Person
https://d-nb.info/gnd/118501593     Albrant, der Meister                          1     Treffer Freitext    0     Treffer Behandelte Person
https://d-nb.info/gnd/119111349     Albrecht III., Bayern-München, Herzog         0     Treffer Freitext    0     Treffer Behandelte Person
https://d-nb.info/gnd/118501771     Alemán, Mateo                                 11    Treffer Freitext    0     Treffer Behandelte Person
https://d-nb.info/gnd/118648071     Alexis, Willibald                             53    Treffer Freitext    51    Treffer Behandelte Person
https://d-nb.info/gn

In [None]:
# Autor:innen, die wegen des Suchnamens zu viele Treffer erzielen, auf 0 setzen
bdsl_reset_authors = [
    'Albrecht', 'Friedrich, Friedrich', 'Heinrich',
    'Wolf, A. C. F.', 'Wil', 'Stein, P.', 'Schulz, I.',
    'Schneider, J. H. J.', 'Raabe, W.', 'Moser, C.', 'Mach, E.',
    'Lang, P.', 'Köppe, W.', 'Keller, G.', 'Grimm, J.',
    'Graf, H.', 'Franck, G.', 'Elias, Elias', 'Büchner, G.',
    'Bürger, G. A. ', 'C.M.',
]

for author in bdsl_reset_authors:
  if author in bdsl['GND_Autor'].tolist() and author not in gnd_authors_to_bdsl_authors.keys():
    bdsl.at[bdsl.query("GND_Autor == @author").index[0], 'BDSL_BehandeltePerson_ab1985_Summe'] = 0
    bdsl.at[bdsl.query("GND_Autor == @author").index[0], 'BDSL_Freitext_ab1985_Summe'] = 0
    bdsl.at[bdsl.query("GND_Autor == @author").index[0], 'BDSL_BehandeltePerson_ab2000_Summe'] = 0
    bdsl.at[bdsl.query("GND_Autor == @author").index[0], 'BDSL_Freitext_ab2000_Summe'] = 0

In [None]:
bdsl = bdsl.sort_values(by='GND_Autor')
bdsl.index.name = 'GND'

In [None]:
bdsl.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/03_bdsl.csv")

### Import and Merge

In [None]:
bdsl = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/03_bdsl.csv", index_col = [0])

In [None]:
cols_to_join = bdsl.columns.difference(data_authors.columns)
data_authors = data_authors.join(bdsl[cols_to_join], how = 'left')

In [None]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'GND_Autor',
    'BDSL_Freitext_ab1985_Summe',
    'BDSL_Freitext_ab2000_Summe',
    'BDSL_BehandeltePerson_ab1985_Summe',
    'BDSL_BehandeltePerson_ab2000_Summe',
]]

Unnamed: 0,GND_Autor,BDSL_Freitext_ab1985_Summe,BDSL_Freitext_ab2000_Summe,BDSL_BehandeltePerson_ab1985_Summe,BDSL_BehandeltePerson_ab2000_Summe
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",2532.0,1526.0,2218.0,1351.0
https://d-nb.info/gnd/118516906,"Büchner, Georg",2083.0,1133.0,1775.0,981.0
https://d-nb.info/gnd/118519859,"Celan, Paul",3217.0,2179.0,2951.0,2039.0
https://d-nb.info/gnd/118523392,"Dahn, Felix",30.0,25.0,29.0,24.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",1123.0,657.0,1075.0,638.0
https://d-nb.info/gnd/118536109,"Frisch, Max",1206.0,672.0,1098.0,617.0
https://d-nb.info/gnd/118585916,"Musil, Robert",2949.0,1653.0,2622.0,1495.0


In [None]:
data_authors[[
    'BDSL_Freitext_ab1985_Summe',
    'BDSL_Freitext_ab2000_Summe',
    'BDSL_BehandeltePerson_ab1985_Summe',
    'BDSL_BehandeltePerson_ab2000_Summe',
]].corr()

Unnamed: 0,BDSL_Freitext_ab1985_Summe,BDSL_Freitext_ab2000_Summe,BDSL_BehandeltePerson_ab1985_Summe,BDSL_BehandeltePerson_ab2000_Summe
BDSL_Freitext_ab1985_Summe,1.0,0.992216,0.975939,0.969813
BDSL_Freitext_ab2000_Summe,0.992216,1.0,0.969998,0.975066
BDSL_BehandeltePerson_ab1985_Summe,0.975939,0.969998,1.0,0.993688
BDSL_BehandeltePerson_ab2000_Summe,0.969813,0.975066,0.993688,1.0


In [None]:
data_authors = data_authors.rename(columns={'Litgesch_Seiten_rel':'Litgesch_Seiten_Summe'})

# Lovelybooks

### Scrape

In [None]:
import requests
from bs4 import BeautifulSoup
import re

In [None]:
def get_response_from_url (url):
    return requests.get(url)

In [None]:
def page_found(response):
    soup = BeautifulSoup(response.text, "html.parser")

    if 'Diese Seite wurde nicht gefunden' in soup.text:
        return False
    else:
        return True

In [None]:
def get_heading(response):
    heading = float('NaN')

    soup = BeautifulSoup(response.text, "html.parser")
    h1_elements = soup.findAll('h1')

    if h1_elements:
        heading = h1_elements[0].text.strip()

    return heading

In [None]:
def get_stars(respons):
    stars = float('NaN')

    soup = BeautifulSoup(response.text, "html.parser")
    span_elements = soup.findAll('span')
    bewertung_element = [x for x in span_elements if 'Bewertungen' in x.text]
    if bewertung_element:
        stars = re.findall("^.*(?= Sterne)", bewertung_element[0].text)
        stars = re.sub(",", ".", stars[0])
        stars = float(stars)

    return stars

In [None]:
def get_bewertungen(respons):
    bewertungen = float('NaN')

    soup = BeautifulSoup(response.text, "html.parser")
    span_elements = soup.findAll('span')
    bewertung_element = [x for x in span_elements if 'Bewertungen' in x.text]
    if bewertung_element:
        bewertungen = bewertung_element[0].text.split(" ")[-2]
        bewertungen = re.sub("\\.", "", bewertungen)
        bewertungen = float(bewertungen)

    return bewertungen

In [None]:
def get_bibliotheken(respons):
    bibliotheken = float('NaN')

    soup = BeautifulSoup(response.text, "html.parser")
    div_elements = soup.findAll('div')
    statistik_element = [div for div in div_elements if div.has_attr('class') and any('CommunityStatistics' in cls for cls in div['class'])]
    if statistik_element:
        bibliotheken = re.findall(r'\s(\d+\.?\d+)\sBibliotheken', statistik_element[0].text)
        if bibliotheken:
            bibliotheken = re.sub("\\.", "", bibliotheken[0])
            bibliotheken = float(bibliotheken)
        else:
            bibliotheken = float('NaN')

    return bibliotheken

In [None]:
def get_merkzettel(respons):
    merkzettel = float('NaN')

    soup = BeautifulSoup(response.text, "html.parser")
    div_elements = soup.findAll('div')
    statistik_element = [div for div in div_elements if div.has_attr('class') and any('CommunityStatistics' in cls for cls in div['class'])]
    if statistik_element:
        merkzettel = re.findall(r'\s(\d?\.?\d+)\sMerkzettel', statistik_element[0].text)
        if merkzettel:
            merkzettel = re.sub("\\.", "", merkzettel[0])
            merkzettel = float(merkzettel)
        else:
            merkzettel = float('NaN')

    return merkzettel

In [None]:
def get_aktuell_gelesen(respons):
    aktuell_gelesen = float('NaN')

    soup = BeautifulSoup(response.text, "html.parser")
    div_elements = soup.findAll('div')
    statistik_element = [div for div in div_elements if div.has_attr('class') and any('CommunityStatistics' in cls for cls in div['class'])]
    if statistik_element:
        aktuell_gelesen = re.findall(r'\s(\d?\.?\d+)\sLeser\*innen aktuell gelesen', statistik_element[0].text)
        if aktuell_gelesen:
            aktuell_gelesen = re.sub("\\.", "", aktuell_gelesen[0])
            aktuell_gelesen = float(aktuell_gelesen)
        else:
            aktuell_gelesen = float('NaN')

    return aktuell_gelesen

In [None]:
def get_gefolgt(respons):
    gefolgt = float('NaN')

    soup = BeautifulSoup(response.text, "html.parser")
    div_elements = soup.findAll('div')
    statistik_element = [div for div in div_elements if div.has_attr('class') and any('CommunityStatistics' in cls for cls in div['class'])]
    if statistik_element:
        gefolgt = re.findall(r'\s(\d?\.?\d+)\sLeser\*innen gefolgt', statistik_element[0].text)
        if gefolgt:
            gefolgt = re.sub("\\.", "", gefolgt[0])
            gefolgt = float(gefolgt)
        else:
            gefolgt = float('NaN')

    return gefolgt

In [None]:
# data_authors = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/data_authors.csv", index_col = [0])

lovelybooks = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/08_lovelybooks.csv", index_col = [0])
# lovelybooks = pd.DataFrame()

links_to_scrape = [x for x in data_authors.sort_values(by='GND_Autor').index if x not in lovelybooks.index and 'd-nb' in x]
# links_to_scrape = data_authors.sample(n=50).index
# links_to_scrape = test_links

In [None]:
for i, gnd_link in enumerate(tqdm(links_to_scrape)):
  author = data_authors.loc[gnd_link, 'GND_Autor']

  if pd.isna(author):
    continue
  # elif author in gnd_authors_to_lovelybooks_authors.keys():
  #   author_search = gnd_authors_to_lovelybooks_authors[author]
  elif author in gnd_authors_to_standard_authors.keys():
    author_search = gnd_authors_to_standard_authors[author]
  else:
    author_search = ' '.join(author.split(", ")[::-1])
  author_search = re.sub(" ", "-", author_search)
  url = 'https://www.lovelybooks.de/autor/' + author_search + '/'

  time.sleep(random.randint(0, 1))

  response = get_response_from_url(url)

  if page_found(response):
    heading = get_heading(response)
    stars = get_stars(response)
    bewertungen = get_bewertungen(response)
    bibliotheken = get_bibliotheken(response)
    merkzettel = get_merkzettel(response)
    aktuell_gelesen = get_aktuell_gelesen(response)
    gefolgt = get_gefolgt(response)

    lovelybooks.at[gnd_link, 'Lovelybooks_Status'] = 'page_found'
    lovelybooks.at[gnd_link, 'GND_Autor'] = author
    lovelybooks.at[gnd_link, 'Lovelybooks_Autor'] = heading
    lovelybooks.at[gnd_link, 'Lovelybooks_Stars'] = stars
    lovelybooks.at[gnd_link, 'Lovelybooks_Bewertungen'] = bewertungen
    lovelybooks.at[gnd_link, 'Lovelybooks_Bibliotheken'] = bibliotheken
    lovelybooks.at[gnd_link, 'Lovelybooks_Merkzettel'] = merkzettel
    lovelybooks.at[gnd_link, 'Lovelybooks_aktuell_gelesen'] = aktuell_gelesen
    lovelybooks.at[gnd_link, 'Lovelybooks_gefolgt'] = gefolgt

    print(f"{url:<60} page found       {heading:<30} {stars:<3} Sterne, {bewertungen:<7} Bewertungen")

  else:
    lovelybooks.at[gnd_link, 'Lovelybooks_Status'] = 'page_not_found'
    lovelybooks.at[gnd_link, 'GND_Autor'] = author

    print(f"{url:<60} page not found")

  if i > 20 and i%100 == 0:
    lovelybooks.index.name = 'GND'
    lovelybooks = lovelybooks.sort_values(by='GND_Autor')
    lovelybooks.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/08_lovelybooks.csv")

  0%|          | 0/4451 [00:00<?, ?it/s]

https://www.lovelybooks.de/autor/Friedrich-Haug/             page not found
https://www.lovelybooks.de/autor/Helgard-Haug/               page found       Helgard Haug                   4.2 Sterne, 5.0     Bewertungen
https://www.lovelybooks.de/autor/Otto-von-Haugwitz/          page not found
https://www.lovelybooks.de/autor/Carl-Hauptmann/             page found       Carl Hauptmann                 4.0 Sterne, 2.0     Bewertungen
https://www.lovelybooks.de/autor/Gerhart-Hauptmann/          page found       Gerhart Hauptmann              3.4 Sterne, 786.0   Bewertungen
https://www.lovelybooks.de/autor/Auguste-Hauschner/          page found       Auguste Hauschner              2.0 Sterne, 1.0     Bewertungen
https://www.lovelybooks.de/autor/Felix-Hausdorff/            page not found
https://www.lovelybooks.de/autor/Wilhelm-Hausenstein/        page found       Wilhelm Hausenstein            nan Sterne, nan     Bewertungen
https://www.lovelybooks.de/autor/Arnold-Hauser/              page f

In [None]:
lovelybooks.index.name = 'GND'
lovelybooks = lovelybooks.sort_values(by='GND_Autor')

In [None]:
lovelybooks.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/08_lovelybooks.csv")

### Import and Merge

In [None]:
lovelybooks = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/08_lovelybooks.csv", index_col = [0])

In [None]:
lovelybooks = lovelybooks.rename(columns={
    'Lovelybooks_Bewertungen' : 'Lovelybooks_Bewertungen_Summe',
    'Lovelybooks_Bibliotheken' : 'Lovelybooks_Bibliotheken_Summe',
})

In [None]:
cols_to_join = lovelybooks.columns.difference(data_authors.columns)
data_authors = data_authors.join(lovelybooks[cols_to_join], how = 'left')

In [None]:
this_test_links = [x for x in test_links if x in data_authors.index]
data_authors.loc[this_test_links][[
    'Lovelybooks_Autor',
    'Lovelybooks_Bewertungen_Summe',
    'Lovelybooks_Bibliotheken_Summe'
]]

Unnamed: 0,Lovelybooks_Autor,Lovelybooks_Bewertungen_Summe,Lovelybooks_Bibliotheken_Summe
https://d-nb.info/gnd/118505602,Ingeborg Bachmann,312.0,450.0
https://d-nb.info/gnd/118516906,Georg Büchner,1595.0,2289.0
https://d-nb.info/gnd/118519859,Paul Celan,163.0,193.0
https://d-nb.info/gnd/118523392,Felix Dahn,62.0,134.0
https://d-nb.info/gnd/118527908,Friedrich Dürrenmatt,8633.0,8489.0
https://d-nb.info/gnd/118536109,Max Frisch,5879.0,6325.0
https://d-nb.info/gnd/118585916,Robert Musil,512.0,868.0


# Check and Export

In [None]:
#fillna mit 0
exceptions = [
    '_Autor',
    '_title', '_url', '_first_revision', '_Status',
    '_Berufe', '_Geburtsjahr', '_Sterbejahr', '_Gender', '_Laender', '_deutsch', '_Schriftsteller',
    '_id', '_article', '_source',
    '_UBs',
]
fill_columns = [x for x in data_authors if not any(y in x for y in exceptions)]
data_authors[fill_columns] = data_authors[fill_columns].fillna(0)

data_authors.index.name = 'GND'
data_authors = data_authors.sort_values(by = 'GND_Autor')

  data_authors[fill_columns] = data_authors[fill_columns].fillna(0)


In [None]:
data_authors.shape

(11396, 121)

In [None]:
data_authors.loc[test_links][[
    'GND_Autor', 'UniLeselisten_Autor', 'SchulLeselisten_Autor',
    'GND_Gender', 'GND_Laender',
    'UniLeselisten_Summe', 'SchulLeselisten_Summe',
    'BDSL_Freitext_ab1985_Summe', 'BDSL_BehandeltePerson_ab2000_Summe',
    'Wiki_pageviews_Summe', 'Reclam_tokens_Summe', 'Killy_length_Summe',
    'Abi_Summe', 'UniLehrveranstaltungen_Summe',
    'Litgesch_Seiten_Summe',
    # 'Lovelybooks_Bewertungen_Summe', 'Lovelybooks_Bibliotheken_Summe'
]]

Unnamed: 0_level_0,GND_Autor,UniLeselisten_Autor,SchulLeselisten_Autor,GND_Gender,GND_Laender,UniLeselisten_Summe,SchulLeselisten_Summe,BDSL_Freitext_ab1985_Summe,BDSL_BehandeltePerson_ab2000_Summe,Wiki_pageviews_Summe,Reclam_tokens_Summe,Killy_length_Summe,Abi_Summe,UniLehrveranstaltungen_Summe,Litgesch_Seiten_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg","Bachmann, Ingeborg",Bachmann,female,Italien (XA-IT) + Österreich (XA-AT),0.426219,0.063482,2532.0,1351.0,990368.0,0.0,2472.0,0.0,0.117337,0.002485
https://d-nb.info/gnd/118516906,"Büchner, Georg","Büchner, Georg",Büchner,male,Deutschland (XA-DE) + Frankreich (XA-FR) + Sch...,0.649326,0.197066,2083.0,981.0,900637.0,17.0,3260.0,33.0,0.05244,0.00526
https://d-nb.info/gnd/118519859,"Celan, Paul","Celan, Paul",Celan,male,Frankreich (XA-FR) + Jüdischer Kulturkreis (Re...,0.285438,0.015218,3217.0,2039.0,553278.0,0.0,2998.0,0.0,0.074729,0.001052
https://d-nb.info/gnd/118523392,"Dahn, Felix","Dahn, Felix",,male,Deutschland (XA-DE),0.001267,0.0,30.0,24.0,68339.0,0.0,953.0,0.0,0.0,0.000544
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich","Dürrenmatt, Friedrich",Dürrenmatt,male,Deutschland (XA-DE) + Schweiz (XA-CH),0.310911,0.164674,1123.0,638.0,1225920.0,0.0,4299.0,22.0,0.044193,0.001214
https://d-nb.info/gnd/118536109,"Frisch, Max","Frisch, Max",Frisch,male,Deutschland (XA-DE) + Italien (XA-IT) + Schwei...,0.332953,0.118126,1206.0,617.0,902991.0,0.0,3088.0,7.0,0.023975,0.002104
https://d-nb.info/gnd/118585916,"Musil, Robert","Musil, Robert",Musil,male,Deutschland (XA-DE) + Italien (XA-IT) + Schwei...,0.367633,0.065484,2949.0,1495.0,363942.0,7.0,3571.0,3.0,0.115489,0.001885


In [None]:
data_authors.query("Wiki_pageviews_Summe.isna()")[[
    'GND_Autor', 'Wiki_pageviews_Summe', 'Wiki_length_Summe'
]]

Unnamed: 0_level_0,GND_Autor,Wiki_pageviews_Summe,Wiki_length_Summe
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [None]:
data_authors['GND_Gender'].value_counts()

Unnamed: 0_level_0,count
GND_Gender,Unnamed: 1_level_1
male,7791
notKnown,1637
female,1568
Männlich,326
Weiblich,44
Unbekannt,29


In [None]:
data_authors.query("GND_Gender == 'notKnown'")['GND_Autor']

Unnamed: 0_level_0,GND_Autor
GND,Unnamed: 1_level_1
https://d-nb.info/gnd/124438482,"Acidalius, Valens"
https://d-nb.info/gnd/1110188102,"Ackermann, Werner"
https://d-nb.info/gnd/122079566,"Acxtelmeier, Stanislaus Reinhard"
https://d-nb.info/gnd/12005860X,"Adam, Melchior"
https://d-nb.info/gnd/123438675,"Adami, Johann Samuel"
...,...
https://d-nb.info/gnd/128748435,"Öhlschläger, Claudia"
https://d-nb.info/gnd/1108787355,Österreichischer Bibelübersetzer
https://d-nb.info/gnd/12339371X,"Čapek, Karel"
https://d-nb.info/gnd/119879727,"Špán, Vavřinec"


In [None]:
data_authors.query("GND_Gender == '<http://'")['GND_Autor']

Unnamed: 0_level_0,GND_Autor
GND,Unnamed: 1_level_1


In [None]:
data_authors.index.value_counts()

Unnamed: 0_level_0,count
GND,Unnamed: 1_level_1
https://d-nb.info/gnd/118818260,2
https://d-nb.info/gnd/1060236745,2
https://d-nb.info/gnd/118572601,2
https://d-nb.info/gnd/118512986,2
https://d-nb.info/gnd/118500015,1
...,...
https://d-nb.info/gnd/118547976,1
https://d-nb.info/gnd/1019257059,1
https://d-nb.info/gnd/118840991,1
https://d-nb.info/gnd/1080156720,1


In [None]:
data_authors.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/data_authors.csv")