In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import plotly.express as px
import regex as re

from tqdm.notebook import tqdm

In [3]:
test_links = [
    'https://d-nb.info/gnd/118500856', # Aeschylus
    'https://d-nb.info/gnd/118501232', # Aichinger
    'https://d-nb.info/gnd/118505602', # Bachmann
    'https://d-nb.info/gnd/118516906', # Büchner
    'https://d-nb.info/gnd/118519859', # Celan
    'https://d-nb.info/gnd/118527908', # Dürrenmatt
]

# Leselisten

In [4]:
mode = 'gewichtet'

In [5]:
leselisten_titles = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/01_leselisten.csv", sep = ";")

In [6]:
# Keep only rows that have a GND link. The explicit .copy() makes the result an
# independent frame, so the column assignments in the following cells do not
# trigger pandas' SettingWithCopyWarning.
leselisten_titles = leselisten_titles.query("GND.notna()").copy()

In [7]:
leselisten_titles.head()

Unnamed: 0,Autor,GND,Geschlecht,Titel,Jahreszahl,Jahreszahl_Statistik,Gattung,Modul_Zeit_vor_17.Jh.,Modul_Sprache_(international),Modul_KJL,...,Salzburg,Stuttgart,Stuttgart 2022,Trier,Tübingen,Wien,Wuppertal,Würzburg,Würzburg_2019,Zürich
0,"$Bahr, Ehrhard (Hg.)",https://d-nb.info/gnd/130674966,,Was ist Aufklärung? Thesen und Definitionen,1974,1974.0,Poetik / Essayistik,,,,...,,,,,,,,,,
1,"$Beers, Anna (Hg.)",https://d-nb.info/gnd/1023005093,,Frauen / Lyrik. Gedichte in deutscher Sprache,2020,2020.0,Lyrik,,,,...,,,1.0,,,,,,,
2,"$Behrens, Katja (Hg.)",https://d-nb.info/gnd/124718124,,Frauenbriefe der Romantik,1982,1982.0,Sonstiges,,,,...,,,,,,,,,,
3,"$Bender, Hans (Hg.)",https://d-nb.info/gnd/118508849,,Deutsche Gedichte: 1930-1960,1983,1983.0,Lyrik,,,,...,,,,,,,,,,
4,"$Best, Otto F. (Hg.)",https://d-nb.info/gnd/116155086,,Theorie des Expressionismus,1986,1986.0,Poetik / Essayistik,,,,...,,,,,,,,,,


In [8]:
# Strip trailing whitespace from the author names; missing names stay NaN.
leselisten_titles['Autor'] = leselisten_titles['Autor'].map(
    lambda name: name.rstrip() if pd.notna(name) else float('NaN')
)
# Cast the Jahreszahl_Statistik column to float.
year_column = leselisten_titles['Jahreszahl_Statistik']
leselisten_titles['Jahreszahl_Statistik'] = year_column.astype(float)

In [9]:
leselisten_titles = leselisten_titles.rename(columns = {
    'Modul_Zeit_vor_17.Jh.' : 'Modul_Zeit',
    'Modul_Sprache_(international)' : 'Modul_Sprache',
    'Modul: KJL' : 'Modul_KJL',
})

In [10]:
leselisten_dict = {
    'Aachen' : 'Aachen1',
    'Berlin' : 'FU Berlin',
    'Innsbruck' : 'Innsbruck1',
    'Innsbruck 2023' : 'Innsbruck2',
    'Köln Fundamentum' : 'Köln',
    'LA Aachen' : 'Aachen2',
    'Stuttgart' : 'Stuttgart1',
    'Stuttgart 2022' : 'Stuttgart2',
    'Würzburg' : 'Würzburg1',
    'Würzburg_2019' : 'Würzburg2',
}

leselisten_titles = leselisten_titles.rename(columns=leselisten_dict)

In [11]:
exceptions = [
    'Autor', 'GND', 'Geschlecht', 'Titel', 'Jahreszahl', 'Jahreszahl_Statistik', 'Gattung',
    'Modul_Zeit', 'Modul_Sprache', 'Modul_KJL', 'Dekade', 'Jahrhundert', 'sum'
]
leselisten = [x for x in leselisten_titles.columns.tolist() if x not in exceptions]

In [12]:
def convert_to_float(frac_str):
    """Convert a reading-list cell to ``float``.

    Accepts numbers, numeric strings with a decimal point or a decimal comma
    (``"1,5"``), plain fractions (``"1/2"``, also written with the Unicode
    fraction slash ``⁄``) and mixed numbers (``"1 1/2"``, ``"-1 1/2"``).
    Surrounding whitespace and repeated spaces between the whole number and
    the fraction are tolerated.

    Parameters
    ----------
    frac_str : str, float, int or None
        The raw cell value.

    Returns
    -------
    float
        The numeric value; NaN stays NaN and ``None`` is returned as NaN.

    Raises
    ------
    ValueError
        If a string is neither a number nor a fraction.
    ZeroDivisionError
        If the denominator of a fraction is zero.
    """
    if frac_str is None:
        return float('nan')

    if isinstance(frac_str, str):
        # decimal comma -> decimal point; padding must not break the parsing
        frac_str = frac_str.strip().replace(",", ".")

    try:
        return float(frac_str)
    except ValueError:
        pass  # not a plain number: try to read it as a fraction below

    numerator_part, slash, denominator = frac_str.replace("⁄", "/").partition("/")
    if not slash:
        raise ValueError(f"cannot convert {frac_str!r} to float")

    # A mixed number carries the whole part in front of the numerator.
    tokens = numerator_part.split()
    whole = 0.0
    numerator = numerator_part
    if len(tokens) == 2:
        numerator = tokens[1]
        try:
            whole = float(tokens[0])
        except ValueError:
            # kept from the previous behaviour: a non-numeric leading token
            # is ignored and only the fraction is used
            whole = 0.0

    frac = float(numerator) / float(denominator)
    # the fraction carries the sign of the whole part ("-1 1/2" == -1.5)
    return whole - frac if whole < 0 else whole + frac

# Convert every reading-list cell. DataFrame.applymap is deprecated since
# pandas 2.1; mapping each column with Series.map gives the same element-wise
# result on old and new pandas versions.
leselisten_titles[leselisten] = leselisten_titles[leselisten].apply(
    lambda column: column.map(convert_to_float)
)

In [13]:
# convert the reading-list columns to numbers
leselisten_titles[leselisten] = leselisten_titles[leselisten].astype(float)

# unweighted mode: every entry above 0 counts as 1, everything else (missing
# values, explicit zeros) as 0; in weighted mode the values are kept as they are
if mode == 'ungewichtet':
    leselisten_titles[leselisten] = leselisten_titles[leselisten].gt(0).astype('int')

# replace NaN with 0
leselisten_titles[leselisten] = leselisten_titles[leselisten].fillna(0)

# exclude texts whose author entry starts with $ (anthologies)
leselisten_titles = leselisten_titles[~leselisten_titles['Autor'].str.startswith('$', na = False)]

In [14]:
# One row per GND link, sorted by link: the author label comes from the first
# title row of each link, the list columns are summed over all its titles.
first_row_per_gnd = leselisten_titles.drop_duplicates(subset='GND').sort_values(by='GND')
sums_per_gnd = leselisten_titles.sort_values(by='GND').groupby('GND')[leselisten].sum()

leselisten_authors = pd.DataFrame()
leselisten_authors.index = first_row_per_gnd['GND']
leselisten_authors['Leselisten_Autor'] = first_row_per_gnd['Autor'].tolist()
leselisten_authors[['Leselisten_'+x for x in leselisten]] = sums_per_gnd

In [15]:
leselisten_authors.loc[test_links][[
    'Leselisten_Autor',
    'Leselisten_Göttingen', 'Leselisten_Würzburg1',
]]

Unnamed: 0_level_0,Leselisten_Autor,Leselisten_Göttingen,Leselisten_Würzburg1
GND,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://d-nb.info/gnd/118500856,Aischylos,0.0,3.0
https://d-nb.info/gnd/118501232,"Aichinger, Ilse",0.0,0.0
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",1.0,2.0
https://d-nb.info/gnd/118516906,"Büchner, Georg",2.0,3.0
https://d-nb.info/gnd/118519859,"Celan, Paul",0.0,3.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",1.0,1.0


In [16]:
leselisten_authors.shape

(180, 43)

# Schule

In [17]:
schule_titles = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/02_schule.csv", sep=";")

In [18]:
schule_titles = schule_titles.query("GND.notna()")

In [19]:
schule_titles.head()

Unnamed: 0,ID,Bundesland,Nachname,GND,Geschlecht,Titel,Erscheinungsjahr,Erscheinungsjahr_k,Lit.preis_Name,Lit.preis_Autor_in_Name,Jug.literatur
0,2508,Baden-Württemberg,Äsop,https://d-nb.info/gnd/118647180,männlich,Fabeln,unbekannt,unbekannt,,,nein
1,2447,Sachsen-Anhalt,Äsop,https://d-nb.info/gnd/118647180,männlich,Fabeln,unbekannt,unbekannt,,,nein
2,1778,Baden-Württemberg,Achebe,https://d-nb.info/gnd/118646680,männlich,Okonkwo oder Das Alte stürzt,1959,1958,,,nein
3,297,Hessen,Ade,https://d-nb.info/gnd/138053669,weiblich,Toni Erdmann (Film),2016,2017,"(mehrere Filmpreise, siehe Wikipedia)",,ja
4,3,Hamburg,Ade,https://d-nb.info/gnd/138053669,weiblich,Toni Erdmann (Film),2016,2016,"(mehrere Filmpreise, siehe Wikipedia)",,ja


In [20]:
# Count per author (GND link) how many rows each Bundesland contributes.
# The rows are collected as plain dicts and turned into a frame once, instead
# of growing the frame cell by cell with one query per author and per row.
schule_index = []
schule_rows = []

# sort=False keeps the authors in order of first appearance
for gnd_link, meta_author in schule_titles.groupby('GND', sort=False):
  row = {'Schule_Autor': meta_author['Nachname'].tolist()[0]}
  for bundesland in meta_author['Bundesland']:
    column = 'Schule_'+bundesland
    # float counts, so every count column ends up as float
    row[column] = row.get(column, 0.0) + 1.0
  schule_index.append(gnd_link)
  schule_rows.append(row)

schule_authors = pd.DataFrame(schule_rows, index=schule_index)

# a Bundesland that does not list an author counts as 0
schule_authors = schule_authors.fillna(0)

In [21]:
schule_authors.loc[test_links][[
    'Schule_Autor',
    'Schule_Niedersachsen', 'Schule_Bayern',
]]

Unnamed: 0,Schule_Autor,Schule_Niedersachsen,Schule_Bayern
https://d-nb.info/gnd/118500856,Aischylos,0.0,0.0
https://d-nb.info/gnd/118501232,Aichinger,1.0,0.0
https://d-nb.info/gnd/118505602,Bachmann,2.0,1.0
https://d-nb.info/gnd/118516906,Büchner,4.0,1.0
https://d-nb.info/gnd/118519859,Celan,1.0,1.0
https://d-nb.info/gnd/118527908,Dürrenmatt,5.0,6.0


In [22]:
schule_authors.shape

(128, 13)

# Merge Leselisten + Schule

In [23]:
all_authors = leselisten_authors.join(schule_authors, how='outer')

In [24]:
fill_columns = [x for x in all_authors.columns if '_Autor' not in x]
all_authors[fill_columns] = all_authors[fill_columns].fillna(0)

In [25]:
all_authors.shape

(247, 56)

In [26]:
all_authors.query("Leselisten_Autor.notna() and Schule_Autor.notna()").shape

(61, 56)

In [27]:
all_authors.loc[test_links][[
    'Leselisten_Autor', 'Schule_Autor',
    'Leselisten_Göttingen', 'Leselisten_Würzburg1',
    'Schule_Niedersachsen', 'Schule_Bayern',
]]

Unnamed: 0,Leselisten_Autor,Schule_Autor,Leselisten_Göttingen,Leselisten_Würzburg1,Schule_Niedersachsen,Schule_Bayern
https://d-nb.info/gnd/118500856,Aischylos,Aischylos,0.0,3.0,0.0,0.0
https://d-nb.info/gnd/118501232,"Aichinger, Ilse",Aichinger,0.0,0.0,1.0,0.0
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",Bachmann,1.0,2.0,2.0,1.0
https://d-nb.info/gnd/118516906,"Büchner, Georg",Büchner,2.0,3.0,4.0,1.0
https://d-nb.info/gnd/118519859,"Celan, Paul",Celan,0.0,3.0,1.0,1.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",Dürrenmatt,1.0,1.0,5.0,6.0


# GND

### Scrape

In [28]:
import requests
from bs4 import BeautifulSoup
import re

In [29]:
def get_response_from_gnd (gnd_url, timeout=30):
    """Fetch a GND URL and return the ``requests`` response.

    Parameters
    ----------
    gnd_url : str
        URL to request.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up. Without
        a timeout a stalled connection would block the scraping loop forever.

    Returns
    -------
    requests.Response
        The response as returned by ``requests.get``; the status code is not
        checked here.
    """
    return requests.get(gnd_url, timeout=timeout)

In [30]:
def _gnd_quoted_value (snippet, predicate):
    """Return the quoted literal that follows `predicate` in `snippet`, or '' if there is none."""
    start = snippet.find(predicate)
    if start == -1:
        return ''
    # The literal is expected two characters after the predicate (a space and
    # the opening quote); only the first 100 characters counted from the
    # predicate's start are inspected.
    window = snippet[(start + len(predicate) + 2):(start + 100)]
    matches = re.findall(r'.+?(?=")', window)
    return matches[0] if matches else ''


def get_name_from_gnd_response (response):
    """Extract the preferred name of a person from a GND response.

    Only the 300 characters starting at
    ``gndo:preferredNameEntityForThePerson`` are searched.

    Parameters
    ----------
    response : object with a ``text`` attribute
        The response whose text is parsed.

    Returns
    -------
    list of str
        ``[forename, surname]``. A ``gndo:prefix`` is appended to the
        forename, ``gndo:personalName`` replaces the surname and
        ``gndo:nameAddition`` is appended to it. Parts that are not found are
        returned as empty strings.
    """
    preferred_name_start = response.text.find('gndo:preferredNameEntityForThePerson')
    if preferred_name_start == -1:
        return ['', '']
    preferred_name_slice = response.text[preferred_name_start:preferred_name_start+300]

    forename = _gnd_quoted_value(preferred_name_slice, "gndo:forename")
    prefix = _gnd_quoted_value(preferred_name_slice, "gndo:prefix")
    surname = _gnd_quoted_value(preferred_name_slice, "gndo:surname")
    personalname = _gnd_quoted_value(preferred_name_slice, "gndo:personalName")
    nameaddition = _gnd_quoted_value(preferred_name_slice, "gndo:nameAddition")

    # join only the parts that exist, so a missing forename or surname
    # does not leave a leading space
    if prefix:
        forename = ' '.join(part for part in (forename, prefix) if part)
    if personalname:
        surname = personalname
    if nameaddition:
        surname = ' '.join(part for part in (surname, nameaddition) if part)

    return [forename, surname]

In [31]:
def get_gender_from_gnd_response (response):
    """Read the gender from a GND response.

    Looks for the first ``vocab/gnd/gender#`` marker and inspects the six
    characters that follow it.

    Parameters
    ----------
    response : object with a ``text`` attribute
        The response whose text is parsed.

    Returns
    -------
    str or float
        ``'female'`` or ``'male'``; for any other value the six characters
        after the marker are returned unchanged. NaN if the response contains
        no gender marker.
    """
    marker = 'vocab/gnd/gender#'

    gender_pos_start = response.text.find(marker)
    if gender_pos_start == -1:
        # without this guard, find()'s -1 would select an unrelated piece of
        # the response text as "gender"
        return float('nan')

    value_start = gender_pos_start + len(marker)
    gender_content = response.text[value_start:value_start + 6]

    # 'female' has to be tested first because it contains 'male'
    if 'female' in gender_content:
        return 'female'
    if 'male' in gender_content:
        return 'male'
    return gender_content

In [32]:
def get_lifetime_from_gnd_response (response):
    """Read the years of birth and death from a GND response.

    For each of ``gndo:dateOfBirth`` and ``gndo:dateOfDeath`` the four
    characters starting 18 characters after the beginning of the predicate
    are taken as the year, provided they are numeric.

    Returns
    -------
    list
        ``[birth_year, death_year]``; an entry is NaN when the predicate is
        missing or the four characters are not numeric.
    """
    text = response.text

    def year_after (predicate):
        position = text.find(predicate)
        if position == -1:
            return float('nan')
        candidate = text[position+18:position+22]
        return int(candidate) if candidate.isnumeric() else float('nan')

    return [year_after('gndo:dateOfBirth '), year_after('gndo:dateOfDeath ')]

In [33]:
def get_countries_from_gnd_response (response):
    """Read the countries from the HTML view of a GND entry.

    Searches the table cells for one whose ``<strong>`` label is ``Land`` and
    splits the text of the following cell at ``'; '``.

    Parameters
    ----------
    response : object with a ``text`` attribute
        The response whose HTML is parsed.

    Returns
    -------
    list of str or float
        The countries, or NaN if no ``Land`` cell is found. If several label
        cells match, the last one wins.
    """
    countries = float('nan')

    soup = BeautifulSoup(response.text, "html.parser")
    td_elements = soup.find_all('td')

    for i, td_element in enumerate(td_elements):
        is_label = td_element.strong is not None and td_element.strong.string == 'Land'
        # a label in the very last cell has no value cell after it
        if is_label and i + 1 < len(td_elements):
            # get_text() also works when the value cell contains nested
            # markup, where .string would be None
            countries = td_elements[i+1].get_text().strip().split('; ')

    return countries

In [34]:
def get_occupations_from_gnd_response (response):
    """Read the occupations from the HTML view of a GND entry.

    Searches the table cells for one whose ``<strong>`` label is ``Beruf(e)``
    and collects the link texts of the following cell. If that cell has no
    links, its plain text is used as the single occupation.

    Parameters
    ----------
    response : object with a ``text`` attribute
        The response whose HTML is parsed.

    Returns
    -------
    list of str or float
        The occupations, or NaN if no ``Beruf(e)`` cell is found. If several
        label cells match, the last one wins.
    """
    occupations = float('nan')

    soup = BeautifulSoup(response.text, "html.parser")
    td_elements = soup.find_all('td')

    for i, td_element in enumerate(td_elements):
        is_label = td_element.strong is not None and td_element.strong.string == 'Beruf(e)'
        # a label in the very last cell has no value cell after it
        if is_label and i + 1 < len(td_elements):
            value_cell = td_elements[i+1]
            # get_text() also works for links with nested markup, where
            # .string would be None
            occupations = [link.get_text() for link in value_cell.find_all('a')]

            if len(occupations) == 0: # the cell exists but holds plain text instead of links
                occupations = [value_cell.get_text().strip()]

    return occupations

In [35]:
gnd = pd.DataFrame()

for gnd_link in tqdm(all_authors.index):
  # request the '/about/lds' representation of the entry; the parsers below read it
  response_b = get_response_from_gnd(gnd_link + '/about/lds')

  # build "surname, forename"; entries without a forename keep the surname only
  forename, surname = get_name_from_gnd_response(response_b)
  if surname == '':
    print(f"{gnd_link} Fehler (Name)")
    author_name = float('NaN')
  elif forename != '':
    author_name = surname + ', ' + forename
  else:
    author_name = surname
  gnd.at[gnd_link, 'GND_Autor'] = author_name

  gnd.at[gnd_link, 'GND_Gender'] = get_gender_from_gnd_response(response_b)

  # parse the life dates once and reuse both values
  birth_year, death_year = get_lifetime_from_gnd_response(response_b)
  gnd.at[gnd_link, 'GND_Geburtsjahr'] = birth_year
  gnd.at[gnd_link, 'GND_Sterbejahr'] = death_year

  0%|          | 0/247 [00:00<?, ?it/s]

In [36]:
import unicodedata

gnd = gnd.sort_values(by = 'GND_Autor')

def _normalize_author_name (name):
    """Return `name` with combining accents composed (NFC); NaN is passed through."""
    if pd.isna(name):
        return name
    # Presumably the scraped names contain decomposed characters (base letter
    # plus combining accent) and the former per-character replacements turned
    # them into their composed forms. NFC normalization does this for every
    # character instead of a hand-picked list.
    name = unicodedata.normalize('NFC', name)
    # NOTE(review): the former list replaced 'ž' by a plain 'z' instead of the
    # composed letter; this is kept so the stored names do not change.
    # Confirm whether dropping the caron is intended.
    return name.replace('\u017e', 'z')

gnd['GND_Autor'] = [_normalize_author_name(x) for x in gnd['GND_Autor']]

In [37]:
gnd.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/00_gnd.csv")

### Import and Merge

In [38]:
gnd = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/00_gnd.csv", index_col = [0])

In [39]:
# Copy the GND metadata into all_authors for every author found in the GND table.
gnd_columns = ['GND_Autor', 'GND_Gender', 'GND_Geburtsjahr', 'GND_Sterbejahr']

for this_index in all_authors.index:
  if this_index not in gnd.index:
    continue
  for gnd_column in gnd_columns:
    all_authors.at[this_index, gnd_column] = gnd.at[this_index, gnd_column]

In [40]:
all_authors.loc[test_links][[
    'Leselisten_Autor', 'Schule_Autor', 'GND_Autor'
]]

Unnamed: 0,Leselisten_Autor,Schule_Autor,GND_Autor
https://d-nb.info/gnd/118500856,Aischylos,Aischylos,Aeschylus
https://d-nb.info/gnd/118501232,"Aichinger, Ilse",Aichinger,"Aichinger, Ilse"
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",Bachmann,"Bachmann, Ingeborg"
https://d-nb.info/gnd/118516906,"Büchner, Georg",Büchner,"Büchner, Georg"
https://d-nb.info/gnd/118519859,"Celan, Paul",Celan,"Celan, Paul"
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",Dürrenmatt,"Dürrenmatt, Friedrich"


In [41]:
# Are there authors with different GND links that share the same GND name?
gnd_name_counts = all_authors['GND_Autor'].value_counts()
gnd_name_counts[gnd_name_counts > 1]

Series([], Name: count, dtype: int64)

# Wikipedia

### Scrape

In [42]:
# !pip3 install tools
# !pip install pywikibot

In [43]:
# Code (slightly adapted) from: https://github.com/temporal-communities/wiki-metrix

# Illmer, V. J., Soethaert, B., Welz, L., Fischer, F., & Jäschke, R. (2024, February 21).
# Literatur im Wikiversum – Eine praktische Annäherung über API-Abfragen und Wikipedia-Metriken.
# DHd 2024 Quo Vadis DH (DHd2024), Passau, Germany. https://doi.org/10.5281/zenodo.10698426

# Minimal pywikibot configuration: German Wikipedia with the user name 'test'.
pywikibot_config = r"""# -*- coding: utf-8  -*-


mylang = 'de'
family = 'wikipedia'
usernames['wikipedia']['de'] = 'test'"""

# Write the configuration to user-config.py in the current working directory.
# NOTE(review): presumably the file has to exist before `import pywikibot`
# below is executed - confirm.
with open('user-config.py', 'w', encoding="utf-8") as f:
    f.write(pywikibot_config)

import pywikibot
import requests
import datetime
import urllib.parse

def get_page_stats(page: pywikibot.Page):
    """
    Get page stats for a given page.

    Fetches the page text and the revision list and collects size,
    contributor, revision, link, category and pageview counts.

    Args:
        page: The pywikibot page to inspect.

    Returns:
        A dict with the keys "title", "url", "length", "n_contributors",
        "n_revisions", "n_extlinks", "n_langlinks", "n_links", "n_linkshere",
        "n_categories", "pageviews_365d", "pageviews_730d", "pageviews_1825d"
        and "first_revision".

    Prints a warning for every value that equals MW_API_LIMIT (500).
    Exceptions raised by pywikibot or by get_pageviews are not caught here.
    """

    # Handle redirects
    # page = handle_redirect(page)

    page_content = page.get(force=True)
    length_in_bytes = len(page_content.encode("utf-8"))  # size of the UTF-8 encoded page text
    # NOTE(review): reverse=True presumably yields the oldest revision first,
    # which is what "first_revision" below relies on - confirm.
    page_revisions = list(page.revisions(reverse=True))

    data = {
        "title": page.title(),
        "url": page.full_url(),
        "length": length_in_bytes,
        "n_contributors": len(page.contributors()),
        "n_revisions": len(page_revisions),
        "n_extlinks": len(list(page.extlinks())),
        "n_langlinks": len(page.langlinks()),
        "n_links": len(list(page.linkedPages())),
        # NOTE(review): "n_linkshere" is computed with linkedPages(), the same
        # call as "n_links", only restricted to namespace 0. The key name
        # suggests that incoming links were meant - verify against the
        # pywikibot API and the upstream wiki-metrix code.
        "n_linkshere": len(
            list(page.linkedPages(namespaces=[0], follow_redirects=False))
        ),  # Article namespace only (0)
        "n_categories": len(list(page.categories())),
        "pageviews_365d": get_pageviews(page, days=365),
        "pageviews_730d": get_pageviews(page, days=730),
        "pageviews_1825d": get_pageviews(page, days=1825),
        "first_revision": page_revisions[0].timestamp,
    }

    MW_API_LIMIT = 500
    # Give warning if any value is at the limit
    for key, value in data.items():
        if value == MW_API_LIMIT:
            print(f"Warning: {key} at limit {MW_API_LIMIT}.")

    return data

# Use Wikimedia Pageviews REST API to get pageviews
def get_pageviews(page: pywikibot.Page, days=365, timeout=30):
    """
    Sum the monthly pageviews of a page via the Wikimedia Pageviews REST API.

    The queried range ends two days before today and starts `days` days
    before that end date. Only views by agent type "user" are requested.

    Args:
        page: The pywikibot page; language code and family name are taken
            from its site.
        days: Length of the queried range in days.
        timeout: Seconds to wait for the API before `requests` gives up, so
            a stalled connection cannot block the scraping loop forever.

    Returns:
        The sum of the "views" values of all returned items.

    Raises:
        Exception: If the API does not answer with status code 200.
    """
    lang = page.site.code
    site = page.site.family.name

    # Wikimedia REST API
    # https://wikitech.wikimedia.org/wiki/Analytics/AQS/Pageviews
    # https://wikimedia.org/api/rest_v1/
    end_date = datetime.date.today() - datetime.timedelta(days=2)  # Two days ago
    start_date = end_date - datetime.timedelta(days=days)  # Two days minus [days] ago

    agent_type = "user"  # user, bot, spider, all-agents
    title_uri = urllib.parse.quote(
        page.title(underscore=True, with_section=False), safe=""
    )  # URI-encoded title, no safe characters
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{lang}.{site}/all-access/{agent_type}/{title_uri}/monthly/{start_date.strftime('%Y%m%d')}/{end_date.strftime('%Y%m%d')}"

    user_agent = f"wiki-metrix (https://github.com/temporal-communities/wiki-metrix) requests/{requests.__version__}"
    response = requests.get(url, headers={"User-Agent": user_agent}, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code} {response.reason}")

    data = response.json()
    # missing (None) view counts are counted as 0
    pageviews_sum = sum(item["views"] or 0 for item in data["items"])

    return pageviews_sum

def add_page_stats_to_df (df, wikipedia_article_column = 'Wikipedia_Artikel'):
    """
    Add the page stats of the article named in each row as new columns.

    Args:
        df: Frame with one Wikipedia article title per row.
        wikipedia_article_column: Name of the column holding the titles.

    Returns:
        A new frame: `df` joined with one column per key returned by
        get_page_stats. The stats carry the index of `df`, so each row
        receives the stats of its own article even if `df` does not have a
        default 0..n-1 index.
    """
    site = pywikibot.Site('de', 'wikipedia')  # The site we want to run our bot on

    # collect one dict per article and build the frame once instead of
    # concatenating inside the loop
    wikiresults = [
        get_page_stats(pywikibot.Page(site, article))
        for article in tqdm(df[wikipedia_article_column])
    ]
    wikiresults_joined_df = pd.DataFrame(wikiresults, index=df.index)

    return df.join(wikiresults_joined_df)

In [44]:
site = pywikibot.Site('de', 'wikipedia')

In [45]:
# one single-row frame per author; concatenated once after the loop
wiki_rows = []

for gnd_link in tqdm(all_authors.sort_values(by='GND_Autor').index):

  author = all_authors.loc[gnd_link, 'GND_Autor']
  if pd.isna(author):
    author = '(nan) XXX'
  # "surname, forename" -> "forename surname", which is used as the page title
  author_reversed = ' '.join(str(author).split(', ')[::-1])

  page = pywikibot.Page(site, author_reversed)

  try:
    wiki_author = get_page_stats(page)
    wiki_author['GND_Autor'] = author
    wiki_author['wiki_status'] = 'page_found'
    wiki_author = pd.DataFrame(pd.Series(wiki_author)).T
    wiki_author.index = [gnd_link]
  except Exception:
    # Best effort: any failure while collecting the stats is recorded as
    # 'page_not_found'. Unlike a bare except, this still lets
    # KeyboardInterrupt stop a long-running scrape.
    wiki_author = pd.DataFrame(index = [gnd_link])
    wiki_author['GND_Autor'] = author
    wiki_author['wiki_status'] = 'page_not_found'
    wiki_author['pageviews_365d'] = 0

  if wiki_author['wiki_status'].tolist()[0] == 'page_not_found':
    print(f"{gnd_link} {author} (Seite nicht gefunden)")
  else:
    print(f"{gnd_link} {author} ({wiki_author['pageviews_365d'].tolist()[0]} pageviews)")

  wiki_rows.append(wiki_author)

wiki = pd.concat(wiki_rows) if wiki_rows else pd.DataFrame()

  0%|          | 0/247 [00:00<?, ?it/s]

  link._site = pywikibot.Site(lang, source.family.name)


https://d-nb.info/gnd/118646680 Achebe, Chinua (9578 pageviews)
https://d-nb.info/gnd/119369125 Achleitner, Friedrich (3609 pageviews)
https://d-nb.info/gnd/118500422 Achternbusch, Herbert (14083 pageviews)
https://d-nb.info/gnd/138053669 Ade, Maren (26956 pageviews)
https://d-nb.info/gnd/118500775 Adorno, Theodor W. (168277 pageviews)
https://d-nb.info/gnd/118500856 Aeschylus (Seite nicht gefunden)
https://d-nb.info/gnd/118647180 Aesopus (265 pageviews)
https://d-nb.info/gnd/119293439 Agamben, Giorgio (17610 pageviews)
https://d-nb.info/gnd/118501232 Aichinger, Ilse (31821 pageviews)
https://d-nb.info/gnd/118501259 Ajtmatov, Čingiz (Seite nicht gefunden)
https://d-nb.info/gnd/118501380 Albee, Edward (5754 pageviews)
https://d-nb.info/gnd/119604108 Alciato, Andreas (Seite nicht gefunden)


  link._site = pywikibot.Site(lang, source.family.name)


https://d-nb.info/gnd/1256774294 Alighieri, Dante (164514 pageviews)
https://d-nb.info/gnd/118869159 Allende, Isabel (93330 pageviews)
https://d-nb.info/gnd/123526183 Almond, David (998 pageviews)
https://d-nb.info/gnd/118502255 Altenberg, Peter (19032 pageviews)
https://d-nb.info/gnd/115674861 Altenburg, Matthias (5154 pageviews)
https://d-nb.info/gnd/118937197 Amery, Carl (9443 pageviews)
https://d-nb.info/gnd/118502786 Andersch, Alfred (29942 pageviews)
https://d-nb.info/gnd/118502794 Andersen, Hans Christian (184724 pageviews)
https://d-nb.info/gnd/119111365 Anderson, Sascha (27080 pageviews)
https://d-nb.info/gnd/118502921 Andreas-Salomé, Lou (74414 pageviews)
https://d-nb.info/gnd/118502956 Andres, Stefan (8736 pageviews)
https://d-nb.info/gnd/118503111 Angelus Silesius (23309 pageviews)
https://d-nb.info/gnd/120315106 Ani, Friedrich (13507 pageviews)
https://d-nb.info/gnd/118503251 Anouilh, Jean (9157 pageviews)
https://d-nb.info/gnd/11850357X Anzengruber, Ludwig (10239 pageview

In [46]:
wiki = wiki.sort_values(by = 'GND_Autor')

In [47]:
wiki.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/04_wiki.csv")

### Import and Merge

In [48]:
wiki = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/04_wiki.csv", index_col = [0])

In [49]:
# Copy page title and 365-day pageviews into all_authors for every author
# found in the Wikipedia table.
wiki_columns = {'Wiki_Autor': 'title', 'Wiki_Summe': 'pageviews_365d'}

for this_index in all_authors.index:
  if this_index not in wiki.index:
    continue
  for target_column, wiki_column in wiki_columns.items():
    all_authors.at[this_index, target_column] = wiki.at[this_index, wiki_column]

In [50]:
all_authors.loc[test_links][[
    'Leselisten_Autor', 'Wiki_Autor',
    'Wiki_Summe'
]]

Unnamed: 0,Leselisten_Autor,Wiki_Autor,Wiki_Summe
https://d-nb.info/gnd/118500856,Aischylos,,0.0
https://d-nb.info/gnd/118501232,"Aichinger, Ilse",Ilse Aichinger,31821.0
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",Ingeborg Bachmann,394707.0
https://d-nb.info/gnd/118516906,"Büchner, Georg",Georg Büchner,194957.0
https://d-nb.info/gnd/118519859,"Celan, Paul",Paul Celan,93287.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",Friedrich Dürrenmatt,183613.0


# BDSL

### Scrape

In [51]:
# !pip3 install google_colab_selenium
import google_colab_selenium as gs
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

In [52]:
driver = gs.Chrome()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [53]:
import getpass
import os

# log in via SUB Göttingen
# NOTE(review): the login page is requested over plain http - check whether
# an https address is available.
driver.get("http://han.sub.uni-goettingen.de/han/BDSL")

# Credentials must not be stored in the notebook: they are read from the
# environment variables BDSL_USER / BDSL_PASSWORD or asked for interactively.
# The user number and password that used to be hardcoded here have been
# published with the notebook and should be changed.
bdsl_user = os.environ.get("BDSL_USER") or input("SUB Göttingen user number: ")
bdsl_password = os.environ.get("BDSL_PASSWORD") or getpass.getpass("SUB Göttingen password: ")

username = driver.find_element(By.ID, "plainuser")
username.send_keys(bdsl_user)

password = driver.find_element(By.ID, "password")
password.send_keys(bdsl_password)

# submit the login form
password.send_keys(Keys.RETURN)

In [54]:
bdsl = pd.DataFrame()

for gnd_link in tqdm(all_authors.sort_values(by='GND_Autor').index):
  author = all_authors.loc[gnd_link, 'GND_Autor']
  if pd.isna(author):
    author = '(nan) XXX'

  # go to search
  driver.find_element(By.LINK_TEXT, "Suche").click()

  # change dropdown to 'Behandelte Person'
  dropdown_menu = driver.find_element(By.NAME, "DD1")
  select = Select(dropdown_menu)
  select.select_by_value("4")

  # enter author name
  input_field = driver.find_element(By.NAME, "SF1")
  input_field.send_keys(author)

  # search for author
  input_field.send_keys(Keys.RETURN)

  # get number of results; the pattern is a raw string, because "\d" in a
  # normal string literal is an invalid escape sequence
  page_source = driver.page_source
  hits_match = re.search(r"(\d*) Titel gefunden", page_source)

  # no result line, or a result line without a number, counts as 0 hits
  if hits_match is None or hits_match.group(1) == '':
    hits_int = 0
  else:
    hits_int = int(hits_match.group(1))

  print(f"{gnd_link} {author} ({hits_int} Treffer)")
  bdsl.at[gnd_link, 'GND_Autor'] = author
  bdsl.at[gnd_link, 'BDSL_Summe'] = hits_int

bdsl['BDSL_Summe'] = bdsl['BDSL_Summe'].fillna(0)

  0%|          | 0/247 [00:00<?, ?it/s]

https://d-nb.info/gnd/118646680 Achebe, Chinua (0 Treffer)
https://d-nb.info/gnd/119369125 Achleitner, Friedrich (31 Treffer)
https://d-nb.info/gnd/118500422 Achternbusch, Herbert (75 Treffer)
https://d-nb.info/gnd/138053669 Ade, Maren (0 Treffer)
https://d-nb.info/gnd/118500775 Adorno, Theodor W. (1282 Treffer)
https://d-nb.info/gnd/118500856 Aeschylus (0 Treffer)
https://d-nb.info/gnd/118647180 Aesopus (0 Treffer)
https://d-nb.info/gnd/119293439 Agamben, Giorgio (0 Treffer)
https://d-nb.info/gnd/118501232 Aichinger, Ilse (537 Treffer)
https://d-nb.info/gnd/118501259 Ajtmatov, Čingiz (0 Treffer)
https://d-nb.info/gnd/118501380 Albee, Edward (0 Treffer)
https://d-nb.info/gnd/119604108 Alciato, Andreas (0 Treffer)
https://d-nb.info/gnd/1256774294 Alighieri, Dante (0 Treffer)
https://d-nb.info/gnd/118869159 Allende, Isabel (0 Treffer)
https://d-nb.info/gnd/123526183 Almond, David (0 Treffer)
https://d-nb.info/gnd/118502255 Altenberg, Peter (154 Treffer)
https://d-nb.info/gnd/115674861 Al

In [55]:
bdsl.to_csv("/content/drive/MyDrive/2024.Kanonizität/resources/03_bdsl.csv")

### Import and Merge

In [56]:
bdsl = pd.read_csv("/content/drive/MyDrive/2024.Kanonizität/resources/03_bdsl.csv", index_col = [0])

In [57]:
# Copy the BDSL hit count into all_authors for every author found in the BDSL table.
for this_index in all_authors.index:
  if this_index not in bdsl.index:
    continue
  bdsl_hits = bdsl.at[this_index, 'BDSL_Summe']
  all_authors.at[this_index, 'BDSL_Summe'] = bdsl_hits

In [58]:
all_authors.loc[test_links][[
    'Leselisten_Autor', 'Schule_Autor',
    'BDSL_Summe'
]]

Unnamed: 0,Leselisten_Autor,Schule_Autor,BDSL_Summe
https://d-nb.info/gnd/118500856,Aischylos,Aischylos,0.0
https://d-nb.info/gnd/118501232,"Aichinger, Ilse",Aichinger,537.0
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",Bachmann,2218.0
https://d-nb.info/gnd/118516906,"Büchner, Georg",Büchner,1775.0
https://d-nb.info/gnd/118519859,"Celan, Paul",Celan,2951.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",Dürrenmatt,1075.0


# Metrics

In [59]:
# Reading lists: sum over all list columns, share of the overall sum, and rank (1 = highest sum).
leselisten_columns = ['Leselisten_'+x for x in leselisten]
all_authors['Leselisten_Summe'] = all_authors[leselisten_columns].sum(axis=1)

leselisten_total = all_authors['Leselisten_Summe'].sum()
all_authors['Leselisten_Anteil'] = all_authors['Leselisten_Summe'] / leselisten_total
all_authors['Leselisten_Rang'] = all_authors['Leselisten_Summe'].rank(ascending=False)

In [60]:
# School canon: sum over all Bundesland columns, share of the overall sum, and rank (1 = highest sum).
schule_columns = ['Schule_'+x for x in schule_titles['Bundesland'].unique()]
all_authors['Schule_Summe'] = all_authors[schule_columns].sum(axis=1)

schule_total = all_authors['Schule_Summe'].sum()
all_authors['Schule_Anteil'] = all_authors['Schule_Summe'] / schule_total
all_authors['Schule_Rang'] = all_authors['Schule_Summe'].rank(ascending=False)

In [61]:
# BDSL: share of the overall hit count and rank (1 = most hits).
bdsl_total = all_authors['BDSL_Summe'].sum()
all_authors['BDSL_Anteil'] = all_authors['BDSL_Summe'] / bdsl_total
all_authors['BDSL_Rang'] = all_authors['BDSL_Summe'].rank(ascending=False)

In [62]:
# Wikipedia: share of the overall pageviews and rank (1 = most pageviews).
wiki_total = all_authors['Wiki_Summe'].sum()
all_authors['Wiki_Anteil'] = all_authors['Wiki_Summe'] / wiki_total
all_authors['Wiki_Rang'] = all_authors['Wiki_Summe'].rank(ascending=False)

# Check

In [63]:
all_authors.head()

Unnamed: 0,Leselisten_Autor,Leselisten_Aachen2,Leselisten_Aachen1,Leselisten_Augsburg,Leselisten_FU Berlin,Leselisten_Bochum,Leselisten_Braunschweig,Leselisten_Dortmund,Leselisten_Mannheim,Leselisten_Eichstätt-Ingolstadt,...,Leselisten_Summe,Leselisten_Anteil,Leselisten_Rang,Schule_Summe,Schule_Anteil,Schule_Rang,BDSL_Anteil,BDSL_Rang,Wiki_Anteil,Wiki_Rang
https://d-nb.info/gnd/1016386486,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,214.0,1.0,0.002119,100.5,0.000103,156.5,0.001322,126.0
https://d-nb.info/gnd/108082652,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,214.0,2.0,0.004237,56.0,0.0,204.5,6.5e-05,214.0
https://d-nb.info/gnd/1095775561,"Barthel, Kurt",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.000716,138.5,0.0,0.0,188.0,0.0,204.5,0.000626,163.0
https://d-nb.info/gnd/111492167X,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,214.0,1.0,0.002119,100.5,0.0,204.5,0.0,236.0
https://d-nb.info/gnd/115371915,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,214.0,2.0,0.004237,56.0,0.0,204.5,0.000877,141.0


In [64]:
all_authors.loc[test_links][[
    'GND_Autor',
    'Leselisten_Rang', 'Schule_Rang', 'BDSL_Rang', 'Wiki_Rang'
]]

Unnamed: 0,GND_Autor,Leselisten_Rang,Schule_Rang,BDSL_Rang,Wiki_Rang
https://d-nb.info/gnd/118500856,Aeschylus,29.0,100.5,204.5,236.0
https://d-nb.info/gnd/118501232,"Aichinger, Ilse",15.0,100.5,20.0,67.0
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",3.0,5.0,4.0,1.0
https://d-nb.info/gnd/118516906,"Büchner, Georg",2.0,3.0,6.0,8.0
https://d-nb.info/gnd/118519859,"Celan, Paul",6.0,19.0,3.0,32.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",7.0,2.0,13.0,11.0


# Ergebnisse

In [65]:
all_authors.sort_values(by='Leselisten_Rang').head(20)[[
    'GND_Autor',
    'Leselisten_Rang', 'Schule_Rang', 'BDSL_Rang', 'Wiki_Rang'
]]

Unnamed: 0,GND_Autor,Leselisten_Rang,Schule_Rang,BDSL_Rang,Wiki_Rang
https://d-nb.info/gnd/118514768,"Brecht, Bertolt",1.0,1.0,1.0,2.0
https://d-nb.info/gnd/118516906,"Büchner, Georg",2.0,3.0,6.0,8.0
https://d-nb.info/gnd/118505602,"Bachmann, Ingeborg",3.0,5.0,4.0,1.0
https://d-nb.info/gnd/118509047,"Benn, Gottfried",4.0,8.5,9.0,34.0
https://d-nb.info/gnd/118509861,"Bernhard, Thomas",5.0,13.0,5.0,26.0
https://d-nb.info/gnd/118519859,"Celan, Paul",6.0,19.0,3.0,32.0
https://d-nb.info/gnd/118527908,"Dürrenmatt, Friedrich",7.0,2.0,13.0,11.0
https://d-nb.info/gnd/118527533,"Droste-Hülshoff, Annette von",8.0,188.0,14.0,15.0
https://d-nb.info/gnd/118526200,"Döblin, Alfred",9.0,12.0,7.0,45.0
https://d-nb.info/gnd/118512676,"Böll, Heinrich",10.0,4.0,11.0,18.0


In [66]:
px.bar(
    all_authors.sort_values(by='Leselisten_Anteil').tail(10),
    x = 'GND_Autor',
    y = ['Leselisten_Anteil', 'Schule_Anteil', 'BDSL_Anteil', 'Wiki_Anteil'],
    barmode='group'
)

In [67]:
all_authors[[
    'Leselisten_Summe', 'Schule_Summe', 'BDSL_Summe', 'Wiki_Summe',
    ]].corr(method='pearson')

Unnamed: 0,Leselisten_Summe,Schule_Summe,BDSL_Summe,Wiki_Summe
Leselisten_Summe,1.0,0.832602,0.863487,0.555381
Schule_Summe,0.832602,1.0,0.688283,0.538562
BDSL_Summe,0.863487,0.688283,1.0,0.530032
Wiki_Summe,0.555381,0.538562,0.530032,1.0
