In [1]:
import pandas as pd
import glob
import re

In [2]:
files = glob.glob("../raw_data/*")

# indicators

In [3]:
# when multiple gnd_ids appear in one line (connected with '+'), split them in multiple lines
def expand_multiple_ids(df, column='GND'):
    new_rows_list = []
    rows_to_drop = []

    rows_with_plus = df[df[column].str.contains(r'\+', na=False)]

    for index, row in rows_with_plus.iterrows():
        gnd_ids = row[column].split(' + ')
        rows_to_drop.append(index)
        for gnd_id in gnd_ids:
            new_row = row.copy()
            new_row[column] = gnd_id
            new_rows_list.append(new_row)

    if new_rows_list:
        new_rows_df = pd.DataFrame(new_rows_list)
        df = df.drop(rows_to_drop)
        df = pd.concat([df, new_rows_df], ignore_index=True)
    
    return df

## unilist

In [4]:
unilist_files = [x for x in files if 'unilist' in x]

In [5]:
# prepare individual unilists
unilists = {}

for this_file in unilist_files:
    unilist_name = this_file.split("/")[-1].split(".")[0]
    unilist_df = pd.read_csv(this_file)
    unilist_df = expand_multiple_ids(unilist_df)
    unilist_df = unilist_df[~unilist_df['author'].str.startswith('$', na = False)] # filter editors
    unilist_df = unilist_df.groupby(['GND'], as_index=False)['prob_to_read'].sum()
    unilist_df = unilist_df.rename(columns={'prob_to_read' : 'prob_to_read_sum_'+unilist_name})
    unilist_df['prob_to_read_share_'+unilist_name] = unilist_df['prob_to_read_sum_'+unilist_name]/unilist_df['prob_to_read_sum_'+unilist_name].sum()
    
    unilists[unilist_name] = unilist_df

In [6]:
# merge unilists
unilist_iter = iter(unilists.items())
first_name, unilist_merged = next(unilist_iter)

for unilist_name, unilist_df in unilist_iter:
    unilist_merged = pd.merge(
        unilist_merged,
        unilist_df,
        on='GND',
        how='outer',
    )
unilist_merged = unilist_merged.fillna(0).reset_index(drop=True)

In [7]:
unilist_merged.sort_values(by='prob_to_read_sum_unilist_goettingen', ascending=False)

Unnamed: 0,GND,prob_to_read_sum_unilist_heidelberg_2017,prob_to_read_share_unilist_heidelberg_2017,prob_to_read_sum_unilist_aachen_201718,prob_to_read_share_unilist_aachen_201718,prob_to_read_sum_unilist_goettingen,prob_to_read_share_unilist_goettingen,prob_to_read_sum_unilist_luxemburg,prob_to_read_share_unilist_luxemburg,prob_to_read_sum_unilist_graz_2021,...,prob_to_read_sum_unilist_innsbruck,prob_to_read_share_unilist_innsbruck,prob_to_read_sum_unilist_osnabrueck,prob_to_read_share_unilist_osnabrueck,prob_to_read_sum_unilist_la_aachen_2018,prob_to_read_share_unilist_la_aachen_2018,prob_to_read_sum_unilist_fu_berlin,prob_to_read_share_unilist_fu_berlin,prob_to_read_sum_unilist_zuerich_2013,prob_to_read_share_unilist_zuerich_2013
200,118540238,5.727273,0.086515,2.492755,0.040582,6.000000,0.093750,7.0,0.097222,4.0,...,3.0,0.081081,2.500000,0.051282,1.500000,0.060549,13.0,0.050000,9.0,0.037190
434,118607626,3.636364,0.054930,2.216484,0.036084,3.333333,0.052083,3.0,0.041667,2.0,...,3.0,0.081081,1.250000,0.025641,0.845000,0.034109,10.0,0.038462,6.0,0.024793
304,118563076,3.764706,0.056869,0.663836,0.010807,3.000000,0.046875,2.0,0.027778,1.0,...,1.0,0.027027,1.250000,0.025641,0.819000,0.033060,7.0,0.026923,4.0,0.016529
356,118577166,1.194805,0.018048,0.176000,0.002865,3.000000,0.046875,1.0,0.013889,1.0,...,1.0,0.027027,0.833333,0.017094,0.076923,0.003105,6.0,0.023077,5.0,0.020661
890,12989432X,1.000000,0.015106,0.213287,0.003472,2.000000,0.031250,2.0,0.027778,0.0,...,0.0,0.000000,0.000000,0.000000,0.292000,0.011787,2.0,0.007692,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
908,138119813,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
909,14009931X,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
910,140917462,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.013889,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
911,141869674,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000


## litges

In [8]:
brenner = pd.read_csv("../raw_data/litges_brenner.csv", sep=";", index_col=[0])
brenner = brenner[['GND-Nummer', 'Anzahl_Seiten_neu']]
brenner = brenner.rename(columns={'GND-Nummer' : 'GND', 'Anzahl_Seiten_neu' : 'page_count'})
brenner['page_count_rel'] = brenner['page_count']/369
brenner = brenner.query("GND.notna()")

In [9]:
beutin = pd.read_csv("../raw_data/litges_beutin.csv", sep=";", index_col=[0])
beutin = beutin.query("ignore.isna()")
beutin = beutin[['GND-Nummer', 'Anzahl_Seiten_neu']]
beutin = beutin.rename(columns={'GND-Nummer' : 'GND', 'Anzahl_Seiten_neu' : 'page_count'})
beutin['page_count_rel'] = beutin['page_count']/758
beutin = beutin.query("GND.notna()")

In [10]:
litges_merged = pd.merge(
        beutin,
        brenner,
        on='GND',
        how='outer',
        suffixes=['_litges_beutin', '_litges_brenner']
    )
litges_merged = litges_merged.fillna(0).reset_index(drop=True)

In [11]:
litges_merged.sort_values(by='page_count_litges_brenner', ascending=False)

Unnamed: 0,GND,page_count_litges_beutin,page_count_rel_litges_beutin,page_count_litges_brenner,page_count_rel_litges_brenner
455,118540238,62.0,0.081794,27.0,0.073171
731,118577166,27.0,0.035620,24.0,0.065041
275,118514768,98.0,0.129288,19.0,0.051491
918,118607626,52.0,0.068602,18.0,0.048780
1083,118632477,11.0,0.014512,15.0,0.040650
...,...,...,...,...,...
1803,13213523X,1.0,0.001319,0.0,0.000000
1804,132139901,1.0,0.001319,0.0,0.000000
1808,132721392,1.0,0.001319,0.0,0.000000
1809,135697042,1.0,0.001319,0.0,0.000000


## vv

In [12]:
vv_mainz_raw = pd.read_csv("../raw_data/vv_mainz.csv", sep=";", index_col=[0])
vv_mainz_event_count = 213 # vv_mainz_raw['ID'].nunique()
vv_mainz_raw = vv_mainz_raw.query("wrong_mention.isna() and GND_ID != 'no_ID' and GND_ID.notna()")

vv_mainz = pd.DataFrame()
for gnd_id in vv_mainz_raw['GND_ID'].unique():
    this_id_event_count = vv_mainz_raw.query("GND_ID==@gnd_id")['ID'].nunique()
    vv_mainz.at[gnd_id, 'event_count_vv_mainz'] = this_id_event_count
    vv_mainz.at[gnd_id, 'event_count_rel_vv_mainz'] = this_id_event_count/vv_mainz_event_count
vv_mainz = vv_mainz.reset_index(names='GND')  

In [13]:
vv_stuttgart_raw = pd.read_csv("../raw_data/vv_stuttgart.csv", sep=";", index_col=[0])
vv_stuttgart_event_count = 491 # vv_stuttgart_raw['ID'].nunique()
vv_stuttgart_raw = vv_stuttgart_raw.query("wrong_mention.isna() and GND_ID != 'no_ID' and GND_ID.notna()")

vv_stuttgart = pd.DataFrame()
for gnd_id in vv_stuttgart_raw['GND_ID'].unique():
    this_id_event_count = vv_stuttgart_raw.query("GND_ID==@gnd_id")['ID'].nunique()
    vv_stuttgart.at[gnd_id, 'event_count_vv_stuttgart'] = this_id_event_count
    vv_stuttgart.at[gnd_id, 'event_count_rel_vv_stuttgart'] = this_id_event_count/vv_stuttgart_event_count
vv_stuttgart = vv_stuttgart.reset_index(names='GND')  

In [14]:
vv_wien_raw = pd.read_csv("../raw_data/vv_wien.csv", sep=";", index_col=[0])
vv_wien_event_count = 203 # vv_wien_raw['ID'].nunique()
vv_wien_raw = vv_wien_raw.query("wrong_mention.isna() and GND_ID != 'no_ID' and GND_ID.notna()")

vv_wien = pd.DataFrame()
for gnd_id in vv_wien_raw['GND_ID'].unique():
    this_id_event_count = vv_wien_raw.query("GND_ID==@gnd_id")['ID'].nunique()
    vv_wien.at[gnd_id, 'event_count_vv_wien'] = this_id_event_count
    vv_wien.at[gnd_id, 'event_count_rel_vv_wien'] = this_id_event_count/vv_wien_event_count
vv_wien = vv_wien.reset_index(names='GND')  

In [15]:
vv_merged = pd.merge(
        vv_mainz,
        vv_stuttgart,
        on='GND',
        how='outer',
    ).merge(
        vv_wien,
        on='GND',
        how='outer',
    )
vv_merged = vv_merged.fillna(0)
vv_merged = vv_merged.query("GND.str.contains('-') == False and GND.str.contains(' ') == False")
vv_merged = vv_merged.reset_index(drop=True)

In [16]:
vv_merged.sort_values(by='event_count_vv_stuttgart', ascending=False)

Unnamed: 0,GND,event_count_vv_mainz,event_count_rel_vv_mainz,event_count_vv_stuttgart,event_count_rel_vv_stuttgart,event_count_vv_wien,event_count_rel_vv_wien
319,118540238,16.0,0.075117,36.0,0.073320,12.0,0.059113
520,118607626,8.0,0.037559,22.0,0.044807,5.0,0.024631
391,118559230,4.0,0.018779,18.0,0.036660,12.0,0.059113
408,118563076,7.0,0.032864,14.0,0.028513,5.0,0.024631
361,118552465,5.0,0.023474,14.0,0.028513,5.0,0.024631
...,...,...,...,...,...,...,...
1015,136500757,0.0,0.000000,0.0,0.000000,2.0,0.009852
581,118627724,0.0,0.000000,0.0,0.000000,1.0,0.004926
582,118628097,2.0,0.009390,0.0,0.000000,0.0,0.000000
585,118629867,7.0,0.032864,0.0,0.000000,0.0,0.000000


## lexika

In [17]:
killy = pd.read_csv("../raw_data/killy.csv", sep=";")
killy = killy[['GND', 'length']]
killy = killy.query("GND.notna() and length.notna()")
killy = killy.rename(columns={'length':'killy_length'})
killy = killy.groupby('GND')['killy_length'].sum().to_frame().reset_index() # handle 116720964

In [18]:
killy.sort_values(by='killy_length', ascending=False)

Unnamed: 0,GND,killy_length
2760,118540238,10344.0
3532,118607626,8717.0
2497,118514768,7449.0
2935,118551981,7248.0
3695,11862220X,7172.0
...,...,...
6774,127465235,175.0
4983,118833340,174.0
589,1043512810,169.0
1831,116922842,165.0


## editions

In [19]:
reclam = pd.read_csv("../raw_data/reclam.csv")
reclam = expand_multiple_ids(reclam, column='GND_author')
reclam = reclam.query("GND_author.notna()")

In [20]:
sachgruppen = reclam['sachgruppen'].tolist()
sachgruppen = [item for sublist in [str(x).split(" + ") for x in sachgruppen] for item in sublist]
pd.Series(sachgruppen).value_counts().index.tolist()

['59 Belletristik',
 'B Belletristik',
 '830 Deutsche Literatur',
 '10 Philosophie',
 '100 Philosophie',
 '63 Geschichte und Historische Hilfswissenschaften',
 '53 Deutsche Sprach- und Literaturwissenschaft',
 'nan',
 'S Schulbücher',
 '820 Englische Literatur',
 '420 Englisch',
 '49 Theater, Tanz, Film',
 '840 Französische Literatur',
 '08a Schöne Literatur',
 '440 Französisch, romanische Sprachen allgemein',
 '48 Musik',
 '780 Musik',
 '870 Lateinische Literatur',
 '12 Christliche Religion',
 '16 Politik',
 '51 Allgemeine und Vergleichende Sprach- und Literaturwissenschaft',
 '56 Klassische Sprach- und Literaturwissenschaft',
 '810 Englische Literatur Amerikas',
 '930 Alte Geschichte, Archäologie',
 '880 Griechische Literatur',
 '891.8 Slawische Literatur',
 '940 Geschichte Europas',
 '320 Politik',
 '230 Theologie, Christentum',
 '300 Sozialwissenschaften, Soziologie, Anthropologie',
 '19 Recht',
 '720 Architektur',
 '23 Schulbücher',
 '01 Wissenschaft und Kultur allgemein',
 '13 Al

In [21]:
# filter already at this point, because we want to calc the appropriate reclam_count
# but do not filter sachgruppen
relevant_sachgruppen = [
    '59 Belletristik', 'B Belletristik', '830 Deutsche Literatur',
    '08a Schöne Literatur', '2303 Belletristik', '800 Literatur, Rhetorik, Literaturwissenschaft',
    'K Kinder- und Jugendliteratur', '08 Comics, Cartoons, Karikaturen',
    '741.5 Comics, Cartoons, Karikaturen'
]

reclam = (
    reclam
    .query("year >= 2000")
    # .loc[reclam['sachgruppen'].str.contains('|'.join(relevant_sachgruppen), na=False)]
)

In [22]:
reclam = reclam['GND_author'].value_counts().to_frame().reset_index()
reclam = reclam.rename(columns={'GND_author' : 'GND', 'count' : 'reclam_count'})

In [23]:
reclam

Unnamed: 0,GND,reclam_count
0,118613723,75
1,118540238,63
2,118607626,51
3,118520814,44
4,128451661,41
...,...,...
1459,131495879,1
1460,124300812,1
1461,115738290,1
1462,120283743,1


## bibliographies

In [24]:
bdsl = pd.read_csv("../raw_data/bdsl.csv", index_col=[0])
bdsl = bdsl.rename(columns={'searchphrase' : 'BDSL_searchphrase'})

In [25]:
bdsl.sort_values(by='BDSL_hits_2000_all', ascending=False).head(200).values

array([['116804521', 'Friedrich Friedrich', 18253.0],
       ['137325355', 'Heinrich', 12137.0],
       ['118540238', 'Johann Wolfgang von Goethe', 10227.0],
       ['1016491166', 'L. A. Paul', 8242.0],
       ['118559230', 'Franz Kafka', 4864.0],
       ['118577166', 'Thomas Mann', 4798.0],
       ['118607626', 'Friedrich Schiller', 3890.0],
       ['118563076', 'Heinrich von Kleist', 3334.0],
       ['118587943', 'Friedrich Nietzsche', 3296.0],
       ['118514768', 'Bertolt Brecht', 3179.0],
       ['118558749', 'C. G. Jung', 3155.0],
       ['118552465', 'E. T. A. Hoffmann', 3016.0],
       ['118509039', 'Walter Benjamin', 2951.0],
       ['1178632180', 'O.F. Berg', 2728.0],
       ['118548018', 'Heinrich Heine', 2460.0],
       ['1112471774', 'Albrecht', 2456.0],
       ['118534262', 'Theodor Fontane', 2380.0],
       ['118601024', 'Rainer Maria Rilke', 2322.0],
       ['118519859', 'Paul Celan', 2287.0],
       ['118551981', 'Friedrich Hölderlin', 2137.0],
       ['118818651', 'Ka

In [26]:
wrong_result_authors={
    '116804521' : 0, # Friedrich Friedrich
    '137325355' : 0, # Heinrich
    '1016491166' : 0, # L. A. Paul
    '118558749' : 0, # C. G. Jung
    '1178632180' : 0, # O.F. Berg
    '1112471774' : 0, # Albrecht
    '118581783' : 0, # E. Y. Meyer
    '120404389' : 0, # P. D. James
    '119403412' : 0, #  E. A. Richter
    '128822724' : 0, # D. B. C. Pierre
    '118534408' : 0, # E. M. Forster
    '119545780' : 0, # F. Zell
    '118934163' : 0, # Bai Li
    '118500708' : 0, # H. G. Adler
    '118841386' : 0, # Wu Sun
    '116067624' : 0, # H. W. Katz
    '102840466' : 0, # Niemand
    '103127178' : 0, # Rost
    '12332243X' : 0, # Grosser, J. F. G.
    '11924246X' : 0, # Said (wrong one)
    '118534424' : 0, # Georg Forster (wrong one)
    '122541332' : 0, # Richard Wagner (wrong one)
}

for gnd_id, true_hits in wrong_result_authors.items():
    index_to_fix = bdsl.query("GND == @gnd_id").index
    bdsl.loc[index_to_fix, 'BDSL_hits_2000_all'] = true_hits

In [27]:
bdsl.sort_values(by='BDSL_hits_2000_all', ascending=False).head(10)

Unnamed: 0,GND,BDSL_searchphrase,BDSL_hits_2000_all
3326,118540238,Johann Wolfgang von Goethe,10227.0
3636,118559230,Franz Kafka,4864.0
3888,118577166,Thomas Mann,4798.0
4296,118607626,Friedrich Schiller,3890.0
3696,118563076,Heinrich von Kleist,3334.0
4050,118587943,Friedrich Nietzsche,3296.0
2954,118514768,Bertolt Brecht,3179.0
3544,118552465,E. T. A. Hoffmann,3016.0
2880,118509039,Walter Benjamin,2951.0
3463,118548018,Heinrich Heine,2460.0


## staatsexamen

In [28]:
staatsexamen = pd.read_csv("../raw_data/staatsexamen.csv", encoding='latin1', sep=";")
staatsexamen = staatsexamen.query("GND.notna()")
staatsexamen['GND'] = staatsexamen['GND'].apply(lambda x : x.split('/')[-1])
staatsexamen = staatsexamen[['GND', 'count_exams']]
staatsexamen = staatsexamen.rename(columns={'count_exams' : 'staatsexamen_count'})
staatsexamen['staatsexamen_count'] = staatsexamen['staatsexamen_count'].apply(lambda x : x.replace(",", ".")).astype(float)
staatsexamen = staatsexamen.groupby('GND')['staatsexamen_count'].sum().to_frame().reset_index() # deal with 118547453 Hebel

In [29]:
staatsexamen

Unnamed: 0,GND,staatsexamen_count
0,118505602,2.0
1,118507931,1.0
2,118509047,8.0
3,118509861,1.0
4,118510665,1.0
...,...,...
102,119390388,1.0
103,119527405,1.0
104,119539055,1.0
105,120455293,1.0


## schullist

In [30]:
schullist_files = [x for x in files if 'schullist' in x]

In [31]:
# prepare individual schullists
schullists = {}

for this_file in schullist_files:
    schullist_name = this_file.split("/")[-1].split(".")[0]
    schullist_df = pd.read_csv(this_file)
    schullist_df = schullist_df.query("GND.notna()")
    schullist_df['GND'] = schullist_df['GND'].astype(str)
    schullist_df['GND'] = schullist_df['GND'].apply(lambda x : x.split('/')[-1])
    schullist_df = expand_multiple_ids(schullist_df)
    schullist_df = schullist_df['GND'].value_counts().to_frame().reset_index()
    schullist_df = schullist_df.rename(columns={'count' : f'{schullist_name}_count'})
    schullist_df[f'{schullist_name}_share'] = schullist_df[f'{schullist_name}_count']/schullist_df[f'{schullist_name}_count'].sum()
    
    schullists[schullist_name] = schullist_df

In [32]:
# merge schullists
schullist_iter = iter(schullists.items())
first_name, schullist_merged = next(schullist_iter)

for schullist_name, schullist_df in schullist_iter:
    schullist_merged = pd.merge(
        schullist_merged,
        schullist_df,
        on='GND',
        how='outer',
    )
schullist_merged = schullist_merged.fillna(0).reset_index(drop=True)

In [33]:
schullist_merged.sort_values(by='schullist_bayern_share', ascending=False)

Unnamed: 0,GND,schullist_nordrheinwestfalen_count,schullist_nordrheinwestfalen_share,schullist_bayern_count,schullist_bayern_share,schullist_sachsenanhalt_count,schullist_sachsenanhalt_share,schullist_hamburg_count,schullist_hamburg_share,schullist_sachsen_count,...,schullist_saarland_count,schullist_saarland_share,schullist_hessen_count,schullist_hessen_share,schullist_bremen_count,schullist_bremen_share,schullist_brandenburg_count,schullist_brandenburg_share,schullist_badenwuerttemberg_count,schullist_badenwuerttemberg_share
119,118527908,0.0,0.0,6.0,0.023810,5.0,0.013263,0.0,0.0,3.0,...,4.0,0.051948,4.0,0.013746,0.0,0.000000,0.0,0.000000,3.0,0.00813
291,118607626,0.0,0.0,5.0,0.019841,7.0,0.018568,0.0,0.0,2.0,...,4.0,0.051948,6.0,0.020619,1.0,0.033333,3.0,0.115385,8.0,0.02168
234,118571680,0.0,0.0,4.0,0.015873,8.0,0.021220,0.0,0.0,1.0,...,0.0,0.000000,3.0,0.010309,0.0,0.000000,0.0,0.000000,2.0,0.00542
148,118540238,1.0,0.2,4.0,0.015873,5.0,0.013263,0.0,0.0,2.0,...,3.0,0.038961,9.0,0.030928,3.0,0.100000,1.0,0.038462,8.0,0.02168
312,118618725,0.0,0.0,4.0,0.015873,2.0,0.005305,0.0,0.0,0.0,...,1.0,0.012987,1.0,0.003436,0.0,0.000000,0.0,0.000000,2.0,0.00542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,1026215927,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.00000
4,1026450284,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,1.0,0.012987,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.00000
5,103127362X,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.00000
6,1038169879,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.00000


## abi

In [34]:
abi = pd.read_csv("../raw_data/abi.csv")
abi = abi[['GND', 'mentions']]
abi = abi.rename(columns={'mentions' : 'abi_mentions'})

In [35]:
abi

Unnamed: 0,GND,abi_mentions
0,120013428,1
1,119520621,1
2,118599631,1
3,118559427,1
4,118520040,1
...,...,...
72,118534262,33
73,118563076,37
74,118607626,41
75,118559230,47


## wiki

In [36]:
wiki = pd.read_csv("../raw_data/wiki.csv", index_col=[0])
wiki.columns = ['wiki_'+x if x != 'GND' else x for x in wiki.columns]

fill_columns = ['wiki_length_in_words', 'wiki_number_of_sitelinks', 'wiki_number_of_pageviews_last_5_years', 'wiki_number_of_in_links']
wiki[fill_columns] = wiki[fill_columns].fillna(0)

In [37]:
wiki

Unnamed: 0,wiki_page_found,wiki_page_url,wiki_page_title,wiki_item_id,wiki_length_in_words,wiki_number_of_sitelinks,wiki_number_of_pageviews_last_5_years,wiki_number_of_in_links,GND
0,True,https://de.wikipedia.org/wiki/Johann_Altenstaig,Johann Altenstaig,Q21544072,692.0,1.0,641.0,15.0,100010350
1,True,https://de.wikipedia.org/wiki/Andreas_Althamer,Andreas Althamer,Q96832,796.0,7.0,2129.0,38.0,100010423
2,True,https://de.wikipedia.org/wiki/Anna_Sophia_von_...,Anna Sophia von Hessen-Darmstadt,Q290123,782.0,14.0,2682.0,59.0,100011780
3,True,https://de.wikipedia.org/wiki/Paul_Bachmann_%2...,Paul Bachmann (Abt),Q2058772,441.0,1.0,643.0,22.0,100013945
4,True,https://de.wikipedia.org/wiki/Ignaz_Ferdinand_...,Ignaz Ferdinand Arnold,Q999874,672.0,3.0,1482.0,24.0,100017045
...,...,...,...,...,...,...,...,...,...
10381,True,https://de.wikipedia.org/wiki/Michael_D._Green,Michael D. Green,Q112512246,256.0,1.0,308.0,13.0,174039921
10382,False,,,,0.0,0.0,0.0,0.0,174061595
10383,False,,,,0.0,0.0,0.0,0.0,17406165X
10384,True,https://de.wikipedia.org/wiki/Irma_Emmrich,Irma Emmrich,Q56282814,508.0,1.0,1164.0,18.0,174166087


## kanonspiel

In [38]:
kanonspiel = pd.read_csv("../raw_data/kanonspiel.csv")
kanonspiel = kanonspiel.groupby('GND')['points'].sum().to_frame().reset_index()
kanonspiel = kanonspiel.rename(columns={'points' : 'kanonspiel_points'})

In [39]:
kanonspiel.sort_values(by='kanonspiel_points', ascending=False)

Unnamed: 0,GND,kanonspiel_points
26,118540238,4961
55,118577166,3444
47,118563076,3305
10,118514768,3225
65,118607626,3202
...,...,...
38,118552759,259
22,118530534,253
6,118510789,248
81,118706187,244


## segebrecht

In [40]:
segebrecht = pd.read_csv("../raw_data/segebrecht.csv", sep=";")
segebrecht = expand_multiple_ids(segebrecht)
segebrecht = segebrecht.query("GND.notna()")
segebrecht = segebrecht.query("type.isna()") # exclude anthologies
segebrecht = segebrecht['GND'].value_counts().to_frame().reset_index()
segebrecht = segebrecht.rename(columns={'count' : 'segebrecht_count'})

In [41]:
segebrecht

Unnamed: 0,GND,segebrecht_count
0,118540238,23
1,118607626,15
2,118534262,10
3,118514768,9
4,118577166,9
...,...,...
346,118636596,1
347,118599518,1
348,118625306,1
349,118696696,1


## vv_hein

In [42]:
vv_hein = pd.read_csv("../raw_data/vv_hein.csv", sep=";")
vv_hein = vv_hein.rename(columns={'GND-Nummer' : 'GND', 'Gesamt_count' : 'vv_hein_count'})
vv_hein = vv_hein.query("GND.notna()")
vv_hein['vv_hein_count'] = vv_hein['vv_hein_count'].apply(lambda x : x.split(' ')[0])
vv_hein = vv_hein[['GND', 'vv_hein_count']]

In [43]:
vv_hein

Unnamed: 0,GND,vv_hein_count
0,118542273,17
1,118543032,17
2,118590111,4
4,118630369,2
5,116551968,1
...,...,...
173,11850391X,1
174,118829211,1
175,118507397,1
176,118508091,1


## gnd

In [44]:
def convert_to_year(date):
    if pd.isna(date):
        return date
    else:
        date_parts = date.split('-')
        if date_parts[0] == '':
            bc = True
            date = date_parts[1]
        else:
            bc = False
            date = date_parts[0]
        date = date.replace('X', '0')
        match = re.findall(r"\d+", date)
        if match:
            date = int(match[0])
            if bc:
                date = -date
            return date
        else:
            print(date)
            return None

In [45]:
gnd = pd.read_csv("../raw_data/gnd.csv", index_col=[0])
gnd['GND_birth'] = gnd['GND_birth'].apply(convert_to_year)
gnd['GND_death'] = gnd['GND_death'].apply(convert_to_year)

In [46]:
gnd

Unnamed: 0,GND,GND_name,GND_gender,GND_birth,GND_death,GND_occupation,GND_country
0,100010350,"Altenstaig, Johann",Männlich,1480.0,1523.0,Humanist + Humanist + Theologe + Priester,Deutschland
1,100010423,"Althamer, Andreas",Männlich,1500.0,1539.0,Evangelischer Theologe + Reformator + Humanist...,Deutschland
2,100011780,"Anna Sophie, Hessen-Darmstadt, Landgräfin",Weiblich,1638.0,1683.0,Äbtissin,Deutschland
3,100013945,"Bachmann, Paul",Unbekannt,,1538.0,Mönch + Abt,Deutschland
4,100017045,"Arnold, Ignaz Ferdinand",Männlich,1774.0,1812.0,Schriftsteller + Jurist + Musikwissenschaftler...,Deutschland
...,...,...,...,...,...,...,...
10381,174039921,"Green, Michael D.",Unbekannt,1941.0,,Historiker,USA
10382,174061595,"Kröger, Wolfgang",Unbekannt,1947.0,2005.0,,
10383,17406165X,"Großmann, Dieter",Männlich,1921.0,1997.0,Kunsthistoriker + Archäologe,Polen + Deutschland
10384,174166087,"Emmrich, Irma",Weiblich,1919.0,2018.0,Kunsthistorikerin,Deutschland + Deutschland (DDR)


# merge

In [47]:
data = pd.merge(
    unilist_merged,
    litges_merged,
    on='GND',
    how='outer'
).merge(
    vv_merged,
    on='GND',
    how='outer'
).merge(
    killy,
    on='GND',
    how='outer'
).merge(
    reclam,
    on='GND',
    how='outer'
).merge(
    staatsexamen,
    on='GND',
    how='outer'
).merge(
    schullist_merged,
    on='GND',
    how='outer'
).merge(
    abi,
    on='GND',
    how='outer'
).merge(
    kanonspiel,
    on='GND',
    how='outer'
).merge(
    segebrecht,
    on='GND',
    how='outer'
).merge(
    vv_hein,
    on='GND',
    how='outer'
).fillna(0).reset_index(drop=True)

In [48]:
data = data.merge(
    gnd,
    on='GND',
    how='left'
)

data = data.merge(
    wiki,
    on='GND',
    how='left'
)

data = data.merge(
    bdsl,
    on='GND',
    how='left'
)

In [49]:
data.sort_values(by='page_count_litges_brenner', ascending=False)

Unnamed: 0,GND,prob_to_read_sum_unilist_heidelberg_2017,prob_to_read_share_unilist_heidelberg_2017,prob_to_read_sum_unilist_aachen_201718,prob_to_read_share_unilist_aachen_201718,prob_to_read_sum_unilist_goettingen,prob_to_read_share_unilist_goettingen,prob_to_read_sum_unilist_luxemburg,prob_to_read_share_unilist_luxemburg,prob_to_read_sum_unilist_graz_2021,...,wiki_page_found,wiki_page_url,wiki_page_title,wiki_item_id,wiki_length_in_words,wiki_number_of_sitelinks,wiki_number_of_pageviews_last_5_years,wiki_number_of_in_links,BDSL_searchphrase,BDSL_hits_2000_all
3457,118540238,5.727273,0.086515,2.492755,0.040582,6.000000,0.093750,7.0,0.097222,4.0,...,True,https://de.wikipedia.org/wiki/Johann_Wolfgang_...,Johann Wolfgang von Goethe,Q5879,36790.0,272.0,3648043.0,11289.0,Johann Wolfgang von Goethe,10227.0
4040,118577166,1.194805,0.018048,0.176000,0.002865,3.000000,0.046875,1.0,0.013889,1.0,...,True,https://de.wikipedia.org/wiki/Thomas_Mann,Thomas Mann,Q37030,15597.0,160.0,2329343.0,3879.0,Thomas Mann,4798.0
3072,118514768,1.728571,0.026111,0.691000,0.011249,1.000000,0.015625,1.0,0.013889,3.0,...,True,https://de.wikipedia.org/wiki/Bertolt_Brecht,Bertolt Brecht,Q38757,18391.0,152.0,2315455.0,5041.0,Bertolt Brecht,3179.0
4468,118607626,3.636364,0.054930,2.216484,0.036084,3.333333,0.052083,3.0,0.041667,2.0,...,True,https://de.wikipedia.org/wiki/Friedrich_Schiller,Friedrich Schiller,Q22670,13856.0,180.0,2042583.0,6329.0,Friedrich Schiller,3890.0
3481,118541579,0.142857,0.002158,1.000000,0.016280,1.000000,0.015625,1.0,0.013889,1.0,...,True,https://de.wikipedia.org/wiki/G%C3%BCnter_Grass,Günter Grass,Q6538,11937.0,137.0,984546.0,1685.0,Günter Grass,1685.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9591,173023126,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,False,,,,0.0,0.0,0.0,0.0,Günther Distelrath,0.0
9592,173054072,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,False,,,,0.0,0.0,0.0,0.0,Marc Olivier Baruch,0.0
9593,173090257,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,False,,,,0.0,0.0,0.0,0.0,Bill Brown,0.0
9594,173150594,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,False,,,,0.0,0.0,0.0,0.0,Uta Plate,0.0


In [50]:
data['GND'].value_counts()

GND
189568798    1
100010350    1
100010423    1
100011780    1
100013945    1
            ..
100092381    1
100099459    1
100101844    1
100110274    1
10011153X    1
Name: count, Length: 9628, dtype: int64

In [51]:
data.query("GND.str.contains(r'\\+')")

Unnamed: 0,GND,prob_to_read_sum_unilist_heidelberg_2017,prob_to_read_share_unilist_heidelberg_2017,prob_to_read_sum_unilist_aachen_201718,prob_to_read_share_unilist_aachen_201718,prob_to_read_sum_unilist_goettingen,prob_to_read_share_unilist_goettingen,prob_to_read_sum_unilist_luxemburg,prob_to_read_share_unilist_luxemburg,prob_to_read_sum_unilist_graz_2021,...,wiki_page_found,wiki_page_url,wiki_page_title,wiki_item_id,wiki_length_in_words,wiki_number_of_sitelinks,wiki_number_of_pageviews_last_5_years,wiki_number_of_in_links,BDSL_searchphrase,BDSL_hits_2000_all


In [52]:
data.query("GND.str.contains(r'n')")

Unnamed: 0,GND,prob_to_read_sum_unilist_heidelberg_2017,prob_to_read_share_unilist_heidelberg_2017,prob_to_read_sum_unilist_aachen_201718,prob_to_read_share_unilist_aachen_201718,prob_to_read_sum_unilist_goettingen,prob_to_read_share_unilist_goettingen,prob_to_read_sum_unilist_luxemburg,prob_to_read_share_unilist_luxemburg,prob_to_read_sum_unilist_graz_2021,...,wiki_page_found,wiki_page_url,wiki_page_title,wiki_item_id,wiki_length_in_words,wiki_number_of_sitelinks,wiki_number_of_pageviews_last_5_years,wiki_number_of_in_links,BDSL_searchphrase,BDSL_hits_2000_all


In [53]:
data.query("GND_name.isna()")

Unnamed: 0,GND,prob_to_read_sum_unilist_heidelberg_2017,prob_to_read_share_unilist_heidelberg_2017,prob_to_read_sum_unilist_aachen_201718,prob_to_read_share_unilist_aachen_201718,prob_to_read_sum_unilist_goettingen,prob_to_read_share_unilist_goettingen,prob_to_read_sum_unilist_luxemburg,prob_to_read_share_unilist_luxemburg,prob_to_read_sum_unilist_graz_2021,...,wiki_page_found,wiki_page_url,wiki_page_title,wiki_item_id,wiki_length_in_words,wiki_number_of_sitelinks,wiki_number_of_pageviews_last_5_years,wiki_number_of_in_links,BDSL_searchphrase,BDSL_hits_2000_all


In [54]:
data.query("wiki_page_found.isna()")

Unnamed: 0,GND,prob_to_read_sum_unilist_heidelberg_2017,prob_to_read_share_unilist_heidelberg_2017,prob_to_read_sum_unilist_aachen_201718,prob_to_read_share_unilist_aachen_201718,prob_to_read_sum_unilist_goettingen,prob_to_read_share_unilist_goettingen,prob_to_read_sum_unilist_luxemburg,prob_to_read_share_unilist_luxemburg,prob_to_read_sum_unilist_graz_2021,...,wiki_page_found,wiki_page_url,wiki_page_title,wiki_item_id,wiki_length_in_words,wiki_number_of_sitelinks,wiki_number_of_pageviews_last_5_years,wiki_number_of_in_links,BDSL_searchphrase,BDSL_hits_2000_all


In [55]:
data.query("BDSL_searchphrase.isna()")

Unnamed: 0,GND,prob_to_read_sum_unilist_heidelberg_2017,prob_to_read_share_unilist_heidelberg_2017,prob_to_read_sum_unilist_aachen_201718,prob_to_read_share_unilist_aachen_201718,prob_to_read_sum_unilist_goettingen,prob_to_read_share_unilist_goettingen,prob_to_read_sum_unilist_luxemburg,prob_to_read_share_unilist_luxemburg,prob_to_read_sum_unilist_graz_2021,...,wiki_page_found,wiki_page_url,wiki_page_title,wiki_item_id,wiki_length_in_words,wiki_number_of_sitelinks,wiki_number_of_pageviews_last_5_years,wiki_number_of_in_links,BDSL_searchphrase,BDSL_hits_2000_all


In [56]:
data.query("GND_birth >= 1994")

Unnamed: 0,GND,prob_to_read_sum_unilist_heidelberg_2017,prob_to_read_share_unilist_heidelberg_2017,prob_to_read_sum_unilist_aachen_201718,prob_to_read_share_unilist_aachen_201718,prob_to_read_sum_unilist_goettingen,prob_to_read_share_unilist_goettingen,prob_to_read_sum_unilist_luxemburg,prob_to_read_share_unilist_luxemburg,prob_to_read_sum_unilist_graz_2021,...,wiki_page_found,wiki_page_url,wiki_page_title,wiki_item_id,wiki_length_in_words,wiki_number_of_sitelinks,wiki_number_of_pageviews_last_5_years,wiki_number_of_in_links,BDSL_searchphrase,BDSL_hits_2000_all
946,1072059940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,https://de.wikipedia.org/wiki/Elias_Hirschl,Elias Hirschl,Q28984731,1731.0,2.0,48402.0,34.0,Elias Hirschl,6.0
1288,1147517223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,,,,0.0,0.0,0.0,0.0,Kristin Höller,1.0
2848,1182074820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,https://de.wikipedia.org/wiki/Greta_Thunberg,Greta Thunberg,Q56434717,13918.0,145.0,2459970.0,428.0,Greta Thunberg,2.0
9102,1307416098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,,,,0.0,0.0,0.0,0.0,Asin Andkohiy,1.0
9195,1324565837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,,,,0.0,0.0,0.0,0.0,Ibrahim Bah,0.0


In [57]:
# gnd_occupations = data['GND_occupation'].dropna().tolist()
# gnd_occupations = [x.split(' + ') for x in gnd_occupations]
# gnd_occupations = [item for sublist in gnd_occupations for item in sublist]
# pd.Series(gnd_occupations).value_counts().to_csv("gnd_occupations.csv")

In [58]:
data.to_csv("../data/data.csv")