# import

In [1]:
import pandas as pd
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import plotly.express as px

In [2]:
# Load the raw dataset from the relative path below; the first CSV column
# becomes the row index.
# NOTE(review): index_col is passed as a one-element list rather than the
# scalar 0 -- keep it this way unless the two forms are confirmed to produce
# the same index (including its name) for this file.
data = pd.read_csv("../data/data.csv", index_col=[0])

In [3]:
def _columns_containing(*fragments):
    """Return the names of all columns of `data` that contain every fragment."""
    return [
        column
        for column in data.columns
        if all(fragment in column for fragment in fragments)
    ]


# Maps each primary indicator to the list of secondary-indicator columns
# it is built from.
# NOTE: the pattern-based entries read `data.columns` at the moment this cell
# runs. The later scoring cells add columns named '<secondary>_zscore' and
# '<secondary>_rank', which contain the same substrings, so run this cell
# before those columns exist.
academic_indicator_dict = dict(
    unilist=_columns_containing('prob_to_read_share'),
    litges=_columns_containing('page_count_rel_litges'),
    vv=_columns_containing('event_count_rel_vv'),
    lexica=['killy_length'],
    editions=['reclam_count'],
    bibliographies=['BDSL_hits_2000_all'],
)

other_indicator_dict = dict(
    staatsexamen=['staatsexamen_count'],
    schullist=_columns_containing('schullist_', 'share'),
    abi=['abi_mentions'],
    kanonspiel=['kanonspiel_points'],
    segebrecht=['segebrecht_count'],
    vv_hein=['vv_hein_count'],
    wiki=['wiki_length_in_words'],
)

# Academic indicators first, then the other ones (insertion order is kept).
all_indicator_dict = dict(academic_indicator_dict)
all_indicator_dict.update(other_indicator_dict)

# gnd filter

In [4]:
# Occupation labels to keep. In the filter cell these entries are joined
# with '|' and passed to `str.contains` on the 'GND_occupation' column.
relevant_occupations = [
    'Schriftsteller',
    'Schriftstellerin',
    'Dramatiker',
    'Lyriker',
    'Drehbuchautor',
    'Librettist',
    'Autor',
    'Lyrikerin',
    'Kirchenlieddichter',
    'Erzähler',
    'Drehbuchautorin',
    'Kinderbuchautor',
    'Dramatikerin',
    'Librettistin',
    'Liederdichter',
    'Kinderbuchautorin',
    'Autorin',
    'Jugendbuchautor',
    'Minnesänger',
    'Mundartschriftsteller',
    'Jugendbuchautorin',
    'Musikschriftsteller',
    'Romancier',
    'Reiseschriftsteller',
    'Meistersinger',
    'Kriminalschriftsteller',
    'Liedermacher',
    'Prosaist',
    'Kirchenlieddichterin',
    'Prosaistin',
    'Theaterdichter',
    'Heimatschriftsteller',
    'Erzählerin',
    'Spruchdichter',
    'Mundartschriftstellerin',
    'Romanschriftstellerin',
    'Liederdichterin',
    'Autorenlesung',
    'Musikschriftstellerin',
    'Liedermacherin',
    'Bilderbuchautor',
    'Stadtschreiberin <Literatur>',
    'Kriminalschriftstellerin',
    'Exilschriftsteller',
    'Comicautorin',
    'Bestsellerautorin',
    'Reiseschriftstellerin',
]

# Country labels to keep; joined with '|' and matched against the
# 'GND_country' column in the filter cell.
relevant_countries = ['Deutschland', 'Österreich', 'Schweiz']

In [5]:
# Keep only rows with GND_birth >= 1550 whose 'GND_occupation' and
# 'GND_country' values match at least one of the relevant entries.
# Rows with a missing occupation or country are dropped (na=False).
occupation_pattern = '|'.join(relevant_occupations)
country_pattern = '|'.join(relevant_countries)

data = (
    data
    .query("GND_birth >= 1550")
    # Callable indexers build each mask from the frame as filtered so far.
    # Referencing the outer, unfiltered `data` here would only work through
    # implicit index alignment of a longer boolean Series.
    .loc[lambda frame: frame['GND_occupation'].str.contains(occupation_pattern, na=False)]
    .loc[lambda frame: frame['GND_country'].str.contains(country_pattern, na=False)]
).copy()

# Number of rows left after filtering.
data.shape[0]

5270

# create scores, etc.

In [6]:
# (1) For every secondary indicator column, add a '<column>_zscore' column
#     (z-score of the raw values) and a '<column>_rank' column
#     (descending rank, ties share the lowest rank number).
for primary_indicator, secondary_indicators in all_indicator_dict.items():
    for secondary_indicator in secondary_indicators:
        raw_values = data[secondary_indicator]
        data[f'{secondary_indicator}_zscore'] = zscore(raw_values)
        data[f'{secondary_indicator}_rank'] = (
            raw_values
            .rank(method='min', ascending=False)
            .astype(int)
        )
    # Re-copy the frame after each primary indicator's columns were added.
    data = data.copy()

In [7]:
# (2) The score of a primary indicator is the row-wise mean of the z-scored
#     secondary indicators that belong to it.
for primary_indicator, secondary_indicators in all_indicator_dict.items():
    zscore_columns = [f'{name}_zscore' for name in secondary_indicators]
    data[primary_indicator] = data[zscore_columns].mean(axis=1)
data = data.copy()

In [8]:
# (3) Add '<primary>_zscore' and '<primary>_rank' columns for every primary
#     indicator score (descending rank, ties share the lowest rank number).
for primary_indicator, secondary_indicators in all_indicator_dict.items():
    primary_scores = data[primary_indicator]
    data[f'{primary_indicator}_zscore'] = zscore(primary_scores)
    data[f'{primary_indicator}_rank'] = (
        primary_scores
        .rank(method='min', ascending=False)
        .astype(int)
    )
data = data.copy()

In [9]:
# (4) create overall canonicity score by calculating mean of zscored primary indicators
#     then apply rank and scaling on overall canonicity score

# scaler = MinMaxScaler(feature_range=(1, 1000))

# data['canonicity_score_raw'] = data[[x+'_zscore' for x in academic_indicator_dict.keys()]].mean(axis=1)
# data['canonicity_score_rank'] = data['canonicity_score_raw'].rank(method='min', ascending=False).astype(int)
# data['canonicity_score_scaled'] = scaler.fit_transform(data[['canonicity_score_raw']]).round().astype(int)

# explore

In [10]:
# data.sort_values(by='canonicity_score_raw', ascending=False)[[
#     'GND', 'GND_name',
#     'canonicity_score_rank', 'canonicity_score_scaled', 'canonicity_score_raw',
# ]].head(20)

In [11]:
# First 20 rows ordered by `litges_rank` (best rank first), shown next to the
# ranks of the two individual litges page-count columns.
litges_view_columns = [
    'GND_name',
    'litges_rank',
    'page_count_rel_litges_beutin_rank',
    'page_count_rel_litges_brenner_rank',
]
data.sort_values('litges_rank')[litges_view_columns].head(20)

Unnamed: 0,GND_name,litges_rank,page_count_rel_litges_beutin_rank,page_count_rel_litges_brenner_rank
3317,"Goethe, Johann Wolfgang von",1,2,1
2945,"Brecht, Bertolt",2,1,3
4287,"Schiller, Friedrich",3,5,4
3879,"Mann, Thomas",4,13,2
3556,"Huchel, Peter",5,2,76
3809,"Lessing, Gotthold Ephraim",6,9,7
3454,"Heine, Heinrich",7,6,16
3340,"Grass, Günter",8,21,5
3791,"Lehmann, Wilhelm",9,4,116
4678,"Wolf, Christa",10,7,11


In [12]:
# First 20 rows ordered by `vv_rank` (best rank first), shown next to the
# ranks of the individual event-count columns and the `vv_hein` rank.
vv_view_columns = [
    'GND_name',
    'vv_rank',
    'event_count_rel_vv_stuttgart_rank',
    'event_count_rel_vv_mainz_rank',
    'event_count_rel_vv_wien_rank',
    'vv_hein_rank',
]
data.sort_values('vv_rank')[vv_view_columns].head(20)

Unnamed: 0,GND_name,vv_rank,event_count_rel_vv_stuttgart_rank,event_count_rel_vv_mainz_rank,event_count_rel_vv_wien_rank,vv_hein_rank
3317,"Goethe, Johann Wolfgang von",1,1,1,2,1
2945,"Brecht, Bertolt",2,6,2,2,5
4287,"Schiller, Friedrich",3,2,3,13,3
3627,"Kafka, Franz",4,3,15,2,9
3687,"Kleist, Heinrich von",5,4,5,13,7
3535,"Hoffmann, E. T. A.",6,4,10,13,15
3809,"Lessing, Gotthold Ephraim",7,6,15,13,4
2871,"Benjamin, Walter",8,25,10,6,151
6387,"Jelinek, Elfriede",9,123,74,1,151
4200,"Rilke, Rainer Maria",10,8,34,8,17
