In [1]:
import pandas as pd
import plotly.express as px
from tqdm import tqdm
import numpy as np
import re
from scipy.spatial.distance import jensenshannon
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from scipy.stats import kendalltau
import ast

# Read in the data

In [3]:
lvs = pd.read_csv("../data/lvs.csv", sep=";", encoding="utf-8", index_col=0)

# Transform column values back into lists
lvs["NER_GNDs_Inhalt"] = lvs["NER_GNDs_Inhalt"].apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
lvs["NER_GNDs_Titel"] = lvs["NER_GNDs_Inhalt"].apply(lambda x: ast.literal_eval(x) if type(x) == str else x)

# Clean columns (Quotation marks inside NEs cause problems when converting string to list)
for x in ["Titel", "Inhalt"]:
    # alle Anführungszeichen entfernen
    lvs[f"NER_Pers_{x}_lemm"] = [re.sub("'", "", x) if type(x) == str else x for x in lvs[f"NER_Pers_{x}_lemm"]]
    lvs[f"NER_Pers_{x}_lemm"] = [re.sub("\"", "", x) if type(x) == str else x for x in lvs[f"NER_Pers_{x}_lemm"]]
    # vor jedem komma ein '
    lvs[f"NER_Pers_{x}_lemm"] = [re.sub(",", "',", x) if type(x) == str else x for x in lvs[f"NER_Pers_{x}_lemm"]]
    # nach jedem komma ein '
    lvs[f"NER_Pers_{x}_lemm"] = [re.sub(",", ", '", x) if type(x) == str else x for x in lvs[f"NER_Pers_{x}_lemm"]]
    # vor jeder klammer ein '
    lvs[f"NER_Pers_{x}_lemm"] = [re.sub("\]", "']", x) if type(x) == str else x for x in lvs[f"NER_Pers_{x}_lemm"]]
    # nach jeder Klammer ein '
    lvs[f"NER_Pers_{x}_lemm"] = [re.sub("\[", "['", x) if type(x) == str else x for x in lvs[f"NER_Pers_{x}_lemm"]]
    # alle doppelanführungszeichen entfernen
    lvs[f"NER_Pers_{x}_lemm"] = [re.sub("'+", "'", x) if type(x) == str else x for x in lvs[f"NER_Pers_{x}_lemm"]]


lvs["NER_Pers_Inhalt_lemm"] = lvs["NER_Pers_Inhalt_lemm"].apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
lvs["NER_Pers_Titel_lemm"] = lvs["NER_Pers_Titel_lemm"].apply(lambda x: ast.literal_eval(x) if type(x) == str else x)

# Constrain to time period from winter semester 18/19 until winter semester 23/24
lvs = lvs[lvs["Intervall_Start"]>"2018-08"]

people = pd.read_csv("../data/people.csv", sep=";", encoding="utf-8", index_col=0)



In [5]:
# Select only writers

people = people[~people["Beruf oder Beschäftigung"].isna()]
authors1 = people[people["Beruf oder Beschäftigung"].str.contains(r"Schriftsteller", na=True)]
authors2 = people[people["Beruf oder Beschäftigung"].str.contains(r"Lyriker", na=True)]
authors3 = people[people["Beruf oder Beschäftigung"].str.contains(r"Dramatiker", na=True)]
authors4 = people[people["Beruf oder Beschäftigung"].str.contains(r"Erzähler", na=True)]

authors = pd.concat([authors1, authors2, authors3, authors4])
authors = authors.drop_duplicates(subset=["GND-Nummer"])

# Compute number of mentions of people/authors per uni

In [7]:
import pandas as pd

# Sample data setup (replace with your actual data)
# lvs = pd.DataFrame(...)
# authors = pd.DataFrame(...)

# List of universities
unis = ["wien", "graz", "basel", "chemnitz", "erfurt", "halle", "mainz", "marburg", "stuttgart"]

# Filter GNDs based on authors["GND-Nummer"]
valid_gnds = set(authors['GND-Nummer'])

# Explode the columns and compute counts per university
result = (
    lvs.explode("NER_Pers_Inhalt_lemm")
    .groupby('Universität')['NER_Pers_Inhalt_lemm']
    .agg(['size', lambda x: x.nunique()])
    .reset_index()
)
result.columns = ['University', 'Number of identified NEs (total)', 'Unique identified NEs']

# Explode the NER_GNDs_Inhalt column and compute counts per university
result_gnds = (
    lvs.explode("NER_GNDs_Inhalt")
    .groupby('Universität')['NER_GNDs_Inhalt']
    .agg(['size', lambda x: x.nunique(), lambda x: len(set(x) & valid_gnds)])
    .reset_index()
)
result_gnds.columns = ['University', 'Number of identified GNDs (total)', 'Unique identified GNDs', 'Unique identified (authors)']

# Compute the number of author GNDs per university
lvs_exploded = lvs.explode("NER_GNDs_Inhalt")
author_gnds_count = (
    lvs_exploded[lvs_exploded["NER_GNDs_Inhalt"].isin(valid_gnds)]
    .groupby('Universität').size()
    .reindex(unis, fill_value=0)
    .reset_index()
)
author_gnds_count.columns = ['University', 'Number of author GNDs']

# Merge all results
result = result.merge(result_gnds, on='University')
result = result.merge(author_gnds_count, on='University')

# Rename the columns for clarity
result = result.rename(columns={
    'Unique identified NEs': 'Unique identified NEs',
    'Unique identified GNDs': 'Unique identified GNDs',
    'Unique identified (authors)': 'Unique identified GNDs (authors)'
})

result

Unnamed: 0,University,Number of identified NEs (total),Unique identified NEs,Number of identified GNDs (total),Unique identified GNDs,Unique identified GNDs (authors),Number of author GNDs
0,basel,1537,891,1283,504,313,854
1,chemnitz,419,200,325,122,87,197
2,erfurt,457,213,383,138,86,195
3,graz,762,381,684,250,203,374
4,halle,3807,1826,3285,1070,528,1918
5,mainz,687,286,628,182,133,383
6,marburg,2684,1611,2087,901,401,1081
7,stuttgart,1881,1010,1694,615,374,1039
8,wien,1149,671,1010,443,309,707


In [12]:
#result_sorted = result.sort_values(by="Unique identified NEs", ascending=False)
fig = px.bar(result, x="University", y=["Unique identified NEs", "Unique identified GNDs", "Unique identified GNDs (authors)"],
            barmode="group", color_discrete_sequence=["orange", "mediumorchid", "royalblue"])
fig.show()

# Compute number of mention per uni, country and author

In [16]:
result = result.set_index("University")

In [17]:
# Explode the lists in 'people_ids' column
lvs_exploded_inhalt = lvs.explode("NER_GNDs_Inhalt")

# Merge the exploded events DataFrame with the people DataFrame
merged_df = lvs_exploded_inhalt.merge(authors, left_on='NER_GNDs_Inhalt', right_on='GND-Nummer')

# Group by person and location, and count the number of involvements
grouped_df = merged_df.groupby(['GND-Nummer', 'Universität']).size().reset_index(name='count')

# Pivot the table to get the desired format
pivot_df = grouped_df.pivot(index='GND-Nummer', columns='Universität', values='count').fillna(0)

# Reset the index to get a clean DataFrame
result_authors_per_uni = pivot_df.reset_index().rename_axis(None, axis=1)

for col in result_authors_per_uni.columns:
    if col != "GND-Nummer":
        result_authors_per_uni[col  +"_rel"] = result_authors_per_uni[col]/result["Number of author GNDs"].loc[col]
result_authors_per_uni

Unnamed: 0,GND-Nummer,basel,chemnitz,erfurt,graz,halle,mainz,marburg,stuttgart,wien,basel_rel,chemnitz_rel,erfurt_rel,graz_rel,halle_rel,mainz_rel,marburg_rel,stuttgart_rel,wien_rel
0,100232000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.002674,0.000000,0.0,0.000000,0.00000,0.000000
1,100689590,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000521,0.0,0.000925,0.00000,0.000000
2,1011358441,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000521,0.0,0.000000,0.00000,0.000000
3,10124505X,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001171,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000
4,1012609324,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.003513,0.0,0.0,0.000000,0.000000,0.0,0.000925,0.00000,0.005658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1231,143017152,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000521,0.0,0.000000,0.00000,0.000000
1232,143302795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.002829
1233,143575937,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001171,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000
1234,143811959,3.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,0.003513,0.0,0.0,0.000000,0.000000,0.0,0.000925,0.00385,0.000000


In [20]:
result_authors_per_uni["inhalt_total"] = 0
for col in result_authors_per_uni.columns:
    if col != "GND-Nummer" and col != "inhalt_total" and "rel" not in col:
        result_authors_per_uni["inhalt_total"] += result_authors_per_uni[col]

In [22]:
result_authors_per_uni.sort_values(by="inhalt_total", ascending=False).head()

Unnamed: 0,GND-Nummer,basel,chemnitz,erfurt,graz,halle,mainz,marburg,stuttgart,wien,basel_rel,chemnitz_rel,erfurt_rel,graz_rel,halle_rel,mainz_rel,marburg_rel,stuttgart_rel,wien_rel,inhalt_total
347,118540238,27.0,12.0,9.0,14.0,83.0,29.0,40.0,62.0,15.0,0.031616,0.060914,0.046154,0.037433,0.043274,0.075718,0.037003,0.059673,0.021216,291.0
579,118607626,20.0,4.0,7.0,4.0,58.0,13.0,18.0,35.0,12.0,0.023419,0.020305,0.035897,0.010695,0.03024,0.033943,0.016651,0.033686,0.016973,171.0
406,118552465,14.0,8.0,12.0,5.0,41.0,6.0,28.0,33.0,7.0,0.016393,0.040609,0.061538,0.013369,0.021376,0.015666,0.025902,0.031761,0.009901,154.0
244,118514768,15.0,10.0,1.0,5.0,51.0,19.0,9.0,14.0,19.0,0.017564,0.050761,0.005128,0.013369,0.02659,0.049608,0.008326,0.013474,0.026874,143.0
451,118563076,23.0,5.0,4.0,2.0,35.0,9.0,23.0,30.0,9.0,0.026932,0.025381,0.020513,0.005348,0.018248,0.023499,0.021277,0.028874,0.01273,140.0


In [23]:
# Merge the exploded events DataFrame with the people DataFrame
authors_with_counts = authors.merge(result_authors_per_uni, left_on='GND-Nummer', right_on='GND-Nummer')

In [26]:
# Add relative counts for countries
authors_with_counts["Ö_inhalt_relrel"] = (authors_with_counts["graz_rel"]+authors_with_counts["wien_rel"])/2
authors_with_counts["S_inhalt_relrel"] =  authors_with_counts["basel_rel"]
authors_with_counts["D_inhalt_relrel"] = (authors_with_counts["mainz_rel"] + authors_with_counts["stuttgart_rel"] +
                                          authors_with_counts["marburg_rel"] + authors_with_counts["chemnitz_rel"] +
                                          authors_with_counts["erfurt_rel"] + authors_with_counts["halle_rel"])/2


In [27]:
# Add general relative count
authors_with_counts["inhalt_relrelrel"] = (authors_with_counts["Ö_inhalt_relrel"] +
                                               authors_with_counts["D_inhalt_relrel"]+
                                               authors_with_counts["S_inhalt_relrel"])/3
authors_with_counts.head()

Unnamed: 0,Bevorzugter Name,Ländercode,Beruf oder Beschäftigung,GND-Nummer,Geburtsdatum,Geschlecht,Austrian,German,Swiss,Land,...,halle_rel,mainz_rel,marburg_rel,stuttgart_rel,wien_rel,inhalt_total,Ö_inhalt_relrel,S_inhalt_relrel,D_inhalt_relrel,inhalt_relrelrel
0,"Hanika, Iris",Deutschland,Schriftsteller,124637140,1962,Weiblich,0.0,1.0,0.0,D,...,0.000521,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000261,8.7e-05
1,"Kirsch, Sarah",Deutschland,Lyriker; Maler; Biolog; Librettist; Schriftste...,118562487,1935-04-16,Weiblich,0.0,1.0,0.0,D,...,0.00365,0.0,0.000925,0.001925,0.0,11.0,0.001337,0.0,0.00325,0.001529
2,"Savinkov, Boris V.",Russland;Ukraine,Schriftsteller; Politiker,118804855,1879,Männlich,0.0,0.0,0.0,,...,0.000521,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000261,8.7e-05
3,"Smith, Patti",USA,Musiker; Rocksänger; Schriftsteller; Zeichner;...,118748556,1946-12-30,Weiblich,0.0,0.0,0.0,,...,0.0,0.0,0.0,0.000962,0.0,1.0,0.0,0.0,0.000481,0.00016
4,"Salomon, Ernst von",USA;Deutschland,Schriftsteller; Drehbuchautor; Schriftsteller,118605151,1902-09-25,Männlich,0.0,1.0,0.0,D,...,0.000521,0.002611,0.0,0.0,0.0,3.0,0.001337,0.0,0.001566,0.000968


In [31]:
authors_with_counts.to_csv("../data/authors.csv", sep=";", encoding="utf-8")