In [2]:
import pandas as pd
import plotly.express as px
from collections import Counter

# By Gender

In [8]:
# Load the semicolon-separated person table. The first CSV column is read as
# the index and then discarded in favour of a fresh 0..n-1 RangeIndex.
people = pd.read_csv(
    "../data/people.csv",
    sep=";",
    encoding="utf-8",
    index_col=0,
)
people = people.reset_index(drop=True)

# Per-gender column sums.
# NOTE(review): not referenced by any of the cells visible here - confirm it is still needed.
people_grouped = people.groupby("Geschlecht").sum()

In [9]:
# Gender breakdown of all people: absolute counts ("Anzahl") and shares ("Anteil").
# Maps the German category labels found in the data to the English labels used in charts.
gender_order = {"Weiblich": "female", "Männlich": "male", "Unbekannt": "unknown"}

gender_column = people["Geschlecht"]
gender_counts = gender_column.value_counts()
gender_proportions = gender_column.value_counts(normalize=True)

gender_stats = (
    pd.concat(
        [gender_counts.rename("Anzahl"), gender_proportions.rename("Anteil")],
        axis=1,
    )
    # Fixed row order; a category that does not occur in the data gets 0 / 0.0.
    .reindex(list(gender_order), fill_value=0)
    .assign(gender=list(gender_order.values()))
)

gender_stats

Unnamed: 0_level_0,Anzahl,Anteil,gender
Geschlecht,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Weiblich,980,0.236031,female
Männlich,3047,0.733863,male
Unbekannt,125,0.030106,unknown


In [13]:
# Pie chart of the gender counts for all people; the PNG is written first,
# then the figure is rendered inline.
pie_source = gender_stats.reset_index()
fig = px.pie(
    pie_source,
    names="gender",
    values="Anzahl",
    title="The chart shows the gender proportion for all people that could be identified",
    color_discrete_sequence=["peru", "maroon", "lightgray"],
)
fig.write_image("../figures/gender_proportions/all_people.png", scale=2)
fig.show()

# Occupations

In [14]:
# Tally how often each occupation is mentioned.
# The occupation field may hold several ";"-separated entries, so each entry
# is counted separately. Non-string cells (e.g. missing values) are skipped,
# and empty fragments produced by a stray or trailing ";" are ignored so they
# do not show up as a profession named "".
berufe = [
    beruf.strip()
    for eintrag in people["Beruf oder Beschäftigung"]
    if isinstance(eintrag, str)
    for beruf in eintrag.split(";")
    if beruf.strip()
]

# Count once and reuse the result for both columns; rows keep the order in
# which each occupation was first encountered.
berufe_counts = Counter(berufe)
berufsdf = pd.DataFrame(
    {
        "Profession": list(berufe_counts.keys()),
        "Count": list(berufe_counts.values()),
    }
)


In [18]:
# Bar chart of the 15 most frequent occupations.
# Note: one person can have several occupations.
top_professions = berufsdf.sort_values(by="Count", ascending=False).head(15)
fig = px.bar(
    top_professions,
    x="Profession",
    y="Count",
    color_discrete_sequence=["royalblue"],
)
fig.show()
fig.write_image("../figures/people_stats/occupations.png", scale=2)

# Only writers

In [22]:
# Only authors: keep every person whose occupation field mentions one of the
# literary professions below. The match is a case-sensitive substring search.
AUTHOR_OCCUPATION_PATTERN = r"Schriftsteller|Lyriker|Dramatiker|Erzähler"

# na=False: people without an occupation entry are never treated as authors.
# `people` itself is deliberately left untouched so that re-running earlier
# cells still operates on the full table.
is_author = people["Beruf oder Beschäftigung"].str.contains(
    AUTHOR_OCCUPATION_PATTERN, na=False
)

# One row per GND number. Rows stay in their original table order, and when
# several rows share a GND number the first one in that order is kept.
authors = people[is_author].drop_duplicates(subset=["GND-Nummer"])

In [23]:
# Number of rows in the author selection.
authors.shape[0]

1845

## By gender (authors only)

In [24]:
# Gender breakdown of the author subset: absolute counts ("Anzahl") and shares ("Anteil").
author_gender = authors["Geschlecht"]
gender_counts = author_gender.value_counts()
gender_proportions = author_gender.value_counts(normalize=True)

# (label in the data, label used in charts), in the fixed display order.
gender_labels = [
    ("Weiblich", "female"),
    ("Männlich", "male"),
    ("Unbekannt", "unknown"),
]

gender_stats = gender_counts.to_frame("Anzahl").join(gender_proportions.rename("Anteil"))
# A category that does not occur among the authors is added with 0 / 0.0.
gender_stats = gender_stats.reindex(
    [data_label for data_label, _ in gender_labels], fill_value=0
)
gender_stats["gender"] = [chart_label for _, chart_label in gender_labels]

gender_stats

Unnamed: 0_level_0,Anzahl,Anteil,gender
Geschlecht,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Weiblich,445,0.241192,female
Männlich,1370,0.742547,male
Unbekannt,30,0.01626,unknown


In [28]:
# Pie chart of the gender counts for the author subset; the PNG is written
# first, then the figure is rendered inline.
# The title names the authors explicitly so this figure cannot be mistaken
# for the all-people chart.
fig = px.pie(
    gender_stats.reset_index(),
    values="Anzahl",
    names="gender",
    color_discrete_sequence=["peru", "maroon", "lightgray"],
    title="The chart shows the gender proportion for the identified authors",
)
fig.write_image("../figures/gender_proportions/authors.png", scale=2)
fig.show()

In [29]:
# Display the current gender table again (same values as the table shown above the chart).
gender_stats

Unnamed: 0_level_0,Anzahl,Anteil,gender
Geschlecht,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Weiblich,445,0.241192,female
Männlich,1370,0.742547,male
Unbekannt,30,0.01626,unknown
