In [1]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import openpyxl
import numpy as np
import plotly.graph_objects as go
from collections import Counter
import pygwalker as pyg

def extract_categories(row):
    """Collect the categories assigned to one keyword as an order-independent set.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` or ``dict``)
        Must provide the keys ``'Category 1'``, ``'Category 2'`` and
        ``'Category 3'``.

    Returns
    -------
    frozenset
        The distinct category labels found in the three columns. Missing
        values (NaN/None) and blank strings are left out, so a keyword without
        any categorisation yields an empty frozenset. The result is unordered:
        two rows with the same labels in a different column order compare equal.
    """
    raw_values = (row['Category 1'], row['Category 2'], row['Category 3'])
    return frozenset(
        value
        for value in raw_values
        # Skip NaN/None cells ...
        if pd.notna(value)
        # ... and blank strings, which the later "uncategorised" check
        # (Category 1 == '') also treats as "no category given".
        and not (isinstance(value, str) and not value.strip())
    )



### Preprocessing and cleaning

In [4]:
# Each rater's categorisation is stored in a separate Excel workbook;
# read both into their own DataFrame.
file_m = "DH_Keywords_Categorisation_Martin_V1.xlsx"
file_n = "DH_Keywords_Categorisation_Nina_V1.xlsx"
df_martin, df_nina = (pd.read_excel(workbook) for workbook in (file_m, file_n))

In [5]:
# Merge the labels "too broad/unspecific", "too specific" and "unclear"
# into the single label "not categorizable".
values_to_replace = ['too broad/unspecific', 'too specific', 'unclear']

# The three columns in which a rater can record a category
columns = ['Category 1', 'Category 2', 'Category 3']

# Relabel both raters' sheets in exactly the same way
for rater_frame in (df_martin, df_nina):
    rater_frame[columns] = rater_frame[columns].replace(values_to_replace, 'not categorizable')

In [6]:
# Collapse each rater's three category columns into one frozenset per tag
# (so the column order is irrelevant), then join both raters on the tag.
# Columns present in both sheets receive the default _x (Martin) / _y (Nina) suffixes.
for rater_frame in (df_martin, df_nina):
    rater_frame['Categories'] = rater_frame.apply(extract_categories, axis=1)

df = pd.merge(df_martin, df_nina, on='Tag', how='inner')
df

Unnamed: 0,Tag,variant_x,Vocabularies (excl. Zotero_DHA),Number of Vocabularies (excl. Zotero_DHA),Category 1_x,Category 2_x,Category 3_x,Count_Zotero_DHA,Categories_x,variant_y,Vocabularies,Number of Vocabularies,Category 1_y,Category 2_y,Category 3_y,Count,Categories_y
0,1922,,,,time period,,,1.0,(time period),,,,time period,,,1.0,(time period)
1,2011,,,,time period,,,1.0,(time period),,,,time period,,,1.0,(time period)
2,#nosource,,,,not categorizable,,,4.0,(not categorizable),,,,not categorizable,,,4.0,(not categorizable)
3,(meta)data,,SSHOC,1.0,object-type,,,,(object-type),,SSHOC,1.0,object-type,,,,(object-type)
4,(quantitative) survey research,,SSHOC,1.0,topic,,,,(topic),,SSHOC,1.0,method/activity,,,,(method/activity)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3777,zustellung,,ARCHE,1.0,,,,,(),,ARCHE,1.0,topic,,,,(topic)
3778,zweiter weltkrieg,,ARCHE,1.0,time period,topic,entity (event),,"(entity (event), time period, topic)",,ARCHE,1.0,time period,,,,(time period)
3779,zwischenkriegszeit,,ARCHE,1.0,time period,,,,(time period),,ARCHE,1.0,time period,,,,(time period)
3780,zwischenstaatliche vereinbarung,,ARCHE,1.0,,,,,(),,ARCHE,1.0,topic,,,,(topic)


In [7]:
# DHA-Zotero data only: keep the rows whose "Count" lies between 1 and 1000
# (both bounds inclusive); rows without a count are not selected.
has_dha_count = df['Count'].between(1, 1000)
df_dha = df.loc[has_dha_count]
df_dha

Unnamed: 0,Tag,variant_x,Vocabularies (excl. Zotero_DHA),Number of Vocabularies (excl. Zotero_DHA),Category 1_x,Category 2_x,Category 3_x,Count_Zotero_DHA,Categories_x,variant_y,Vocabularies,Number of Vocabularies,Category 1_y,Category 2_y,Category 3_y,Count,Categories_y
0,1922,,,,time period,,,1.0,(time period),,,,time period,,,1.0,(time period)
1,2011,,,,time period,,,1.0,(time period),,,,time period,,,1.0,(time period)
2,#nosource,,,,not categorizable,,,4.0,(not categorizable),,,,not categorizable,,,4.0,(not categorizable)
5,(teil-)automatisch generiert,,,,not categorizable,,,4.0,(not categorizable),,,,not categorizable,,,4.0,(not categorizable)
7,1066-1485,,,,time period,,,1.0,(time period),,,,time period,,,1.0,(time period)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3740,xml,,"ARCHE, DARIAH Campus, dha taxonomy, HowTo, SSH...",6.0,format/standard,,,16.0,(format/standard),,"ARCHE, DARIAH Campus, dha taxonomy, HowTo, SSH...",6.0,format/standard,,,16.0,(format/standard)
3742,xml schema,,,,format/standard,,,2.0,(format/standard),,,,format/standard,,,2.0,(format/standard)
3755,yearindh2008,,,,not categorizable,,,1.0,(not categorizable),,,,not categorizable,,,1.0,(not categorizable)
3759,youth,,,,topic,,,1.0,(topic),,,,topic,,,1.0,(topic)


In [5]:
# Remove tags that occur only in ARCHE, i.e. whose vocabulary entry is
# exactly one of the two ARCHE-only spellings below.
arche_only_labels = ['ARCHE', 'ARCHE / ARCHE']
is_arche_only = df['Vocabularies (excl. Zotero_DHA)'].isin(arche_only_labels)
df = df[~is_arche_only]
print(len(df))

2900


In [6]:
# How many, and which, tags were left uncategorised by Martin / Nina?
# (ARCHE-only tags are excluded.)
# A tag counts as uncategorised when its first category is missing or blank.

first_category_m = df['Category 1_x']
empty_category_tags_m = df.loc[first_category_m.isna() | first_category_m.eq(''), 'Tag']
print("Martin:")
print(len(empty_category_tags_m))
for tag in empty_category_tags_m:
    print(tag)
print()

first_category_n = df['Category 1_y']
empty_category_tags_n = df.loc[first_category_n.isna() | first_category_n.eq(''), 'Tag']
print("Nina:")
print(len(empty_category_tags_n))
for tag in empty_category_tags_n:
    print(tag)

Martin:
2
2001: a space odyssey (film)
journal of american history

Nina:
0


In [7]:
# Remove tags that Martin did not categorise at all:
# keep only rows with a value in at least one of his three category columns.
cols_to_check = ['Category 1_x', 'Category 2_x', 'Category 3_x']
has_any_category = df[cols_to_check].notna().any(axis=1)
df = df[has_any_category]
print(df.shape[0])

2898


In [8]:
# Remove tags that Nina did not categorise at all:
# keep only rows with a value in at least one of her three category columns.
cols_to_check = ['Category 1_y', 'Category 2_y', 'Category 3_y']
has_any_category = df[cols_to_check].notna().any(axis=1)
df = df[has_any_category]
print(df.shape[0])

2898


In [9]:
# Insight: when is "topic" assigned in addition to another category?
# Should "topic" be read as a secondary category (e.g. object-types and
# entities are automatically always a topic as well)?

count_martin = 0
count_nina = 0

# Walk through the rows in order; rating1 is Martin's category set, rating2 Nina's.
for tag, rating1, rating2 in zip(df["Tag"], df["Categories_x"], df["Categories_y"]):
    martin_adds_topic = "topic" in rating1 and len(rating1) > 1
    nina_adds_topic = "topic" in rating2 and len(rating2) > 1
    if martin_adds_topic:
        print("Martin:", tag, list(rating1))
        count_martin += 1
    if nina_adds_topic:
        print("Nina:", tag, list(rating2))
        count_nina += 1

print()
print("Martin: Topic in", count_martin, "Fällen zusätzlich verwendet")
print("Nina: Topic in", count_nina, "Fällen zusätzlich verwendet")

Martin: theater ['topic', 'object-type', 'entity (place)']
Nina: antiquity ['topic', 'time period']
Martin: indigenous culture ['topic', 'discipline']
Martin: pandemic ['topic', 'entity (event)']
Martin: 18th century literature ['topic', 'time period']
Martin: 19th century literature ['topic', 'time period']
Martin: african languages ['topic', 'language']
Martin: church ['topic', 'entity (place)']
Martin: earth observations ['topic', 'method/activity']
Nina: earth observations ['topic', 'method/activity']
Martin: emigrants ['topic', 'entity (person)']
Martin: ethnic minorities ['topic', 'entity (person)']
Martin: immigrants ['topic', 'entity (person)']
Martin: place names ['topic', 'entity (place)']
Martin: prison ['topic', 'object-type', 'entity (place)']
Martin: syllables ['topic', 'language']
Martin: thermal effects ['topic', 'object-type']
Martin: typos ['topic', 'object-type']
Martin: voting ['topic', 'method/activity']
Martin: text messaging ['topic', 'method/activity']
Martin: a

Martin: literary plot ['topic', 'object-type']
Martin: literary prizes ['topic', 'object-type']
Martin: media format ['topic', 'format/standard']
Martin: mesh ['topic', 'format/standard']
Martin: metadata search ['topic', 'method/activity']
Nina: muslim history ['topic', 'discipline']
Martin: open research ['topic', 'method/activity']
Martin: peer review ['topic', 'method/activity']
Nina: peer review ['topic', 'method/activity']
Martin: plastics ['topic', 'object-type']
Martin: podcast ['topic', 'object-type']
Martin: point cloud ['topic', 'object-type']
Nina: point cloud ['topic', 'object-type']
Martin: policies ['topic', 'object-type']
Martin: production ['topic', 'method/activity']
Nina: proper name disambiguation ['topic', 'method/activity']
Nina: race films ['topic', 'object-type']
Nina: random forest ['topic', 'method/activity']
Nina: tbx in tei ['topic', 'format/standard']
Nina: web3d ['topic', 'entity (institution/organisation/brand)']
Martin: workplace ['topic', 'entity (place

### Insight into individual rater behaviour

In [10]:
# Tally how often each category label occurs across Martin's category sets
counter = Counter()
for subset in df['Categories_x']:
    counter.update(subset)

# Alphabetically ordered labels and their frequencies, as parallel tuples
items, counts = zip(*sorted(counter.items()))

# Bar chart of the label frequencies
fig = go.Figure()
fig.add_trace(go.Bar(x=items, y=counts, marker_color='skyblue'))
fig.update_layout(
    title='Categorizations by Martin',
    xaxis_title='Element',
    yaxis_title='Count',
    showlegend=False,
    template="plotly_white",
)

fig.show()

In [11]:
# Tally how often each category label occurs across Nina's category sets
counter = Counter()
for subset in df['Categories_y']:
    counter.update(subset)

# Alphabetically ordered labels and their frequencies, as parallel tuples
items, counts = zip(*sorted(counter.items()))

# Bar chart of the label frequencies
fig = go.Figure()
fig.add_trace(go.Bar(x=items, y=counts, marker_color='skyblue'))
fig.update_layout(
    title='Categorizations by Nina',
    xaxis_title='Element',
    yaxis_title='Count',
    showlegend=False,
    template="plotly_white",
)

fig.show()

In [10]:
# Tally how often each category label occurs across Martin's category sets,
# restricted to the DHA-Zotero subset (df_dha).
counter = Counter()
for subset in df_dha['Categories_x']:
    counter.update(subset)

# Alphabetically ordered labels and their frequencies, as parallel tuples
items, counts = zip(*sorted(counter.items()))

# Bar chart of the label frequencies
fig = go.Figure()
fig.add_trace(go.Bar(x=items, y=counts, marker_color='skyblue'))
fig.update_layout(
    # The title names the subset so this figure cannot be confused with the
    # chart built from the full data set.
    title='Categorizations by Martin (DHA-Zotero only)',
    xaxis_title='Element',
    yaxis_title='Count',
    showlegend=False,
    template="plotly_white",
)

fig.show()

In [9]:
# Tally how often each category label occurs across Nina's category sets,
# restricted to the DHA-Zotero subset (df_dha).
counter = Counter()
for subset in df_dha['Categories_y']:
    counter.update(subset)

# Alphabetically ordered labels and their frequencies, as parallel tuples
items, counts = zip(*sorted(counter.items()))

# Bar chart of the label frequencies
fig = go.Figure()
fig.add_trace(go.Bar(x=items, y=counts, marker_color='skyblue'))
fig.update_layout(
    # The title names the subset so this figure cannot be confused with the
    # chart built from the full data set.
    title='Categorizations by Nina (DHA-Zotero only)',
    xaxis_title='Element',
    yaxis_title='Count',
    showlegend=False,
    template="plotly_white",
)

fig.show()

### Calculating Cohen's Kappa Coefficient

**Cohen's Kappa coefficient**

Notes on interpretation:
- 1 = perfect agreement
- 0.8-1 = very good agreement
- 0.6-0.8 = good agreement
- 0.4-0.6 = moderate agreement
- 0.2-0.4 = fair agreement
- 0-0.2 = slight agreement
- 0 = what would be expected by chance, i.e. no actual agreement
- < 0 = worse than what would be expected by chance

Notes on advantages and limitations:
- accounts for chance agreement, thus more robust than simple percent agreement
- for categorical data
- assumes that categories are mutually exclusive and exhaustive

In [12]:
# Cohen's kappa needs one hashable label per item, so every category set is
# turned into a string. The labels are joined in sorted order: str() of a
# frozenset follows the set's internal iteration order, which is not guaranteed
# to be the same for two equal sets, so identical ratings could otherwise end
# up as different labels and be counted as disagreement.
# assign() builds a new frame instead of writing columns into the filtered df.
df = df.assign(
    Categories_rater1_str=df['Categories_x'].map(lambda cats: ' | '.join(sorted(cats))),
    Categories_rater2_str=df['Categories_y'].map(lambda cats: ' | '.join(sorted(cats))),
)
kappa = cohen_kappa_score(df['Categories_rater1_str'], df['Categories_rater2_str'])
print(kappa)

0.549439508014615


### Counting matches and mismatches (Category1 etc. are considered as single data points, i.e. one keyword can lead to one to three (mis)matches)

In [13]:
# Running totals over all rows. They are deliberately NOT called
# count_match / count_mismatch: functions with those names are defined further
# down in this notebook, and sharing the names means re-running this cell
# afterwards would replace the functions with integers.
total_matches = 0
total_mismatches = 0

# Per-row results, collected in row order and attached to df in one step below
matches_per_row = []
mismatches_per_row = []

# The ratings are frozensets, i.e. the order of the categories is irrelevant
for rating1, rating2 in zip(df["Categories_x"], df["Categories_y"]):

    # The rater who assigned more categories is taken as the starting point
    # (on a tie: rater 1)
    if len(rating1) >= len(rating2):
        start_rating, compare_rating = rating1, rating2
    else:
        start_rating, compare_rating = rating2, rating1

    # Every category of the starting rating is a match if the other rater
    # assigned it as well, otherwise a mismatch
    row_matches = len(start_rating & compare_rating)
    row_mismatches = len(start_rating - compare_rating)

    matches_per_row.append(row_matches)
    mismatches_per_row.append(row_mismatches)

    # Add to the overall totals
    total_matches += row_matches
    total_mismatches += row_mismatches

# Store the per-row counts in their own columns (assign() returns a new frame
# rather than writing cell by cell into the filtered df)
df = df.assign(Matches=matches_per_row, Mismatches=mismatches_per_row)

total_compared = total_matches + total_mismatches
if total_compared:
    print("Matches: " + str(total_matches) + " (" + str(round((total_matches * 100)/total_compared)) + "%)")
    print("Mismatches: " + str(total_mismatches) + " (" + str(round((total_mismatches * 100)/total_compared)) + "%)")
else:
    # Nothing to compare: report it instead of dividing by zero
    print("Matches: 0")
    print("Mismatches: 0")


Matches: 2183 (62%)
Mismatches: 1333 (38%)


### Taking a closer look at the mismatches

In [14]:
# Print every tag with at least one mismatch, followed by Martin's and then
# Nina's categories.
rows = zip(df["Tag"], df["Categories_x"], df["Categories_y"], df["Mismatches"])
for tag, cats_martin, cats_nina, n_mismatches in rows:
    if n_mismatches > 0:
        print(tag, list(cats_martin), list(cats_nina))

visual culture ['discipline'] ['topic']
writings on music ['object-type'] ['topic']
english as a lingua franca ['language'] ['topic']
spelling ['method/activity'] ['topic']
theater ['topic', 'object-type', 'entity (place)'] ['topic']
antiquity ['object-type', 'time period'] ['topic', 'time period']
burials ['object-type', 'entity (event)'] ['topic']
code mixing ['method/activity'] ['topic']
indigenous culture ['topic', 'discipline'] ['topic']
pandemic ['topic', 'entity (event)'] ['topic']
seals ['not categorizable'] ['object-type']
18th century literature ['topic', 'time period'] ['object-type']
19th century literature ['topic', 'time period'] ['object-type']
absorption ['method/activity'] ['topic']
african languages ['topic', 'language'] ['topic']
airbnb ['resource/tool'] ['entity (institution/organisation/brand)', 'resource/tool']
ancient rome ['entity (place)', 'time period'] ['topic']
ancient world ['entity (place)', 'time period'] ['topic']
anglo saxons ['entity (person)'] ['topic

### Taking a closer look at individual categories (only matches considered)

In [15]:
def show_matching_entities(category):
    """Print, in sorted order, all tags that both raters assigned to `category`.

    Reads the notebook-level DataFrame `df` (columns "Tag", "Categories_x",
    "Categories_y"). Prints a header line, a blank line, then one tag per line.
    """
    print("Kategorie:", category)
    print()
    agreed_tags = [
        tag
        for tag, cats_x, cats_y in zip(df["Tag"], df["Categories_x"], df["Categories_y"])
        if category in cats_x and category in cats_y
    ]
    agreed_tags.sort()
    for tag in agreed_tags:
        print(tag)

In [21]:
# List every tag that both raters placed in the "discipline" category.
show_matching_entities("discipline")

Kategorie: discipline

550 geowissenschaften
african american history
african history
alte geschichte
american history
amerikaanse letterkunde
ancient history
anthropologie
anthropology
applied humanities
archaeology
archeology
architectural history
architecture
architektur
architektur,
archival studies
archäologie
art history
art science
art studies
arts
astronomy
austrian music studies
bauforschung
bioarchaeology
biological sciences
biology
book history
bulgarian history
business
canadian history
ceramic studies
chemistry
codicology
communication sciences
communication studies
computational linguistics
computational social science
computer science
computer sciences
computerlinguistik
contemporary history
corpus linguistics
cscw
culinary history
cultural history
cultural studies
culture studies
data science
dh
dialectology
didactics
didaktik
digital diplomatics
digital forensic science
digital history
digital hmanities
digital humanism
digital humanities
digital lexicography
digital m

### Comparing different categories in terms of (mis)matches

In [17]:
def count_match(category):
    """Return the number of rows of `df` in which both raters assigned `category`."""
    rating_pairs = zip(df["Categories_x"], df["Categories_y"])
    return sum(
        1
        for cats_x, cats_y in rating_pairs
        if category in cats_x and category in cats_y
    )

def count_mismatch(category):
    """Return the number of rows of `df` in which exactly one rater assigned `category`."""
    rating_pairs = zip(df["Categories_x"], df["Categories_y"])
    return sum(
        1
        for cats_x, cats_y in rating_pairs
        # True when the category is present for one rater but not the other
        if (category in cats_x) != (category in cats_y)
    )

def calculate_agreement(category):
    """Return the agreement rate for `category` as a percentage.

    The rate is matches / (matches + mismatches) * 100, where a match is a row
    in which both raters assigned the category and a mismatch a row in which
    exactly one of them did (see count_match / count_mismatch).

    Returns
    -------
    float
        Percentage between 0 and 100, or NaN when neither rater ever used the
        category (no rows to compare, so the rate is undefined).
    """
    matches = count_match(category)
    mismatches = count_mismatch(category)
    total = matches + mismatches
    if total == 0:
        # Undefined rate: report NaN rather than raising ZeroDivisionError
        return float("nan")
    return matches * 100 / total

def print_agreement(category):
    """Print the agreement rate for `category`, rounded to two decimals, followed by "%"."""
    rounded_rate = round(calculate_agreement(category), 2)
    print(rounded_rate, "%")

In [18]:
# The categories to compare, in the order in which they are reported and plotted
categories = [
    "discipline",
    "entity (event)",
    "entity (institution/organisation/brand)",
    "entity (person)",
    "entity (place)",
    "entity (product)",
    "entity (project)",
    "format/standard",
    "language",
    "method/activity",
    "multiple tags",
    "not categorizable",
    "object-type",
    "resource/tool",
    "time period",
    "topic",
]

# One small block per category: its name, its agreement rate, a blank line
for category in categories:
    print(category)
    print_agreement(category)
    print()

discipline
57.18 %

entity (event)
27.59 %

entity (institution/organisation/brand)
41.07 %

entity (person)
52.1 %

entity (place)
62.58 %

entity (product)
36.36 %

entity (project)
13.33 %

format/standard
43.48 %

language
76.56 %

method/activity
68.6 %

multiple tags
75.13 %

not categorizable
27.65 %

object-type
47.06 %

resource/tool
36.62 %

time period
39.66 %

topic
44.17 %



In [19]:
# Agreement rate (in percent) for every category, in the order of `categories`
agreement_rates = [calculate_agreement(category) for category in categories]

# Bar chart of the agreement rates using Plotly
fig = go.Figure()
fig.add_trace(go.Bar(name='Agreement Rates', x=categories, y=agreement_rates))
fig.update_layout(
    title='Agreement Rates by Category',
    xaxis_title='Category',
    yaxis_title='Agreement Rate (%)',
    xaxis_tickangle=-45,
    yaxis={'range': [0, 100]},  # fixed 0-100 % scale on the y-axis
)

fig.show()

### Exporting the merged dataframe

In [20]:
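# Export is disabled; uncomment to write the merged frame to Excel.
# (The former encoding="utf-8" argument is omitted: DataFrame.to_excel no longer accepts it.)
#df.to_excel('Keyword_Categorisation_merged_18.08.2023.xlsx', index=False)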