As shown in `fill_quota.ipynb`, the theses of all three repositories contain referees, whereas only depositonce has a maintained list of advisors. Therefore, referees could be used as a substitute for venues.

In [26]:
import json
from matplotlib import pyplot as plt
from collections import Counter

In [27]:
# Load the three repository dumps. Each file is opened in a context manager so
# the handle is closed right after parsing (the bare json.load(open(...)) form
# leaves it open until garbage collection), and UTF-8 is requested explicitly
# so decoding does not depend on the platform's default locale encoding.
with open('../../../data/processed/dim/depositonce.json', encoding='utf-8') as fh:
    tu = json.load(fh)
with open('../../../data/processed/dim/edoc.json', encoding='utf-8') as fh:
    hu = json.load(fh)
with open('../../../data/processed/dim/refubium.json', encoding='utf-8') as fh:
    fu = json.load(fh)

In [28]:
# Per-repository author statistics over documents whose type tag
# (doc['type'][1]) is 'publication':
#   totals        - number of such documents
#   authors       - 'total' author entries and 'distinct' author names
#   seen_authors  - distinct author names in first-seen order
#   nulls         - documents without any entry tagged 'author'
repos = ['TU', 'HU', 'FU']
authors = {label: {'total': 0, 'distinct': 0} for label in repos}
seen_authors = {label: [] for label in repos}
nulls = {label: 0 for label in repos}
totals = {label: 0 for label in repos}
for label, repo in zip(repos, [tu, hu, fu]):
    # Membership is tested against a set (constant time) instead of scanning
    # the growing seen_authors list for every author entry, which was quadratic.
    # Names are hashable: a later cell already uses them as dict keys.
    known = set()
    for doc in repo:
        if doc['type'][1] != 'publication':
            continue
        totals[label] += 1
        has_author = False
        for author in doc['authors']:
            # Entries are indexed as (name, role); only role 'author' counts.
            if author[1] != 'author':
                continue
            has_author = True
            authors[label]['total'] += 1
            if author[0] not in known:
                known.add(author[0])
                authors[label]['distinct'] += 1
                seen_authors[label].append(author[0])
        if not has_author:
            nulls[label] += 1

In [29]:
# Print one summary line per repository: author entries, distinct names, and
# the number (and share) of publications that have no author entry.
for repo in authors:
    # A repository without any publications has totals[repo] == 0; report a
    # share of 0 instead of raising ZeroDivisionError.
    null_share = round(nulls[repo] / totals[repo], 2) if totals[repo] else 0
    print(f'{repo} has {authors[repo]["total"]} authors, {authors[repo]["distinct"]} distinct ones. {nulls[repo]} documents do not have an author ({null_share}).')

TU has 18421 authors, 10581 distinct ones. 63 documents do not have an author (0.02).
HU has 12872 authors, 7778 distinct ones. 0 documents do not have an author (0.0).
FU has 52997 authors, 32817 distinct ones. 144 documents do not have an author (0.01).


Count how many documents each author appears in, then report the per-repository average and the five most frequent authors.

In [30]:
# For each repository, map an author name to the number of 'author' entries
# carrying that name across all documents tagged 'publication'.
people = {'TU': {}, 'HU': {}, 'FU': {}}
for label, repo in zip(repos, [tu, hu, fu]):
    tally = people[label]
    for doc in repo:
        if doc['type'][1] != 'publication':
            continue
        for entry in doc['authors']:
            if entry[1] != 'author':
                continue
            # First occurrence starts at 1, later ones increment.
            tally[entry[0]] = tally.get(entry[0], 0) + 1

In [31]:
# Average number of counted entries per distinct author, per repository.
for repo in people:
    if not people[repo]:
        # No authors recorded: avoid dividing by zero.
        print(f'{repo} avg.: 0')
        continue
    print(f'{repo} avg.: {round(sum(people[repo].values())/len(people[repo]), 2)}')

TU avg.: 1.74
HU avg.: 1.65
FU avg.: 1.61


In [32]:
# Same name -> count mappings as `people`, reordered by descending count.
# The sort is stable, so authors with equal counts keep their original order.
sorted_people = {
    repo: dict(sorted(people[repo].items(), key=lambda item: item[1], reverse=True))
    for repo in ('TU', 'HU', 'FU')
}

In [33]:
# Show the five most frequent authors of every repository.
for repo in sorted_people:
    print(repo)
    for rank, (person, n_docs) in enumerate(sorted_people[repo].items(), start=1):
        print(person, n_docs)
        if rank == 5:
            # The blank separator is printed only when a fifth entry was reached.
            print()
            break

TU
Nagel, Kai 143
Juurlink, Ben 73
Popov, Valentin L. 66
Neubauer, Peter 56
Finkbeiner, Matthias 48

HU
Härdle, Wolfgang 143
Geiser, Jürgen 61
Härdle, Wolfgang Karl 60
Seadle, Michael 57
Güth, Werner 45

FU
Haag, Rainer 92
Eisert, Jens 73
Rillig, Matthias C. 71
Netz, Roland R. 65
Paul, Friedemann 56



In [36]:
# Count, per repository, the publications that list more than one 'author' entry.
multiple_authors = {'TU': 0, 'HU': 0, 'FU': 0}
for label, repo in zip(repos, [tu, hu, fu]):
    for doc in repo:
        if doc['type'][1] != 'publication':
            continue
        n_authors = sum(1 for entry in doc['authors'] if entry[1] == 'author')
        if n_authors > 1:
            multiple_authors[label] += 1

In [37]:
# Multi-author publications and all publications per repository, followed by
# the overall publication count. The third element reproduces the recorded
# output, whose last value equals the sum of the per-repository totals.
multiple_authors, totals, sum(totals.values())

({'TU': 3699, 'HU': 3065, 'FU': 8117},
 {'TU': 4154, 'HU': 4823, 'FU': 9649},
 18626)

In [38]:
# Overall number of multi-author publications and their share of all publications.
n_multi_author = sum(multiple_authors.values())
n_multi_author, n_multi_author / sum(totals.values())

(14881, 0.7989369698271234)