As shown in `fill_quota.ipynb`, the theses of all three repositories contain referees, whereas only depositonce has a maintained list of advisors. Therefore, referees could be used as a substitute for venues.

In [12]:
import json
from matplotlib import pyplot as plt
from collections import Counter

In [13]:
def _load_records(path):
    """Parse the JSON file at ``path`` and return its content.

    The file is opened in a ``with`` block so the handle is closed as soon
    as parsing finishes, and it is decoded as UTF-8 explicitly rather than
    with the platform's default encoding.
    """
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


# One JSON export per repository; the short names are reused by later cells.
tu = _load_records('../../../data/processed/dim/depositonce.json')
hu = _load_records('../../../data/processed/dim/edoc.json')
fu = _load_records('../../../data/processed/dim/refubium.json')

In [14]:
# Per-repository author statistics, restricted to documents whose
# doc['type'][1] equals 'thesis'.
#   authors[repo]['total']    - number of author entries over all theses
#   authors[repo]['distinct'] - number of different author names
#   seen_authors[repo]        - distinct author names in first-seen order
#   nulls[repo]               - theses without any entry of role 'author'
#   totals[repo]              - number of theses
# Each entry of doc['authors'] is read as (name, role): index 0 is the name,
# index 1 the role that is compared with 'author'.
repos = ['TU', 'HU', 'FU']
authors = {repo: {'total': 0, 'distinct': 0} for repo in repos}
seen_authors = {repo: [] for repo in repos}
nulls = {repo: 0 for repo in repos}
totals = {repo: 0 for repo in repos}
for label, corpus in zip(repos, [tu, hu, fu]):
    # Membership tests on a set are O(1); testing against the growing list
    # made the original loop quadratic in the number of distinct authors.
    # The list is still filled so seen_authors keeps its type and order.
    known_names = set()
    for doc in corpus:
        if doc['type'][1] != 'thesis':
            continue
        totals[label] += 1
        names = [person[0] for person in doc['authors'] if person[1] == 'author']
        if not names:
            nulls[label] += 1
        authors[label]['total'] += len(names)
        for name in names:
            if name not in known_names:
                known_names.add(name)
                seen_authors[label].append(name)
                authors[label]['distinct'] += 1

In [15]:
# One summary line per repository. The number in parentheses is the share of
# theses without an author, rounded to two decimals.
for repo, stats in authors.items():
    # A repository without any thesis would otherwise divide by zero;
    # report a share of 0.0 in that case.
    share = round(nulls[repo] / totals[repo], 2) if totals[repo] else 0.0
    print(f'{repo} has {stats["total"]} authors, {stats["distinct"]} distinct ones. {nulls[repo]} documents do not have an author ({share}).')

TU has 3284 authors, 3277 distinct ones. 0 documents do not have an author (0.0).
HU has 2676 authors, 2655 distinct ones. 0 documents do not have an author (0.0).
FU has 4815 authors, 4785 distinct ones. 0 documents do not have an author (0.0).


Count the number of theses in which each author appears.

In [16]:
# people[repo] maps an author name to the number of thesis author entries
# carrying that name in the given repository.
people = {'TU': {}, 'HU': {}, 'FU': {}}
for label, corpus in zip(repos, [tu, hu, fu]):
    thesis_author_names = (
        person[0]
        for doc in corpus
        if doc['type'][1] == 'thesis'
        for person in doc['authors']
        if person[1] == 'author'
    )
    # Counter tallies the names in first-seen order; the result is copied
    # into the plain dict prepared above.
    people[label].update(Counter(thesis_author_names))

In [17]:
# Average number of theses per distinct author, one line per repository.
for repo, counts in people.items():
    if not counts:
        # No authors recorded: print a plain 0 instead of dividing by zero.
        print(f'{repo} avg.: 0')
        continue
    mean_occurrences = sum(counts.values()) / len(counts)
    print(f'{repo} avg.: {round(mean_occurrences, 2)}')

TU avg.: 1.0
HU avg.: 1.01
FU avg.: 1.01


In [18]:
# Same counts as `people`, with each repository's authors ordered by
# descending count. sorted() is stable, so ties keep their original order.
sorted_people = {
    repo: dict(sorted(people[repo].items(), key=lambda pair: pair[1], reverse=True))
    for repo in ('TU', 'HU', 'FU')
}

In [19]:
# Print the five most frequent authors of every repository. A blank
# separator line follows only when a fifth entry was actually printed.
for repo, ranking in sorted_people.items():
    print(repo)
    for position, (name, occurrences) in enumerate(ranking.items(), start=1):
        print(name, occurrences)
        if position == 5:
            print()
            break

TU
Frank, Benjamin 2
Malissiovas, Georgios 2
Mohammed, Nabilah Adel 2
Mao, Lei 2
Woditsch, Richard 2

HU
Cabrera, Brenda López 3
Weilandt, Martin 2
Strobach, Tilo 2
Detlefsen, Kai 2
Borak, Szymon 2

FU
Li, Yan 3
Rost, Thomas 2
Lang, Annemarie 2
Fischer, Martina 2
Müller, Stefanie 2



Count the theses that have more than one author.

In [20]:
# multiple_authors[repo] is the number of theses that list more than one
# entry with the role 'author'.
multiple_authors = {'TU': 0, 'HU': 0, 'FU': 0}
for label, corpus in zip(repos, [tu, hu, fu]):
    for doc in corpus:
        if doc['type'][1] != 'thesis':
            continue
        author_entries = sum(1 for person in doc['authors'] if person[1] == 'author')
        if author_entries > 1:
            multiple_authors[label] += 1

In [22]:
# Show the multi-author thesis counts next to the thesis totals per
# repository and the overall number of theses.
multiple_authors, totals, sum(totals.values())

({'TU': 0, 'HU': 2, 'FU': 0}, {'TU': 3284, 'HU': 2674, 'FU': 4815}, 10773)