As shown in `fill_quota.ipynb`, the theses of all three repositories contain referees, whereas only depositonce has a maintained list of advisors. Therefore, referees could be used as a substitute for venues.

In [11]:
import json
from matplotlib import pyplot as plt
from collections import Counter
import re

In [2]:
def _load_json(path):
    """Parse the JSON file at `path` and close the file handle afterwards."""
    with open(path) as handle:
        return json.load(handle)


# One parsed JSON dump per repository file; the variable names are the
# repository labels used in the rest of the notebook (TU, HU, FU).
tu = _load_json('../../../data/processed/dim/depositonce.json')
hu = _load_json('../../../data/processed/dim/edoc.json')
fu = _load_json('../../../data/processed/dim/refubium.json')

In [3]:
# Per-repository tallies over documents whose doc['type'][1] is 'thesis':
#   advisors      - number of advisor entries ('total') and of distinct
#                   advisor names ('distinct')
#   seen_advisors - distinct advisor names in order of first appearance
#   nulls         - theses without any advisor entry
#   totals        - number of theses
repos = ['TU', 'HU', 'FU']
advisors = {label: {'total': 0, 'distinct': 0} for label in repos}
seen_advisors = {label: [] for label in repos}
nulls = {label: 0 for label in repos}
totals = {label: 0 for label in repos}

for label, repo in zip(repos, [tu, hu, fu]):
    # The set gives an O(1) "already seen?" check; the list in seen_advisors
    # is still filled so that it keeps the first-appearance order.
    known = set()
    for doc in repo:
        if doc['type'][1] != 'thesis':
            continue
        totals[label] += 1
        # Each entry of doc['authors'] is indexed as [name, role, ...].
        names = [author[0] for author in doc['authors'] if author[1] == 'advisor']
        if not names:
            nulls[label] += 1
            continue
        advisors[label]['total'] += len(names)
        for name in names:
            if name not in known:
                known.add(name)
                seen_advisors[label].append(name)
    advisors[label]['distinct'] = len(known)

In [4]:
# Report, per repository, the advisor totals and the share of theses that
# have no advisor entry at all.
for repo in advisors:
    # A repository without any thesis would divide by zero; report 0 instead.
    share = round(nulls[repo] / totals[repo], 2) if totals[repo] else 0
    print(f'{repo} has {advisors[repo]["total"]} advisors, {advisors[repo]["distinct"]} distinct ones. {nulls[repo]} documents do not have an advisor ({share}).')

TU has 3603 advisors, 1080 distinct ones. 124 documents do not have an advisor (0.04).
HU has 19 advisors, 13 distinct ones. 2659 documents do not have an advisor (0.99).
FU has 0 advisors, 0 distinct ones. 4815 documents do not have an advisor (1.0).


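What is the average number of advisors per thesis? Only documents that list at least one advisor are included, and placeholder names such as "N.N." are not counted.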

In [12]:
# Placeholder advisor names: "N", any single character, optional whitespace,
# "N", optional "." at the start of the name (e.g. "N.N." or "N. N.").
# Raw string so that \s and \. are regex escapes rather than invalid Python
# string escapes; compiled once instead of once per author.
# NOTE(review): the first "." is unescaped and therefore matches any
# character - confirm whether a literal dot was intended.
placeholder_name = re.compile(r'N.[\s]?N[\.]?')

# For every document with at least one non-placeholder advisor, the number of
# such advisors, grouped by repository label.
# NOTE(review): unlike the other cells, this one does not restrict documents
# to doc['type'][1] == 'thesis' - confirm that this is intended.
advisors_per_doc = {'TU': [], 'HU': [], 'FU': []}
for i, repo in enumerate([tu, hu, fu]):
    for doc in repo:
        cnt = sum(
            1
            for author in doc['authors']
            if author[1] == 'advisor' and placeholder_name.match(author[0]) is None
        )
        if cnt > 0:
            advisors_per_doc[repos[i]].append(cnt)

In [15]:
# Mean advisor count per repository; a repository without any counted
# document is reported as 0.
for repo, counts in advisors_per_doc.items():
    mean = sum(counts) / len(counts) if counts else 0
    print(repo, mean)

TU 1.1401898734177216
HU 1.2666666666666666
FU 0


Check in how many theses each advisor occurs.

In [5]:
# people[label][name] = number of theses in that repository that list `name`
# with the role 'advisor'.
people = {'TU': {}, 'HU': {}, 'FU': {}}
for label, repo in zip(repos, [tu, hu, fu]):
    tally = people[label]
    for doc in repo:
        if doc['type'][1] != 'thesis':
            continue
        for author in doc['authors']:
            if author[1] != 'advisor':
                continue
            name = author[0]
            tally[name] = tally.get(name, 0) + 1

In [6]:
# Mean number of theses per advisor, rounded to two decimals; repositories
# without any advisor are reported as 0.
for repo, counts in people.items():
    if not counts:
        print(f'{repo} avg.: 0')
        continue
    mean = round(sum(counts.values()) / len(counts), 2)
    print(f'{repo} avg.: {mean}')

TU avg.: 3.34
HU avg.: 1.46
FU avg.: 0


In [7]:
# Same counts as `people`, with each repository's advisors ordered by
# descending thesis count (ties keep their original relative order).
sorted_people = {
    repo: dict(sorted(people[repo].items(), key=lambda entry: entry[1], reverse=True))
    for repo in ('TU', 'HU', 'FU')
}

In [8]:
# Print the five most frequent advisors of each repository. The separating
# blank line is only emitted once a fifth entry has been printed.
for repo, ranking in sorted_people.items():
    print(repo)
    for position, (name, n_theses) in enumerate(ranking.items(), start=1):
        print(name, n_theses)
        if position == 5:
            print()
            break

TU
Müller, Klaus-Robert 47
Lauster, Roland 45
Obermayer, Klaus 44
Schlögl, Robert 44
Schomäcker, Reinhard 41

HU
Härdle, Wolfgang 6
Rönz, Bernd 2
RR 54663 1
Schulz, Franziska 1
Klinke, Sigbert 1

FU


In [16]:
# Number of advisors per repository that occur in more than `threshold`
# theses.
# The count is printed for every repository. The previous loop printed only
# when it reached an advisor at or below the threshold, so a repository with
# no advisors (or one where all advisors exceed the threshold) was silently
# left out.
threshold = 10
for repo, ranking in sorted_people.items():
    cnt = sum(1 for n_theses in ranking.values() if n_theses > threshold)
    print(repo, cnt)


TU 84
HU 0
