Look at the structure of the references extracted by `extract_references.py`

In [1]:
import json
import random

In [2]:
# Load the per-repository reference dumps (edoc = HU, depositonce = TU,
# refubium = FU). Each is used below as a mapping from document ID to the
# references extracted for that document.
# `with` closes every handle as soon as it has been parsed instead of leaving
# it open until garbage collection; the explicit encoding keeps the result
# independent of the platform's locale default.
with open('../data/json/references/edoc.json', encoding='utf-8') as f:
  hu_refs = json.load(f)
with open('../data/json/references/depositonce.json', encoding='utf-8') as f:
  tu_refs = json.load(f)
with open('../data/json/references/refubium.json', encoding='utf-8') as f:
  fu_refs = json.load(f)

In [3]:
# Load the per-repository document data (edoc = HU, depositonce = TU,
# refubium = FU). This notebook only uses each collection's size and iterates
# over its document IDs.
# `with` closes every handle right after parsing; the explicit encoding keeps
# the result independent of the platform's locale default.
with open('../data/json/dim/edoc/relevant_data.json', encoding='utf-8') as f:
  hu = json.load(f)
with open('../data/json/dim/depositonce/relevant_data.json', encoding='utf-8') as f:
  tu = json.load(f)
with open('../data/json/dim/refubium/relevant_data.json', encoding='utf-8') as f:
  fu = json.load(f)

In [4]:
# Number of documents with references per repository (HU, TU, FU), followed by
# the grand total. The bare tuple on the last line is the cell's displayed value.
docs_with_refs = tuple(map(len, (hu_refs, tu_refs, fu_refs)))
(*docs_with_refs, sum(docs_with_refs))

(7447, 7433, 14449, 29329)

How many documents have references, in total and per repository?

In [5]:
# Share of documents per repository for which references were extracted.
# One loop over (label, references, documents) replaces three hand-copied
# print statements, so a label can no longer drift from the data it describes
# (the third line used to print the FU figures under the label "TU").
for label, repo_refs, repo_docs in (
  ('TU', tu_refs, tu),
  ('HU', hu_refs, hu),
  ('FU', fu_refs, fu),
):
  print(f'{label}: {len(repo_refs)} of {len(repo_docs)} docs have references ({round(len(repo_refs)/len(repo_docs), 2)})')

TU: 7433 of 7438 docs have references (1.0)
HU: 7447 of 7497 docs have references (0.99)
TU: 14449 of 14464 docs have references (1.0)


In [6]:
# Overall coverage: documents with references vs. all documents, summed over
# the three repositories.
total_refs_cnt = sum(len(repo_refs) for repo_refs in (tu_refs, hu_refs, fu_refs))
total_docs_cnt = sum(len(repo_docs) for repo_docs in (tu, hu, fu))
coverage = round(total_refs_cnt/total_docs_cnt, 2)
print(f'Total: {total_refs_cnt} of {total_docs_cnt} docs have references ({coverage})')

Total: 29329 of 29399 docs have references (1.0)


How many references are there on average, per repository and in total?

In [7]:
# Total number of extracted references per repository: the sum of the
# reference-list lengths over all documents that have references.
hu_total = sum(map(len, hu_refs.values()))
tu_total = sum(map(len, tu_refs.values()))
fu_total = sum(map(len, fu_refs.values()))

# Average references per document, counting only documents with references.
for label, ref_total, refs_by_doc in (
  ('HU', hu_total, hu_refs),
  ('TU', tu_total, tu_refs),
  ('FU', fu_total, fu_refs),
):
  print(f'{label} avg.: {round(ref_total/len(refs_by_doc), 2)}')

# Average over all three repositories combined.
total = hu_total + tu_total + fu_total
docs_with_refs_cnt = len(hu_refs)+len(tu_refs)+len(fu_refs)
print(f'Total avg.: {round(total/docs_with_refs_cnt, 2)}')


HU avg.: 158.62
TU avg.: 151.7
FU avg.: 158.91
Total avg.: 157.01


That seems like a lot. Theses are surely to blame for these large averages.

In [8]:
def _load_json(path):
  """Parse the JSON file at `path` and close the file handle afterwards."""
  with open(path, encoding='utf-8') as f:
    return json.load(f)


def _split_ref_counts(refs_by_doc, type_by_doc):
  """Split per-document reference counts into (theses, publications).

  A document counts as a thesis when its type contains 'thesis'; every other
  document counts as a publication. Both lists hold one reference count per
  document, in the iteration order of `refs_by_doc`.
  Raises KeyError if a document with references has no entry in `type_by_doc`.
  """
  theses, publications = [], []
  for doc_id, doc_refs in refs_by_doc.items():
    bucket = theses if 'thesis' in type_by_doc[doc_id] else publications
    bucket.append(len(doc_refs))
  return theses, publications


def _avg(counts):
  """Mean of `counts`, rounded to two decimals (ZeroDivisionError if empty)."""
  return round(sum(counts)/len(counts), 2)


# Document type per document ID, one mapping per repository.
hu_types = _load_json('../data/json/dim/edoc/relevant_types.json')
tu_types = _load_json('../data/json/dim/depositonce/relevant_types.json')
fu_types = _load_json('../data/json/dim/refubium/relevant_types.json')

hu_theses, hu_publications = _split_ref_counts(hu_refs, hu_types)
tu_theses, tu_publications = _split_ref_counts(tu_refs, tu_types)
fu_theses, fu_publications = _split_ref_counts(fu_refs, fu_types)

for heading, groups in (
  ('Theses', (hu_theses, tu_theses, fu_theses)),
  ('Publications', (hu_publications, tu_publications, fu_publications)),
):
  print(heading)
  for label, counts in zip(('HU', 'TU', 'FU'), groups):
    print(f'{label} avg.: {_avg(counts)}')
  # The overall average weights every document equally, i.e. it is the mean of
  # the concatenated per-repository lists, not the mean of the three averages.
  print(f'Total avg.: {_avg([cnt for counts in groups for cnt in counts])}')


Theses
HU avg.: 291.45
TU avg.: 227.66
FU avg.: 234.56
Total avg.: 246.54
Publications
HU avg.: 84.86
TU avg.: 91.94
FU avg.: 121.16
Total avg.: 105.27


Collect the IDs of the documents for which no references were extracted (the JSON dump itself is currently commented out).

In [9]:
# Per repository, the IDs of documents that have no entry in the extracted
# references, in the order in which the documents are iterated.
missing = {
  'depositonce': [doc_id for doc_id in tu if doc_id not in tu_refs],
  'edoc': [doc_id for doc_id in hu if doc_id not in hu_refs],
  'refubium': [doc_id for doc_id in fu if doc_id not in fu_refs],
}
# Writing the result to disk is disabled; uncomment to (re)create the file.
# json.dump(missing, open('../data/json/references/missing.json', 'w'))

What types do these documents belong to?

In [12]:
# Document type per document ID, keyed by repository name. The repository name
# is also the directory name below `../data/json/dim/`.
# Each file is opened in a `with` block so its handle is closed right after
# parsing; the explicit encoding avoids depending on the locale default.
types = {}
for repo in ('edoc', 'depositonce', 'refubium'):
  with open(f'../data/json/dim/{repo}/relevant_types.json', encoding='utf-8') as f:
    types[repo] = json.load(f)

In [13]:
# Tally the document types of all documents without references, across the
# three repositories. Keys appear in order of first occurrence.
# The loop variable is `doc_id` rather than `id`, so the builtin `id()` is not
# shadowed for the rest of the kernel session.
missing_types = {}
for repo, doc_ids in missing.items():
  for doc_id in doc_ids:
    doc_type = types[repo][doc_id]
    missing_types[doc_type] = missing_types.get(doc_type, 0) + 1
missing_types

{'doctoralthesis': 18,
 'conferenceobject': 8,
 'article': 29,
 'masterthesis': 5,
 'book': 4,
 'bookpart': 4,
 'workingpaper': 2}

In [16]:
# Print a few IDs without references per repository for manual inspection.
# A dedicated, seeded generator makes the cell print the same IDs on every
# "Restart & Run All" without touching the global `random` state.
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)
for repo, sample_size in (('depositonce', 1), ('edoc', 3), ('refubium', 1)):
  candidates = missing[repo]
  # Clamp the sample size: `sample` raises ValueError when asked for more
  # items than the population contains.
  print(sample_rng.sample(candidates, min(sample_size, len(candidates))))

['oai:depositonce.tu-berlin.de:11303/5131']
['oai:edoc.hu-berlin.de:18452/20419', 'oai:edoc.hu-berlin.de:18452/20665', 'oai:edoc.hu-berlin.de:18452/22629']
['oai:refubium.fu-berlin.de:fub188/19481']
