Here I want to analyse how many subjects there are per language, and to estimate how many documents have the same subject set in both German and English by checking whether a document has the same number of subjects in each of the two languages.

In [42]:
import json
from matplotlib import pyplot as plt

In [43]:
# Name of the repository whose exported subject data is analysed here.
repo = 'refubium'

# Load both JSON views of the subject data. The context managers close the
# file handles right away instead of leaving them open for the whole session.
with open(f'../../data/json/dim/{repo}/subjects.json') as docs_file:
    # Indexed below as docs[doc_id][subject_type][lang].
    docs = json.load(docs_file)
with open(f'../../data/json/dim/{repo}/subjects_reversed.json') as subjects_file:
    # Indexed below as subjects[key], one sized collection per key.
    subjects = json.load(subjects_file)

In [44]:
# Report the number of subjects stored under each top-level key of `subjects`,
# followed by the grand total over all keys.
print('# of subjects per language')
per_language = {key: len(entries) for key, entries in subjects.items()}
for key, n_subjects in per_language.items():
    print(f'{key}: {n_subjects}')
total = sum(per_language.values())
print(f'Total: {total}')

# of subjects per language
de: 3289
en: 23068
other: 579
unknown: 51419
Total: 78355


In [45]:
def cnt_docs(doc, lang):
    """Return how many subjects of one document are recorded under ``lang``.

    Args:
        doc: Mapping of subject type to a per-language mapping of subject
            collections.
        lang: Language key to look up inside every subject type.

    Returns:
        The summed length of the ``lang`` entries over all subject types;
        0 when ``doc`` has no subject types.

    Raises:
        KeyError: If a subject type (a dict) has no entry for ``lang``.
    """
    return sum(len(by_language[lang]) for by_language in doc.values())

In [46]:
# Count the documents whose number of German subjects equals their number of
# English subjects. Iterating over the values avoids a loop variable named
# `id`, which would shadow the builtin for the rest of the kernel session.
# NOTE(review): a document with no German and no English subjects also
# satisfies the equality (0 == 0), so equal counts alone do not show that
# subjects exist in both languages.
cnt = sum(
    1
    for doc in docs.values()
    if cnt_docs(doc, 'de') == cnt_docs(doc, 'en')
)
print(f'Out of the {len(docs)} documents, {cnt} of them have the same number of english subjects as they have german subjects. A probable cause of this is that ones are the translations of the others.')

Out of the 28720 documents, 21701 of them have the same number of english subjects as they have german subjects. A probable cause of this is that ones are the translations of the others.


In [50]:
def extract_numbers(text):
    """Extract every number-like token from ``text`` as a string.

    A token starts at a digit and continues over digits and dots, so decimal
    notations such as ``'616.89'`` stay in one piece. Dots at the end of a
    token are dropped: they are punctuation that follows the number (as in
    ``'330. Economics'``), not part of it, and keeping them would hide the
    token's real last digit from callers that inspect its ending.

    Args:
        text: The string to scan, or ``None``.

    Returns:
        The tokens in order of appearance; an empty list when ``text`` is
        ``None`` or contains no digit.
    """
    if text is None:
        return []
    numbers = []
    current = ''
    for char in text:
        if char.isdigit():
            current += char
        elif char == '.' and current:
            # A dot only extends a token that has already started.
            current += char
        elif current:
            # Any other character ends the token; it always begins with a
            # digit, so stripping trailing dots never leaves it empty.
            numbers.append(current.rstrip('.'))
            current = ''
    if current:
        # Flush a token that runs up to the end of the text.
        numbers.append(current.rstrip('.'))
    return numbers

In [51]:
def count_ddc(data):
    """Print and return summary statistics about the DDC subjects in ``data``.

    Every number that ``extract_numbers`` finds in a DDC subject string is put
    into one of three buckets according to how its text ends: ``'00'``, a
    single ``'0'``, or anything else.

    Args:
        data: Mapping of document id to document. Each document must have a
            ``'ddc'`` entry that maps to a dict whose values are lists of DDC
            subject strings (presumably one list per language; confirm
            against the data export).

    Returns:
        Tuple ``(at_least, total, cnt_0, cnt_00, cnt)``: the number of
        documents with at least one DDC subject, the total number of DDC
        subjects, and the sizes of the ``'0'``, ``'00'`` and "other" buckets.
        Note that ``cnt_0`` precedes ``cnt_00`` in the tuple.
    """
    at_least, total, cnt_0, cnt_00, cnt = 0, 0, 0, 0, 0
    for doc in data.values():
        # Flatten the per-key lists in one pass (sum(lists, []) is quadratic).
        ddcs = [ddc for group in doc['ddc'].values() for ddc in group]
        if ddcs:
            at_least += 1
        total += len(ddcs)
        for ddc in ddcs:
            for n in extract_numbers(ddc):
                # Check '00' first: such a number also ends in '0'.
                if n.endswith('00'):
                    cnt_00 += 1
                elif n.endswith('0'):
                    cnt_0 += 1
                else:
                    cnt += 1
    n_docs = len(data)
    # Guard the ratios so an empty input reports zeros instead of raising
    # ZeroDivisionError.
    share = round(100 * at_least / n_docs, 2) if n_docs else 0.0
    average = round(total / n_docs, 2) if n_docs else 0.0
    print(f'Out of the {n_docs} documents, {at_least} of them have at least one DDC subject ({share} %).')
    print(f'In total there are {total} DDC subjects, resulting in an avg. of {average} per document.')
    print(f'{cnt_00} of the subjects (with duplicates) end in 00')
    print(f'{cnt_0} of the subjects (with duplicates) end in 0')
    print(f'{cnt} of the subjects (with duplicates) are more specific than that')
    return (at_least, total, cnt_0, cnt_00, cnt)

In [52]:
# Summarise DDC coverage over all loaded documents; the returned tuple is
# displayed as the cell's output.
count_ddc(docs)

Out of the 28720 documents, 28664 of them have at least one DDC subject (99.81 %).
In total there are 32620 DDC subjects, resulting in an avg. of 1.14 per document.
2387 of the subjects (with duplicates) end in 00
23612 of the subjects (with duplicates) end in 0
6620 of the subjects (with duplicates) are more specific than that


(28664, 32620, 23612, 2387, 6620)

Now I'll look only at theses and publications written in English.

In [53]:
# Load the ids of the relevant documents; the context manager closes the
# file handle as soon as the JSON has been parsed.
with open(f'../../data/json/dim/{repo}/relevant.json') as relevant_file:
    relevant_ids = json.load(relevant_file)

# Use a set for the membership test: JSON arrays load as lists, and scanning
# a list once per document would make the filter quadratic.
relevant_id_set = set(relevant_ids)
relevant_docs = {k: v for k, v in docs.items() if k in relevant_id_set}

# Show the size of the full collection next to the size of the filtered one.
len(docs), len(relevant_docs)

(28720, 14464)

In [54]:
# Repeat the DDC summary for the relevant documents only and keep the
# individual counters; the unpacking order follows count_ddc's return tuple.
at_least, total, cnt_0, cnt_00, cnt = count_ddc(relevant_docs)

Out of the 14464 documents, 14424 of them have at least one DDC subject (99.72 %).
In total there are 17013 DDC subjects, resulting in an avg. of 1.18 per document.
1276 of the subjects (with duplicates) end in 00
11063 of the subjects (with duplicates) end in 0
4674 of the subjects (with duplicates) are more specific than that


In [55]:
# Collect the headline figures for the relevant documents in one record.
facts_relevant = dict(
    n_all_docs=len(docs),
    n_docs=len(relevant_docs),
    at_least=at_least,
    cnt_0=cnt_0,
    cnt_00=cnt_00,
    cnt=cnt,
)
# Writing the record to disk is currently disabled; uncomment to persist it.
# with open(f'relevant_facts_{repo}.json', 'w') as facts_file:
#     json.dump(facts_relevant, facts_file)