Analyse the documents that were retrieved from OpenAlex.

In [1]:
import json
from os import listdir
from collections import Counter
from matplotlib import pyplot as plt

In [2]:
# Locations of the retrieved OpenAlex data (relative to this notebook).
docs_folder = '../data/openalex/docs'
subjects_file = '../data/openalex/subjects.json'
# Load the subject metadata inside a context manager so the file handle is
# closed right away instead of lingering until garbage collection.
with open(subjects_file) as subjects_fh:
  subjects = json.load(subjects_fh)

In [3]:
doc_cnt = {}  # map subjects to their number of docs
for fname in listdir(docs_folder):
  # Use a context manager so each file is closed as soon as it has been parsed;
  # the folder may hold many files and unclosed handles would pile up.
  with open(f'{docs_folder}/{fname}') as docs_fh:
    docs = json.load(docs_fh)
  for subject, group in docs.items():
    # If a subject shows up in more than one file, the last file read wins.
    doc_cnt[subject] = len(group)
len(doc_cnt)

2157

In [4]:
sum(doc_cnt.values())  # total no. of docs, summed over subjects (a doc listed under several subjects is counted once per listing)

214538

How many documents don't have a field assigned to them?

In [5]:
# Count the docs that have no field, i.e. docs for which none of the assigned
# subjects known to `subjects` has level 0.
no_field = 0
for fname in listdir(docs_folder):
  # Context manager: close every file right after parsing it.
  with open(f'{docs_folder}/{fname}') as docs_fh:
    docs = json.load(docs_fh)
  for group in docs.values():
    for doc in group:
      # Subjects missing from `subjects` are ignored; any() stops at the
      # first level-0 subject, just like an explicit break would.
      has_field = any(
        subject in subjects and subjects[subject]['level'] == 0
        for subject in doc['subjects']
      )
      if not has_field:
        no_field += 1
no_field

1952

Count the occurrences of subject sets among docs that don't have a field assigned to them.

In [6]:
# For every doc without a field (no assigned subject of level 0), build the
# sorted list of its subject names and count how often each such list occurs.
# Keys are the string form of the sorted name list so they can be hashed.
nofield_cnt = Counter()
for fname in listdir(docs_folder):
  # Context manager: close every file right after parsing it.
  with open(f'{docs_folder}/{fname}') as docs_fh:
    docs = json.load(docs_fh)
  for group in docs.values():
    for doc in group:
      # Only subjects that are present in `subjects` can be inspected or named.
      known = [s for s in doc['subjects'] if s in subjects]
      if any(subjects[s]['level'] == 0 for s in known):
        continue  # doc has a field, not of interest here
      # Look names up through the doc's own (few) subjects instead of scanning
      # every key of `subjects` per doc; sorting makes the order irrelevant.
      s_names = sorted(subjects[s]['name'] for s in known)
      nofield_cnt[str(s_names)] += 1
nofield_cnt.most_common(10)

[("['Humanities']", 11),
 ("['Croatian']", 8),
 ("['Form of the Good']", 8),
 ("['San Joaquin']", 7),
 ("['Table (database)']", 6),
 ("['Crew']", 6),
 ("['Analogy']", 6),
 ("['Variety (cybernetics)']", 5),
 ("['Division (mathematics)']", 5),
 ("['Craft']", 5)]

How many of the documents without assigned fields have Mechanics assigned to them?

In [7]:
# Sum the doc counts of all subject sets whose string form contains the
# quoted name 'Mechanics'.
mechanics_cnt = sum(
  n_docs
  for subject_set, n_docs in nofield_cnt.items()
  if "'Mechanics'" in subject_set
)
mechanics_cnt

122

Basic stats

In [8]:
sum(nofield_cnt.values()) / len(nofield_cnt)  # avg. no. of docs per distinct subject set (among docs that do not have a field)

1.2338811630847029

In [9]:
sum(1 for n_docs in nofield_cnt.values() if n_docs == 1)  # no. of subject sets (of docs without a field) that occur exactly once

1362

In [10]:
sum(1 for key in nofield_cnt if ',' not in key)  # no. of subject sets whose string form has no comma, i.e. at most one subject (assumes subject names contain no commas)

427

In [11]:
len(nofield_cnt)  # no. of distinct subject sets among docs that do not have a field

1582

Are there duplicates?

In [12]:
# Count how often each doc content occurs across all files. The string form
# of doc['data'] serves as the (hashable) identity of a doc, so docs with
# equal 'data' are treated as copies of one another.
duplicate_cnt = Counter()
for fname in listdir(docs_folder):
  # Context manager: close every file right after parsing it.
  with open(f'{docs_folder}/{fname}') as docs_fh:
    docs = json.load(docs_fh)
  for group in docs.values():
    for doc in group:
      duplicate_cnt[str(doc['data'])] += 1
duplicate_cnt.most_common(5)

[('[]', 122),
 ("['meet', 'editorial', 'board', 'member']", 16),
 ("['appeal', 'editor-in-chief', 'abstract']", 15),
 ("['spin', 'crossover', 'cobalt', 'complex', 'co-based', 'complex', 'pypz', 'pyridine', 'pyrazole', 'deposit', 'investigate', 'scan', 'tunneling', 'microscopy', 'tridentate', 'coordination', 'sphere', 'molecule', 'aggregate', 'mainly', 'tetramers', 'individual', 'complex', 'tetramers', 'undergo', 'reversible', 'transition', 'state', 'characteristic', 'image', 'contrast', 'current', 'pass', 'neighbor', 'molecule', 'exhibit', 'bistability', 'molecule', 'stable', 'transition', 'rate', 'vary', 'linearly', 'tunnel', 'current', 'exhibit', 'intriguing', 'dependence', 'bias', 'voltage', 'polarity', 'interpret', 'state', 'spin', 'state', 'complex', 'image', 'contrast', 'orders-of-magnitude', 'variation', 'switching', 'yield', 'tentatively', 'understand', 'calculate', 'orbital', 'structure', 'spin', 'state', 'provide', 'insight', 'mechanism', 'electron-induced', 'excited', 'spin-

In [13]:
# Total number of duplicates: add up the occurrence counts of every doc
# content that appears more than once (all copies are counted).
summ = sum(n_copies for n_copies in duplicate_cnt.values() if n_copies > 1)
summ

4849

In [14]:
# Number of duplicated docs: distinct doc contents that appear more than once.
lenn = sum(1 for n_copies in duplicate_cnt.values() if n_copies > 1)
lenn

2346

In [15]:
summ/lenn  # avg. number of copies per doc that occurs more than once

2.06692242114237

How many distinct documents are there?

In [16]:
len(duplicate_cnt)  # no. of distinct docs (docs with equal 'data' are counted once)

212035