In [1]:
import numpy as np
import pandas as pd
# Load the line-delimited JSON file (one record per line) into a DataFrame.
df = pd.read_json(
    path_or_buf="250k.docs.jsonl",
    lines=True,
)

In [2]:
# Show which columns the loaded frame provides.
df.columns

Index(['PaperId', 'Title', 'Rank', 'CitationCount', 'Doi', 'DocType',
       'BookTitle', 'Volume', 'Issue', 'FirstPage', 'LastPage',
       'PublishedDate', 'Publisher', 'Journal', 'Authors', 'FieldsOfStudy',
       'Urls', 'PdfUrl', 'Abstract'],
      dtype='object')

In [3]:
# Distinct document types present in the data (missing values included).
df["DocType"].unique()

array(['Journal', 'Book', 'Patent', 'Conference', None], dtype=object)

In [4]:
# Keep only rows whose DocType is "Journal" or "Conference"; every other
# type, and rows with a missing DocType, are left out.
papers = df[
    df["DocType"].isin(["Journal", "Conference"])
]

In [5]:
# Row filtering does not change the columns; list them before pruning.
papers.columns

Index(['PaperId', 'Title', 'Rank', 'CitationCount', 'Doi', 'DocType',
       'BookTitle', 'Volume', 'Issue', 'FirstPage', 'LastPage',
       'PublishedDate', 'Publisher', 'Journal', 'Authors', 'FieldsOfStudy',
       'Urls', 'PdfUrl', 'Abstract'],
      dtype='object')

In [6]:
# Remove the link, DOI, volume/issue/page and book-title columns from the
# working frame.
# errors="ignore" keeps this cell idempotent: without it, running the cell a
# second time raises KeyError because the columns are already gone.
papers = papers.drop(
    columns=[
        "Urls",
        "PdfUrl",
        "Doi",
        "Volume",
        "Issue",
        "FirstPage",
        "LastPage",
        "BookTitle",
    ],
    errors="ignore",
)

In [7]:
# Peek at the author entries of the first paper to see their structure.
papers["Authors"].iloc[0]

[{'Name': 'F. Motoyoshi', 'AuthorId': '2632896860', 'SequenceNumber': 1},
 {'Name': 'N. Oshima', 'AuthorId': '2708987407', 'SequenceNumber': 2}]

In [8]:
# Peek at the field-of-study entries of the first paper to see their structure.
papers["FieldsOfStudy"].iloc[0]

[{'Name': 'tobacco mosaic virus', 'Level': 3},
 {'Name': 'virus', 'Level': 2},
 {'Name': 'protoplast', 'Level': 2},
 {'Name': 'virology', 'Level': 1},
 {'Name': 'botany', 'Level': 1},
 {'Name': 'biology', 'Level': 0}]

In [9]:
# Collect the distinct AuthorId values found in any paper's author list.
authors = {
    author["AuthorId"]
    for author_list in papers["Authors"]
    for author in author_list
}

In [10]:
# Number of distinct author ids collected above.
len(authors)

533118

In [11]:
# Bucket every field-of-study name by its "Level":
#   0 -> major_fields, 1 -> minor_fields, 2 -> topics, anything else -> problems.
major_fields, minor_fields, topics, problems = set(), set(), set(), set()
# Distinct values of "how many level-0 fields does a single paper have".
no_major_fields = set()

buckets_by_level = {0: major_fields, 1: minor_fields, 2: topics}
for field_list in papers["FieldsOfStudy"]:
    for field in field_list:
        # Levels without a dedicated bucket fall through to `problems`.
        buckets_by_level.get(field["Level"], problems).add(field["Name"])
    no_major_fields.add(sum(1 for field in field_list if field["Level"] == 0))

In [12]:
# Report how many distinct names landed in each level bucket, plus the set of
# per-paper major-field counts that were observed.
print(
    f"Major Fields: {len(major_fields)}, Amounts: {no_major_fields}",
    f"Minor Fields: {len(minor_fields)}",
    f"Topics: {len(topics)}",
    f"Problems: {len(problems)}",
    sep="\n",
)

Major Fields: 19, Amounts: {0, 1, 2, 3}
Minor Fields: 294
Topics: 28274
Problems: 49262


In [15]:
def _names_at_level(fields, level):
    """Return the names of the field-of-study entries at one hierarchy level.

    Parameters
    ----------
    fields : list of dict
        One ``FieldsOfStudy`` cell; each dict is read via its "Name" and
        "Level" keys. Anything that is not a list or tuple (e.g. a missing
        value) is treated as "no fields".
    level : int
        The "Level" value to select.

    Returns
    -------
    list of str
        Names in their original order; empty when nothing matches.
    """
    if not isinstance(fields, (list, tuple)):
        return []
    return [field["Name"] for field in fields if field["Level"] == level]


def _journal_name(entry):
    """Return the journal name for one ``Journal`` cell, or NaN if there is none.

    A non-empty dict is read via its "JournalName" key. A non-empty string is
    returned unchanged so that re-running the cell after the column has
    already been flattened does not raise. Everything else (None, NaN, empty
    values) maps to NaN.
    """
    if isinstance(entry, dict) and entry:
        return entry["JournalName"]
    if isinstance(entry, str) and entry:
        return entry
    return np.nan


# Per-paper lists of level-0 and level-1 field names, and the flattened
# journal name. The list names are kept as notebook-level variables, but the
# loop no longer reuses `major_fields` / `minor_fields`, so the sets computed
# earlier in the notebook stay intact.
majors = [_names_at_level(fields, 0) for fields in papers["FieldsOfStudy"]]
minors = [_names_at_level(fields, 1) for fields in papers["FieldsOfStudy"]]
journals = [_journal_name(entry) for entry in papers["Journal"]]

# assign() returns a new frame instead of writing into a frame that was
# derived from `df` by filtering.
papers = papers.assign(
    MajorFields=majors,
    MinorFields=minors,
    Journal=journals,
)

In [17]:
# Inspect the flattened journal-name column.
papers["Journal"]

0                               Journal of General Virology
2                              Journal of Chemical Sciences
5         The Journal of Clinical Endocrinology and Meta...
6                   Women's Studies International Quarterly
7                                   Trabajos De Prehistoria
                                ...                        
249992         Studies in History and Philosophy of Science
249993                                  Sleep and Breathing
249995    International journal of innovative research a...
249996                                Journal of Complexity
249998                                  Thermal Engineering
Name: Journal, Length: 182007, dtype: object