In [1]:
import numpy as np
from collections import defaultdict, Counter, OrderedDict
import pickle
import yaml, json
from schema import Schema, And, Use, Optional, SchemaError

from tqdm.notebook import tqdm

#from typing import List, Optional
#from pydantic import BaseModel

from pathlib import Path
import glob

In [2]:
# Collect every JSON document under ../data/json_files.
# Path.stem extracts the document id (file name without directory or ".json")
# regardless of the OS path separator, and sorting makes the load order -- and
# therefore the iteration order of doc2json -- reproducible across runs.
json_paths = sorted(
    Path(json_file_id)
    for json_file_id in glob.glob('../data/json_files/*.json')
)

# Document ids, in the same order as json_paths.
json_files = [json_path.stem for json_path in json_paths]

# Maps document id -> parsed JSON content of that document's file.
doc2json = {}

for json_path in tqdm(json_paths):
    doc = json_path.stem
    # Open the exact path glob returned instead of rebuilding it from strings.
    with json_path.open("r", encoding='utf-8') as file:
        # Load the JSON data
        doc2json[doc] = json.load(file)

  0%|          | 0/47558 [00:00<?, ?it/s]

In [3]:
# Set of the ten main-category labels that are treated as valid.
# Membership is an exact, case-sensitive string match.
VALID_CATEGORIES = set([
    # natural sciences
    'Physics',
    'Chemistry',
    'Astronomy',
    'Environmental and Earth Sciences',
    'Biology and Life Sciences',
    'Medicine and Health Sciences',
    # formal and applied sciences
    'Mathematics',
    'Engineering and Technology',
    # everything else
    'Social Sciences, Art, and Humanities',
    'Biographies and Book Reviews',
])

In [12]:
# Main category of every document: the text before the first '>' in its
# 'scientific_discipline' field, with surrounding whitespace removed.
LLM_categories = [
    doc2json[i]['scientific_discipline'].split('>')[0].strip()
    for i in doc2json
]

total_num = len(LLM_categories)

# Number of documents whose main category is one of VALID_CATEGORIES.
num_valid_categories = 0

# Print the distinct main categories from most to least frequent and tally
# how many documents carry a valid one.
for category, count in Counter(LLM_categories).most_common():
    print(f"{category}")

    if category in VALID_CATEGORIES:
        num_valid_categories += count

# Guard against an empty corpus so the summary does not raise ZeroDivisionError.
perc_valid_categories = 100*(num_valid_categories/total_num) if total_num else 0.0
print(f"Percentage of valid categories: {perc_valid_categories:.2f}%")


Biology and Life Sciences
Physics
Chemistry
Environmental and Earth Sciences
Engineering and Technology
Mathematics
Earth Sciences
Social Sciences, Art, and Humanities
Astronomy
Medicine and Health Sciences
Materials Science
Psychology
Biology
Social Sciences
Geology
Neuroscience
Medicine
Biographies
Biographies and Book Reviews
History of Science
Material Science
Philosophy
Archaeology
Computer Science
Mechanics
Engineering
Immunology
Earth and Environmental Sciences
History
Linguistics
Statistics
Art and Humanities
Artificial Intelligence
Agriculture
Economics
Cognitive Science
Anthropology
Music
Health Sciences
Fluid Dynamics
Geophysics
Physiology
Life Sciences
Philosophy of Science
Microbiology
Veterinary Medicine
Acoustics
History and Philosophy of Science
Education
Paleontology
Metallurgy
None
Pharmacology
Art History
Geochemistry
Geography
Business and Management
Science
Biochemistry
3. Environmental and Earth Sciences
Music Theory
General Science
Geodesy
Energy and Resources
Ge

In [16]:
# Print the details of every document whose main category is 'Sports Science'.
for doc in doc2json:
    # Split the label on '>' : first piece is the main category, second (if
    # present) is the sub category.
    discipline_parts = doc2json[doc]['scientific_discipline'].split('>')
    main_cat = discipline_parts[0].strip()

    # Reset sub_cat for every document. When the label has no '>' there is no
    # sub category; without this reset the value from a previous document would
    # be printed (or a NameError raised on the first document).
    sub_cat = discipline_parts[1].strip() if len(discipline_parts) > 1 else None

    if main_cat == 'Sports Science':
        print(f"{doc}:")
        print(f"Main category: {main_cat}")
        print(f"Sub category: {sub_cat}")
        print(f"title: {doc2json[doc]['revised_title']}")
        print(f"Tags: {doc2json[doc]['semantic_tags']}")
        print(f"TLDR: {doc2json[doc]['tldr']}")

rstb_1990_0144:
Main category: Sports Science
Sub category: Athletics
title: Optimum Techniques for High and Long Jumping
Tags: ['Athletics', 'Biomechanics', 'Sports Performance', 'Muscle Physiology', 'Jumping Techniques']
TLDR: The article discusses the optimum take-off techniques for high and long jumps, analyzing the speed and leg angle of athletes. A simple model is used, considering the properties of leg muscles, to predict force patterns and jump performance. The findings suggest that high jumpers should run at moderate speeds and set down the foot well in front of the body, while long jumpers should run faster and place the foot less far forward with a steeper angle.


In [8]:
# Sub category of every document whose 'scientific_discipline' contains a '>':
# the second '>'-separated piece, stripped of surrounding whitespace.
LLM_subcategories = []
for record in doc2json.values():
    discipline = record['scientific_discipline']
    if '>' in discipline:
        LLM_subcategories.append(discipline.split('>')[1].strip())

total_num = len(LLM_subcategories)

# Share of each sub category among the documents that have one,
# printed from most to least frequent.
subcategory_counts = Counter(LLM_subcategories)
for category, count in subcategory_counts.most_common():
    perc = count/total_num
    print(f"{category}: {perc*100:.3f}%")

Electromagnetism: 2.960%
Fluid Dynamics: 2.751%
Optics: 2.644%
Materials Science: 2.577%
Zoology: 2.537%
Geology: 1.904%
Organic Chemistry: 1.809%
Biochemistry: 1.603%
Thermodynamics: 1.599%
Neuroscience: 1.565%
Crystallography: 1.357%
Plant Biology: 1.237%
Meteorology: 1.201%
Spectroscopy: 1.170%
Oceanography: 1.167%
History of Science: 1.138%
Microbiology: 1.102%
Cell Biology: 1.092%
Physiology: 1.050%
Nuclear Physics: 0.987%
Atmospheric Science: 0.982%
Electrochemistry: 0.980%
Applied Mathematics: 0.930%
Developmental Biology: 0.827%
Genetics: 0.795%
Neurobiology: 0.766%
Astrophysics: 0.745%
Particle Physics: 0.745%
Atomic Physics: 0.724%
Geomagnetism: 0.717%
Evolutionary Biology: 0.717%
Botany: 0.692%
Physical Chemistry: 0.688%
Paleontology: 0.684%
Geophysics: 0.667%
Ecology: 0.665%
Mechanics: 0.652%
Observational Astronomy: 0.646%
Entomology: 0.633%
Immunology: 0.616%
Molecular Biology: 0.576%
Chemical kinetics: 0.560%
Atomic and Molecular Physics: 0.555%
Astronomy: 0.551%
Mechani

In [10]:
# Print the details of every document whose sub category is 'Linguistics'.
for doc in doc2json:
    record = doc2json[doc]
    pieces = record['scientific_discipline'].split('>')
    main_cat = pieces[0].strip()

    # A label without '>' has no sub category: nothing to show for it.
    if len(pieces) < 2:
        continue

    sub_cat = pieces[1].strip()
    if sub_cat != 'Linguistics':
        continue

    print(f"{doc}:")
    print(f"Main category: {main_cat}")
    print(f"Sub category: {sub_cat}")
    print(f"title: {record['revised_title']}")
    print(f"Tags: {record['semantic_tags']}")
    print(f"TLDR: {record['tldr']}")

   

101583:
Main category: Social Sciences, Art, and Humanities
Sub category: Linguistics
title: Teaching a Dumb and Deaf Person to Speak and Understand Language
Tags: ['Language Acquisition', 'Deaf Education', 'Communication', 'Linguistics', 'Sign Language']
TLDR: The author, Dr. John Wallis, discusses his attempts to teach a dumb and deaf person to speak and understand language. He describes the challenges faced, such as teaching pronunciation and understanding language without the aid of hearing. Wallis also explores the possibility of using sign language and alternative characters to represent thoughts and ideas, highlighting the potential for communication between people with different languages and abilities.
101850:
Main category: Social Sciences, Art, and Humanities
Sub category: Linguistics
title: A Proposal for an Universal Alphabet
Tags: ['Linguistics', 'Language Development', 'Phonetics', 'Alphabet Design', 'Writing Systems']
TLDR: The author, Francis Lodwick, presents a propos

In [17]:
# Full 'scientific_discipline' label (main and sub category together) of every
# document whose label contains a '>'.
LLM_subcategories = [
    record['scientific_discipline']
    for record in doc2json.values()
    if '>' in record['scientific_discipline']
]
total_num = len(LLM_subcategories)

# Share of each full label among those documents, most frequent first.
for category, count in Counter(LLM_subcategories).most_common():
    perc = count/total_num
    print(f"{category}: {perc*100:.3f}%")

Physics > Electromagnetism: 2.953%
Physics > Optics: 2.577%
Physics > Fluid Dynamics: 2.297%
Biology and Life Sciences > Zoology: 2.072%
Chemistry > Organic Chemistry: 1.809%
Biology and Life Sciences > Neuroscience: 1.550%
Earth Sciences > Geology: 1.546%
Biology and Life Sciences > Plant Biology: 1.218%
Physics > Materials Science: 1.207%
Physics > Thermodynamics: 1.207%
Biology and Life Sciences > Microbiology: 1.083%
Biology and Life Sciences > Cell Biology: 1.081%
Social Sciences, Art, and Humanities > History of Science: 1.071%
Environmental and Earth Sciences > Meteorology: 1.069%
Biology and Life Sciences > Biochemistry: 1.045%
Physics > Nuclear Physics: 0.978%
Environmental and Earth Sciences > Oceanography: 0.968%
Chemistry > Spectroscopy: 0.959%
Mathematics > Applied Mathematics: 0.894%
Biology and Life Sciences > Physiology: 0.892%
Environmental and Earth Sciences > Atmospheric Science: 0.890%
Chemistry > Electrochemistry: 0.854%
Biology and Life Sciences > Developmental Bi