In [1]:
import pymongo
import pandas as pd
import numpy as np
from pathlib import Path

### Connecting to Database

In [2]:
# Names of the MongoDB database and collection queried throughout this notebook.
db_name = "arxiv-db"
collection_name = "arxiv-dataset-collection"

# Connect to the MongoDB server on localhost:21000, then resolve the database
# and collection handles by name.
db = pymongo.MongoClient("localhost", 21000)[db_name]
collection = db.get_collection(collection_name)
collection

Collection(Database(MongoClient(host=['localhost:21000'], document_class=dict, tz_aware=False, connect=True), 'arxiv-db'), 'arxiv-dataset-collection')

### Querying Documents

In [3]:
# Total number of documents in the collection (the empty filter matches every document).
collection.count_documents({})

2258347

In [4]:
# Fetch a single document (no filter given) to inspect which fields a record carries.
record_one = collection.find_one()
record_one

{'_id': ObjectId('64729c935ced617335d85d64'),
 'id': '0704.0001',
 'submitter': 'Pavel Nadolsky',
 'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
 'comments': '37 pages, 15 figures; published version',
 'journal-ref': 'Phys.Rev.D76:013009,2007',
 'doi': '10.1103/PhysRevD.76.013009',
 'report-no': 'ANL-HEP-PR-07-12',
 'categories': 'hep-ph',
 'license': None,
 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Goo

In [5]:
# Map each field name to the Python type of its value in the sample record.
# `_id` is filtered out while building the mapping, so no separate `del` is
# needed and a record without `_id` does not raise KeyError.
type_dict = {
    field: type(value)
    for field, value in record_one.items()
    if field != '_id'
}
type_dict

{'id': str,
 'submitter': str,
 'authors': str,
 'title': str,
 'comments': str,
 'journal-ref': str,
 'doi': str,
 'report-no': str,
 'categories': str,
 'license': NoneType,
 'abstract': str,
 'versions': list,
 'update_date': str,
 'authors_parsed': list}

In [6]:
from pydantic import BaseModel
from typing import Union, List

class ArxivRecord(BaseModel):
    """Typed view of one arXiv metadata document.

    Only the fields declared here are modelled. Documents carry further keys
    (for example ``_id``, ``journal-ref`` and ``report-no``) that are not
    declared; pydantic's default configuration ignores undeclared keys, so
    ``ArxivRecord(**document)`` still works on a raw document.
    """

    id: str
    # The field types were read off a single sample document, in which only
    # `license` happened to be null. Free-form metadata fields are therefore
    # declared nullable (defaulting to None) so that a document with a null
    # value there is not rejected by validation.
    # NOTE(review): confirm against the full collection which fields are
    # actually nullable, and tighten these again where they never are.
    submitter: Union[None, str] = None
    authors: str
    title: str
    comments: Union[None, str] = None
    doi: Union[None, str] = None
    categories: str
    license: Union[None, str] = None
    abstract: str
    versions: List
    authors_parsed: List

    @property
    def categories_list(self) -> List[str]:
        """Return the record's individual category codes.

        ``categories`` stores the codes as one whitespace-separated string
        (e.g. ``"adap-org cond-mat nlin.AO"``); this splits it into a list.
        """
        return self.categories.split()

In [7]:
# Validate the sample document against the model. The document carries keys the
# model does not declare (e.g. `_id`, `journal-ref`), so a successful run also
# shows that undeclared keys are tolerated.
obj = ArxivRecord(**record_one)
obj.categories_list

['hep-ph']

### Explore Unique Categories

In [11]:
# Distinct values of the raw `categories` field. Each value is a whitespace-separated
# string of category codes (a combination), not a single code.
categories = collection.distinct("categories")
categories[:5]

['acc-phys hep-ex physics.acc-ph',
 'acc-phys physics.acc-ph',
 'adap-org astro-ph cond-mat nlin.AO',
 'adap-org bayes-an nlin.AO',
 'adap-org chao-dyn cond-mat.stat-mech math.DS nlin.AO nlin.CD physics.bio-ph physics.chem-ph q-bio.PE']

In [30]:
# Peek at a few entries further into the list of distinct category strings.
categories[100:105]

['astro-ph chao-dyn nlin.CD',
 'astro-ph chao-dyn nlin.CD physics.flu-dyn',
 'astro-ph chem-ph',
 'astro-ph comp-gas nlin.CG',
 'astro-ph cond-mat']

In [31]:
# Another sample window of the distinct category strings.
categories[200:205]

['astro-ph cs.NA cs.NE math.NA nlin.CD',
 'astro-ph cs.NE stat.ML',
 'astro-ph cs.OH physics.data-an physics.space-ph',
 'astro-ph gr-qc',
 'astro-ph gr-qc hep-ex']

In [33]:
# Sample window from deep inside the list of distinct category strings.
categories[50000:50005]

['math.OC math.CO math.RT',
 'math.OC math.CO math.ST stat.TH',
 'math.OC math.CO q-bio.MN q-bio.QM',
 'math.OC math.CO q-bio.QM',
 'math.OC math.CT']

In [12]:
# Number of distinct category strings (combinations of codes, not individual codes).
len(categories)

76869

In [13]:
# Collect every individual category code that appears in any category string.
# Each string is split on whitespace; a set comprehension builds the result
# directly instead of a throwaway list of `None`s from `set.update` calls.
cat_set = {code for entry in categories for code in entry.split()}
len(cat_set)

176

In [14]:
# Show the individual category codes collected above.
cat_set

{'acc-phys',
 'adap-org',
 'alg-geom',
 'ao-sci',
 'astro-ph',
 'astro-ph.CO',
 'astro-ph.EP',
 'astro-ph.GA',
 'astro-ph.HE',
 'astro-ph.IM',
 'astro-ph.SR',
 'atom-ph',
 'bayes-an',
 'chao-dyn',
 'chem-ph',
 'cmp-lg',
 'comp-gas',
 'cond-mat',
 'cond-mat.dis-nn',
 'cond-mat.mes-hall',
 'cond-mat.mtrl-sci',
 'cond-mat.other',
 'cond-mat.quant-gas',
 'cond-mat.soft',
 'cond-mat.stat-mech',
 'cond-mat.str-el',
 'cond-mat.supr-con',
 'cs.AI',
 'cs.AR',
 'cs.CC',
 'cs.CE',
 'cs.CG',
 'cs.CL',
 'cs.CR',
 'cs.CV',
 'cs.CY',
 'cs.DB',
 'cs.DC',
 'cs.DL',
 'cs.DM',
 'cs.DS',
 'cs.ET',
 'cs.FL',
 'cs.GL',
 'cs.GR',
 'cs.GT',
 'cs.HC',
 'cs.IR',
 'cs.IT',
 'cs.LG',
 'cs.LO',
 'cs.MA',
 'cs.MM',
 'cs.MS',
 'cs.NA',
 'cs.NE',
 'cs.NI',
 'cs.OH',
 'cs.OS',
 'cs.PF',
 'cs.PL',
 'cs.RO',
 'cs.SC',
 'cs.SD',
 'cs.SE',
 'cs.SI',
 'cs.SY',
 'dg-ga',
 'econ.EM',
 'econ.GN',
 'econ.TH',
 'eess.AS',
 'eess.IV',
 'eess.SP',
 'eess.SY',
 'funct-an',
 'gr-qc',
 'hep-ex',
 'hep-lat',
 'hep-ph',
 'hep-th',
 'm

### Leading Categories
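- The leading category is the first code listed in a record's space-separated `categories` string, treated here as the start of the hierarchy.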

In [34]:
# First code of every distinct category string.
# A set comprehension builds the result directly instead of a throwaway list
# of `None`s from `set.update` calls.
leading_categories = {entry.split()[0] for entry in categories}
len(leading_categories)

172

In [35]:
# Show the codes that appear first in at least one category string.
leading_categories

{'acc-phys',
 'adap-org',
 'alg-geom',
 'ao-sci',
 'astro-ph',
 'astro-ph.CO',
 'astro-ph.EP',
 'astro-ph.GA',
 'astro-ph.HE',
 'astro-ph.IM',
 'astro-ph.SR',
 'atom-ph',
 'bayes-an',
 'chao-dyn',
 'chem-ph',
 'cmp-lg',
 'comp-gas',
 'cond-mat',
 'cond-mat.dis-nn',
 'cond-mat.mes-hall',
 'cond-mat.mtrl-sci',
 'cond-mat.other',
 'cond-mat.quant-gas',
 'cond-mat.soft',
 'cond-mat.stat-mech',
 'cond-mat.str-el',
 'cond-mat.supr-con',
 'cs.AI',
 'cs.AR',
 'cs.CC',
 'cs.CE',
 'cs.CG',
 'cs.CL',
 'cs.CR',
 'cs.CV',
 'cs.CY',
 'cs.DB',
 'cs.DC',
 'cs.DL',
 'cs.DM',
 'cs.DS',
 'cs.ET',
 'cs.FL',
 'cs.GL',
 'cs.GR',
 'cs.GT',
 'cs.HC',
 'cs.IR',
 'cs.IT',
 'cs.LG',
 'cs.LO',
 'cs.MA',
 'cs.MM',
 'cs.MS',
 'cs.NA',
 'cs.NE',
 'cs.NI',
 'cs.OH',
 'cs.OS',
 'cs.PF',
 'cs.PL',
 'cs.RO',
 'cs.SC',
 'cs.SD',
 'cs.SE',
 'cs.SI',
 'cs.SY',
 'dg-ga',
 'econ.EM',
 'econ.GN',
 'econ.TH',
 'eess.AS',
 'eess.IV',
 'eess.SP',
 'eess.SY',
 'funct-an',
 'gr-qc',
 'hep-ex',
 'hep-lat',
 'hep-ph',
 'hep-th',
 'm

### Terminal Categories
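- The terminal category is the last code listed in a record's space-separated `categories` string, treated here as the bottom of the hierarchy.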

In [28]:
# Last code of every distinct category string.
# A set comprehension builds the result directly instead of a throwaway list
# of `None`s from `set.update` calls.
terminal_categories = {entry.split()[-1] for entry in categories}
len(terminal_categories)

164

In [29]:
# Show the codes that appear last in at least one category string.
terminal_categories

{'astro-ph',
 'astro-ph.CO',
 'astro-ph.EP',
 'astro-ph.GA',
 'astro-ph.HE',
 'astro-ph.IM',
 'astro-ph.SR',
 'atom-ph',
 'bayes-an',
 'chem-ph',
 'cond-mat',
 'cond-mat.dis-nn',
 'cond-mat.mes-hall',
 'cond-mat.mtrl-sci',
 'cond-mat.other',
 'cond-mat.quant-gas',
 'cond-mat.soft',
 'cond-mat.stat-mech',
 'cond-mat.str-el',
 'cond-mat.supr-con',
 'cs.AI',
 'cs.AR',
 'cs.CC',
 'cs.CE',
 'cs.CG',
 'cs.CL',
 'cs.CR',
 'cs.CV',
 'cs.CY',
 'cs.DB',
 'cs.DC',
 'cs.DL',
 'cs.DM',
 'cs.DS',
 'cs.ET',
 'cs.FL',
 'cs.GL',
 'cs.GR',
 'cs.GT',
 'cs.HC',
 'cs.IR',
 'cs.LG',
 'cs.LO',
 'cs.MA',
 'cs.MM',
 'cs.MS',
 'cs.NA',
 'cs.NE',
 'cs.NI',
 'cs.OH',
 'cs.OS',
 'cs.PF',
 'cs.PL',
 'cs.RO',
 'cs.SC',
 'cs.SD',
 'cs.SE',
 'cs.SI',
 'cs.SY',
 'econ.EM',
 'econ.GN',
 'econ.TH',
 'eess.AS',
 'eess.IV',
 'eess.SP',
 'eess.SY',
 'gr-qc',
 'hep-ex',
 'hep-lat',
 'hep-ph',
 'hep-th',
 'math.AC',
 'math.AG',
 'math.AP',
 'math.AT',
 'math.CA',
 'math.CO',
 'math.CT',
 'math.CV',
 'math.DG',
 'math.DS',
 'm

### Leading and Terminal Intersection

In [38]:
# Codes that occur both as the first entry and as the last entry of some category string.
intersect_cat = leading_categories.intersection(terminal_categories)
len(intersect_cat)

160

### Label Encoding

In [60]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Integer labels: fit on the individual category codes (fit returns the encoder itself).
le = LabelEncoder().fit(list(cat_set))

# One-hot encoding: the codes are passed as a single column of shape (n_codes, 1).
ohe = OneHotEncoder()
ohe.fit(np.array(list(cat_set))[:, np.newaxis])

In [39]:
# Preview only the first entries: the full list holds tens of thousands of
# strings, and dumping all of it bloats the notebook.
categories[:20]

['acc-phys hep-ex physics.acc-ph',
 'acc-phys physics.acc-ph',
 'adap-org astro-ph cond-mat nlin.AO',
 'adap-org bayes-an nlin.AO',
 'adap-org chao-dyn cond-mat.stat-mech math.DS nlin.AO nlin.CD physics.bio-ph physics.chem-ph q-bio.PE',
 'adap-org chao-dyn nlin.AO nlin.CD',
 'adap-org chao-dyn nlin.AO nlin.CD nlin.PS patt-sol',
 'adap-org chao-dyn nlin.AO nlin.CD physics.med-ph q-bio',
 'adap-org comp-gas nlin.AO nlin.CG',
 'adap-org cond-mat hep-lat nlin.AO',
 'adap-org cond-mat nlin.AO',
 'adap-org cond-mat nlin.AO nlin.CD',
 'adap-org cond-mat nlin.AO nlin.PS patt-sol',
 'adap-org cond-mat nlin.AO nlin.SI solv-int',
 'adap-org cond-mat nlin.AO physics.optics',
 'adap-org cond-mat nlin.AO q-bio',
 'adap-org cond-mat.dis-nn cond-mat.stat-mech nlin.AO physics.bio-ph q-bio.NC',
 'adap-org cond-mat.dis-nn math.DS nlin.AO nlin.PS patt-sol physics.bio-ph',
 'adap-org cond-mat.dis-nn nlin.AO',
 'adap-org cond-mat.soft nlin.AO',
 'adap-org cond-mat.soft nlin.AO physics.bio-ph q-bio',
 'adap-

### Read record and transform its category

In [44]:
# Fetch a document to run through the label encoder. With no filter this returned the
# same document as the earlier find_one() call (the `_id` in both outputs is identical).
record = collection.find_one()
record

{'_id': ObjectId('64729c935ced617335d85d64'),
 'id': '0704.0001',
 'submitter': 'Pavel Nadolsky',
 'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
 'comments': '37 pages, 15 figures; published version',
 'journal-ref': 'Phys.Rev.D76:013009,2007',
 'doi': '10.1103/PhysRevD.76.013009',
 'report-no': 'ANL-HEP-PR-07-12',
 'categories': 'hep-ph',
 'license': None,
 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Goo

In [49]:
# The record stores its categories as one string; split on whitespace to get the individual codes.
record['categories'].split()

['hep-ph']

In [50]:
# Encode each of the record's category codes as its integer label from the fitted encoder.
le.transform(record['categories'].split())

array([79])