In [1]:
import pandas as pd 
import numpy as np
import ast
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
import torch
from sentence_transformers import SentenceTransformer, util # type: ignore

In [4]:
# Instantiate the sentence-embedding model identified by the name
# 'msmarco-distilbert-base-tas-b'.
# NOTE(review): `model` is not referenced by any of the tag-cleaning cells
# visible below; presumably it is consumed later in the notebook - confirm.
model = SentenceTransformer('msmarco-distilbert-base-tas-b')

In [2]:
# Load the arXiv abstracts dataset; the `terms` column is what the cells
# below parse into individual tags.
# A forward slash is used as the path separator: it is accepted on Windows,
# Linux and macOS alike, whereas a backslash only resolves on Windows.
df = pd.read_csv('arxiv-paper-abstracts/arxiv_data_210930-054931.csv')

In [3]:
def clean_categories(categories):
    """Parse a stringified Python list of tags and trim each entry.

    Parameters
    ----------
    categories : str
        Text holding a Python list literal, e.g. "['cs.LG', ' stat.ML']".

    Returns
    -------
    list
        The parsed entries with leading/trailing whitespace removed.
    """
    parsed_entries = ast.literal_eval(categories)
    return [entry.strip() for entry in parsed_entries]

In [4]:
# One row per distinct raw `terms` string, plus a second column holding that
# string parsed into a list of whitespace-trimmed tags.
unique_terms = list(df['terms'].unique())
categories = pd.DataFrame(unique_terms, columns=['categories']).assign(
    clean_categories=lambda frame: frame['categories'].apply(clean_categories)
)

In [5]:
# Flatten every parsed tag list into one set of distinct tags.
# The comprehension deliberately avoids rebinding `categories`: the DataFrame
# must survive this cell so that it (and the cell that builds it) can be
# re-run without hitting "list has no attribute iterrows".
tags = {
    tag
    for parsed_tags in categories['clean_categories']
    for tag in parsed_tags
}
tags_df = pd.DataFrame(list(tags), columns=['tags'])

In [6]:
# Second pass: a single entry may pack several tags separated by ',' or ';'.
# Normalise both delimiters to ';', split, trim, and rebuild the tag table.
tags = set()
for raw_tag in tags_df['tags']:
    pieces = raw_tag.replace(',', ';').split(';')
    tags.update(piece.strip() for piece in pieces)
tags_df = pd.DataFrame(list(tags), columns=['tags'])

In [7]:
# Third pass: handle entries carrying a parenthesised annotation such as
# "(Primary)" or "(Secondary)".
tags = set()
for raw_tag in tags_df['tags']:
    # Re-split on ',' / ';' (a no-op for entries that are already atomic).
    for category in (part.strip() for part in raw_tag.replace(',', ';').split(';')):
        if '(' in category and ')' in category:
            # Digit-leading tokens are treated as individual codes.
            numeric_codes = [token for token in category.split() if token[0].isdigit()]
            if numeric_codes:
                tags.update(numeric_codes)
            else:
                # No digit-leading token (e.g. "I.2.10 (Primary)"): previously the
                # whole entry was dropped, losing letter-leading codes. Keep the
                # text with the parenthesised annotation removed instead.
                without_annotation = re.sub(r'\([^)]*\)', ' ', category)
                tags.add(' '.join(without_annotation.split()))
        else:
            tags.add(category)
# Splitting and annotation stripping can leave an empty string behind.
tags.discard('')

In [8]:
# Substrings removed from every tag, applied in exactly this order
# (the parenthesised forms must go before the bare words).
_NOISE_FRAGMENTS = (
    '(Primary)',
    'Primary',
    '(Secondary)',
    '(secondary)',
    'secondary',
    'Secondary',
    'and ',
)

tags_list = []
for tag in tags:
    for fragment in _NOISE_FRAGMENTS:
        tag = tag.replace(fragment, '')
    tags_list.append(tag.strip())
print(tags_list)

['eess.AS', 'K.4.4', '62J12', '68T30', 'q-bio.BM', 'cs.IT', '62H25', '68: computer science', '65Z05', '93E10', '37N99', '91B10', '62J10', 'q-fin.GN', '68P20', '62E10', '35K15', '65M99', '05C62', '51N05', '03D10', '05C60', '15A69', '90Cxx', '47B34', 'D.2.8', '37Nxx', 'math.NT', 'cs.ET', '14M25', 'H.3.3', '94A60', '60J27', 'G.1.0', 'cs.SC', 'physics.data-an', '53B20', 'hep-ph', '05C81', 'math.HO', 'cs.CR', 'physics.chem-ph', 'K.3.8', '14R10', '94A15', 'astro-ph.GA', '55N99', '55U99', '62-02', '62G09', '65N21', '91F20', '14F05', 'physics.med-ph', 'I.2.m', 'cs.DC', '91C20', 'I.2.1 Applications Expert Systems', '34C20', '62M15', 'cs.MM', 'I.1.2', '51K05', '60G60', '68W15', 'cs.SI', 'B.7.2', '65F30', '60J20', '44A12', '35A18', '62F15', 'cs.MS', '14M07', 'quant-ph', 'G.4', '62H99', 'physics.ed-ph', 'I.2.8', '06B99', 'I.6.8', '62H20', '68U10', '2010', '6804', '68U05', 'cs.LG', 'q-fin.CP', '14J26', 'D.4.6', '65Yxx', 'J.1', '62F25', 'E.0', '62C10', '62J07', '92B20', 'nlin.CG', 'I.4.6', '49Q22', 

In [9]:
# Create one empty set per classification bucket.
msc, ccs, arxiv, misc = set(), set(), set(), set()

# (pattern, bucket) rules, tried top to bottom with re.match, which anchors
# only at the start of the tag; the first rule that matches wins.
# NOTE(review): the bare r'\d{2}' rule sends every tag that starts with two
# digits to `msc`, including values such as '2010' - confirm this is intended.
_TAG_RULES = (
    (r'\d{2}[A-Z]\d{2}|\d{2}[A-Z]-xx|\d{2}\w{2,}', msc),
    (r'\d{2}-\d{2}|\d{2}-xx|\d{2}-XX', msc),
    (r'\d{2}', msc),
    (r'[A-Z]\.\d\.\d', ccs),
    (r'[A-Z]\.\d', ccs),
    (r'[A-Z]\d\.\d', ccs),
    (r'\w+\.\w+', arxiv),
    (r'\w+-\w+\.\w+-\w+', arxiv),
    (r'\w+-\w+\.\w+', arxiv),
    (r'\w+-\w+', arxiv),
)

for tag in tags_list:
    for pattern, bucket in _TAG_RULES:
        if re.match(pattern, tag):
            bucket.add(tag)
            break
    else:
        # No rule matched: keep the tag aside for manual inspection.
        misc.add(tag)

In [11]:
# Report how many distinct tags ended up in each classification bucket.
print(f'MSC: {len(msc)}\nCCS: {len(ccs)}\nArXiv: {len(arxiv)}\nMisc: {len(misc)}')

MSC: 453
CCS: 162
ArXiv: 149
Misc: 5


**Contents of `misc`:**
- `Artificial intelligence`: corresponds to 68Txx
- `Computer science applications- 97R60 Computer graphics`: corresponds to 97R60
- `Computing methodologies for image processing`: corresponds to 68U10
- `This paper tells us how human can be identified by their Gait cycle\n  using any simple camera`: dropped
- `org`: dropped

In [12]:
# Rescue the MSC code embedded in the free-text entry
# "Computer science applications- 97R60 Computer graphics".
# NOTE(review): the other codes the notes associate with misc entries
# (68Txx, 68U10) are not added here - confirm they are already in `msc`.
msc.add('97R60')
# Discard the unclassified bucket. Re-running this cell without re-running the
# classification cell first raises NameError, because `misc` no longer exists.
del misc

In [13]:
# Turn each bucket of tags into a single-column DataFrame named after the bucket.
msc_cleaned, ccs_cleaned, arxiv_cleaned = (
    pd.DataFrame(list(bucket), columns=[label])
    for label, bucket in (('msc', msc), ('ccs', ccs), ('arxiv', arxiv))
)

#### Retrieve Descriptions for Each Tag

In [14]:
# Preview the table of MSC tags before matching them against reference codes.
msc_cleaned

Unnamed: 0,msc
0,62J12
1,68T30
2,62H25
3,68: computer science
4,65Z05
...,...
449,78M32
450,47G30
451,47N30
452,03B52


In [20]:
# Reference tables read from CSV files given by relative path.
# The lookup cells below read the `code` column of `msc_codes` and the
# `class` column of `acm_codes`.
msc_codes = pd.read_csv(r'msc_classification.csv')
acm_codes = pd.read_csv(r'acm_classification.csv')

In [21]:
# Print every cleaned MSC tag that has no exact match in the reference table.
known_msc_codes = msc_codes['code'].values
unmatched_msc = [code for code in msc_cleaned['msc'] if code not in known_msc_codes]
for code in unmatched_msc:
    print(code)

68: computer science
14F05
65F30
2010
6804
68T04
68U20
65C60
74S30
62h30
62-07l
65C50
68T05 91E40
6006
10010147.10010257.10010258.10010259.10010263
35CXX
68T45 68T07
2020: 49N45
92B25 92F99
62-07
62-09
97R60
68Tx
97R40


In [22]:
# Print every cleaned CCS tag that has no exact match in the reference table.
known_acm_classes = acm_codes['class'].values
unmatched_ccs = [code for code in ccs_cleaned['ccs'] if code not in known_acm_classes]
for code in unmatched_ccs:
    print(code)

J.2.
G.3.7
I.4.m I.2.7
K.3.8
F.2.4
I.5.2.b
J.2.4
J.3.1
I5.3
I.2.4.j
I.2.7.g
I.2.0.b
J.2.5
C.4.4
I.1.5
I.2.10.f
I.5.4.m
I.2.1 Applications Expert Systems
G.3.11
I.5.4.b
G.2.6
