In [23]:
import pandas as pd 
import numpy as np
import ast
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
import torch
from sentence_transformers import SentenceTransformer, util # type: ignore

In [24]:
# model = SentenceTransformer('msmarco-distilbert-base-tas-b')

In [25]:
# Load the raw arXiv abstracts export into `df`.
# The path uses a forward slash, which is accepted as a separator on Windows,
# Linux and macOS; a backslash is only a separator on Windows.
df = pd.read_csv('arxiv-paper-abstracts/arxiv_data_210930-054931.csv')

In [26]:
def clean_categories(categories):
    """Parse the text of a Python literal sequence and trim each entry.

    Parameters
    ----------
    categories : str
        Source text of a Python literal, e.g. ``"['cs.CV', ' cs.LG ']"``.
        It is evaluated with ``ast.literal_eval``; each resulting element
        must provide a ``strip()`` method.

    Returns
    -------
    list
        The entries in their original order, with leading and trailing
        whitespace removed.

    Any exception raised by ``ast.literal_eval`` for malformed input
    propagates to the caller.
    """
    cleaned = []
    for raw_tag in ast.literal_eval(categories):
        cleaned.append(raw_tag.strip())
    return cleaned

In [27]:
# One row per distinct raw value of the `terms` column in `df`.
unique_terms = list(df['terms'].unique())
categories = pd.DataFrame(unique_terms, columns=['categories'])

# Parse every raw string into a list of whitespace-trimmed tags.
parsed_terms = categories['categories'].apply(clean_categories)
categories['clean_categories'] = parsed_terms

In [28]:
# Gather every distinct tag that appears in any parsed category list.
# A comprehension is used instead of a row loop that rebinds `categories`:
# the `categories` DataFrame therefore stays intact, no loop variables leak
# into the notebook namespace, and the cell can be re-run safely.
tags = {
    tag
    for tag_list in categories['clean_categories']
    for tag in tag_list
}

# One row per distinct tag; row order follows the set's iteration order.
tags_df = pd.DataFrame(list(tags), columns=['tags'])

In [29]:
# Some entries still bundle several codes separated by ',' or ';'.
# Normalise both separators to ';', split, trim, and keep the distinct pieces.
tags = set()
for categories_str in tags_df['tags']:
    pieces = categories_str.replace(',', ';').split(';')
    tags.update(piece.strip() for piece in pieces)

# Rebuild the frame from the split tags (one row per distinct piece).
tags_df = pd.DataFrame(list(tags), columns=['tags'])

In [30]:
# Final pass over the tags: split on ',' / ';' again, and pull the individual
# codes out of entries that carry a parenthesised annotation.
tags = set()
for bundled in tags_df['tags']:
    for category in (part.strip() for part in bundled.replace(',', ';').split(';')):
        if '(' not in category or ')' not in category:
            # No complete pair of parentheses: keep the entry as it is.
            tags.add(category)
            continue
        # Parenthesised entry: keep only the whitespace-separated tokens whose
        # first character is a digit.
        # NOTE(review): every token that does not start with a digit is dropped
        # here, not just the parenthesised marker itself -- confirm that this
        # is intended for codes that begin with a letter.
        tags.update(token.strip() for token in category.split() if token[0].isdigit())

# Splitting can produce empty strings; they are not tags.
tags.discard('')

In [31]:
# Substrings removed from every tag, applied in exactly this order
# (the parenthesised forms come before their bare counterparts).
_MARKER_FRAGMENTS = (
    '(Primary)',
    'Primary',
    '(Secondary)',
    '(secondary)',
    'secondary',
    'Secondary',
    'and ',
)

tags_list = []
for tag in tags:
    for fragment in _MARKER_FRAGMENTS:
        tag = tag.replace(fragment, '')
    # Trim whatever whitespace the removals left at either end.
    tags_list.append(tag.strip())
print(tags_list)

['05C20', '68T07', '91A12', '62M15', '68Q55', '65K05', '62J05', '93C85', 'J.2.4', '62H12', 'math.CA', '49Q10', '93E35', 'G.1.4', '62-XX', '68P99', 'H.5.2', 'D.2.5', '15B48', 'cond-mat.mes-hall', '91A20', '92', '62D20', '65D19', '35CXX', 'J.2', 'cs.CE', '51M20', '62M20', '94A12', 'stat.ML', '53Z50', '68W50', 'physics.data-an', '91-10', '91A60', '62J02', 'quant-ph', '62F25', '93E03', 'H.3.1', 'I.4.0', 'J.7', 'I.2.4', '78M32', '91A06', '74Pxx', '62J10', 'econ.GN', '58J35', '68Tx', '94A34', '90C20', '68T45 68T07', '93E12', 'math.DG', 'math.OC', '55R35', 'math.GR', 'H.4.2', '91E30', 'H.5.0', '68W40', '76D07', '62H22', '14L24', '60G15', 'econ.TH', '35J20', 'I.1.5', '14M15', '60E05', 'cs.CC', '91C20', '68-06', '62C99', 'eess.AS', 'cs.OH', '74A40', '74S30', 'I.4', 'physics.ed-ph', 'I.5.2', 'D.3.4', '62E17', 'E.1', '62G07', 'astro-ph.HE', '62H25', 'math.RA', '35J47', '05C50', '37Nxx', 'H.4.m', 'I.5.0', 'physics.med-ph', 'I.4.m', 'H.5', 'H.2.7', '92C55', '65F50', '68T40', 'hep-ex', 'J.2.', '62H9

In [32]:
# One bucket per classification scheme; these are sets, so duplicates collapse.
msc = set()
ccs = set()
arxiv = set()
misc = set()

# Ordered (pattern, bucket) rules. `re.match` anchors only at the start of the
# tag, and the first rule that matches decides the bucket, so order matters.
_TAG_RULES = (
    (r'\d{2}[A-Z]\d{2}|\d{2}[A-Z]-xx|\d{2}\w{2,}', msc),
    (r'\d{2}-\d{2}|\d{2}-xx|\d{2}-XX', msc),
    (r'\d{2}', msc),
    (r'[A-Z]\.\d\.\d', ccs),
    (r'[A-Z]\.\d', ccs),
    (r'[A-Z]\d\.\d', ccs),
    (r'\w+\.\w+', arxiv),
    (r'\w+-\w+\.\w+-\w+', arxiv),
    (r'\w+-\w+\.\w+', arxiv),
    (r'\w+-\w+', arxiv),
)

for tag in tags_list:
    # Tags that match no rule fall through to `misc`.
    bucket = next(
        (target for pattern, target in _TAG_RULES if re.match(pattern, tag)),
        misc,
    )
    bucket.add(tag)

In [33]:
# Report how many distinct tags ended up in each bucket, one bucket per line.
summary = f'MSC: {len(msc)}\nCCS: {len(ccs)}\nArXiv: {len(arxiv)}\nMisc: {len(misc)}'
print(summary)

MSC: 453
CCS: 162
ArXiv: 149
Misc: 5


In [34]:
# '97R60' is added by hand.
# NOTE(review): presumably a code the parsing above failed to extract -- confirm.
msc.add('97R60')

# Remove the `misc` bucket from the namespace. The delete is guarded so that
# running this cell a second time does not fail with NameError once `misc`
# is already gone.
try:
    del misc
except NameError:
    pass

In [35]:
# Turn each bucket into a single-column frame.
# The codes are sorted first: iteration order of a set of strings can differ
# from one interpreter run to the next, which would make the row order (and
# any file written from these frames) non-reproducible.
msc_cleaned = pd.DataFrame(sorted(msc), columns=['msc'])
ccs_cleaned = pd.DataFrame(sorted(ccs), columns=['ccs'])
arxiv_cleaned = pd.DataFrame(sorted(arxiv), columns=['arxiv'])

In [36]:
# Load the three reference tables, in this order, from the working directory.
msc_codes, acm_codes, arxiv_codes = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'msc_classification.csv',
        r'acm_classification.csv',
        r'arxiv_classification.csv',
    )
)

In [41]:
def _write_missing_codes(fp, header, found_codes, known_codes):
    """Write `header` to `fp`, then each code from `found_codes` that is not
    present in `known_codes`, one code per line.

    Parameters
    ----------
    fp : writable text file object
    header : str
        Text written verbatim before the codes (includes its own newlines).
    found_codes : iterable
        Codes extracted from the data set.
    known_codes : iterable
        Codes listed in the reference table.
    """
    # Build the lookup once: constant-time membership tests instead of a scan
    # of the whole reference column for every code.
    known = set(known_codes)
    fp.write(header)
    fp.writelines(f'{code}\n' for code in found_codes if code not in known)


# List, per classification scheme, the codes that have no entry in the
# corresponding reference table. The encoding is set explicitly so the file
# does not depend on the platform default.
with open(r'missing_descriptions.txt', 'w', encoding='utf-8') as fp:
    _write_missing_codes(fp, 'MSC\n', msc_cleaned['msc'], msc_codes['code'])
    _write_missing_codes(fp, '\nCCS\n', ccs_cleaned['ccs'], acm_codes['class'])
    _write_missing_codes(fp, '\n\nArXiv\n', arxiv_cleaned['arxiv'], arxiv_codes['class_code'])