In [None]:
import pandas as pd 
import numpy as np
import ast
import re
from bs4 import BeautifulSoup
import PyPDF2 # type: ignore
from sentence_transformers import SentenceTransformer, util # type: ignore

In [None]:
# Instantiate the SentenceTransformer identified by the name 'all-mpnet-base-v2'.
# NOTE(review): `model` is not referenced by any of the cells visible in this
# part of the notebook; presumably it is used further down.
model = SentenceTransformer('all-mpnet-base-v2')

#### Tags Text Cleaning

In [None]:
def clean_categories(categories):
    """Parse a stringified list of category tags and trim every tag.

    Parameters
    ----------
    categories : str
        Python-literal text of a sequence of strings, for example
        ``"['cs.CV', ' cs.LG ']"``. It is evaluated with
        ``ast.literal_eval``, so only literals are accepted.

    Returns
    -------
    list
        The parsed tags, each with leading and trailing whitespace removed.
    """
    cleaned = []
    for raw_tag in ast.literal_eval(categories):
        cleaned.append(raw_tag.strip())
    return cleaned

In [None]:
# Load the arXiv abstracts dataset. Forward slashes keep this relative path
# valid on both Windows and POSIX systems.
df = pd.read_csv('arxiv-paper-abstracts/arxiv_data_210930-054931.csv')

# One row per distinct raw `terms` string, plus its parsed list of tags.
# `categories` stays a DataFrame for the whole cell (it is no longer rebound
# to a plain list inside a loop).
categories = pd.DataFrame(list(df['terms'].unique()), columns=['categories'])
categories['clean_categories'] = categories['categories'].apply(clean_categories)

# Stage 1: every distinct tag that appears in any parsed list.
raw_tags = {
    tag
    for tag_list in categories['clean_categories']
    for tag in tag_list
}

# Stage 2: one raw tag may bundle several codes separated by ',' or ';'.
split_tags = {
    piece.strip()
    for raw_tag in raw_tags
    for piece in raw_tag.replace(',', ';').split(';')
}
tags_df = pd.DataFrame(list(split_tags), columns=['tags'])

# Stage 3: a piece that contains both '(' and ')' (for example a code followed
# by a parenthesised annotation) is reduced to its whitespace-separated tokens
# that start with a digit; every other piece is kept unchanged.
tags = set()
for piece in split_tags:
    if '(' in piece and ')' in piece:
        tags.update(token for token in piece.split() if token[0].isdigit())
    else:
        tags.add(piece)

# Splitting on separators can leave an empty piece behind; drop it.
tags.discard('')


In [None]:
# Remove the "Primary"/"Secondary" role markers and the connective "and "
# from every tag, then trim surrounding whitespace.
#
# The markers are matched case-insensitively so that "primary"/"(primary)" are
# handled the same way as "secondary"/"(secondary)". The parenthesised form is
# listed first so that the brackets are removed together with the word.
role_marker = re.compile(r'\((?:primary|secondary)\)|primary|secondary', re.IGNORECASE)
# Only the standalone word "and " is removed; a plain substring replacement
# would also eat the end of words such as "band ".
connective = re.compile(r'\band ')

tags_list = []
# Iterate in sorted order so the printed list is the same on every run
# (set iteration order for strings is not stable between interpreter runs).
for tag in sorted(tags):
    tag = role_marker.sub('', tag)
    tag = connective.sub('', tag)
    tags_list.append(tag.strip())
print(tags_list)

In [None]:
# One empty set per classification scheme; `misc` receives every tag that
# matches none of the patterns below.
msc = set()
ccs = set()
arxiv = set()
misc = set()

# Ordered (pattern, destination set) rules. `re.match` anchors at the start of
# the tag and the first matching rule wins, so the order is significant.
# Note that the bare r'\d{2}' rule sends every tag that begins with two digits
# to `msc`.
classification_rules = (
    (r'\d{2}[A-Z]\d{2}|\d{2}[A-Z]-xx|\d{2}\w{2,}', msc),
    (r'\d{2}-\d{2}|\d{2}-xx|\d{2}-XX', msc),
    (r'\d{2}', msc),
    (r'[A-Z]\.\d\.\d', ccs),
    (r'[A-Z]\.\d', ccs),
    (r'[A-Z]\d\.\d', ccs),
    (r'\w+\.\w+', arxiv),
    (r'\w+-\w+\.\w+-\w+', arxiv),
    (r'\w+-\w+\.\w+', arxiv),
    (r'\w+-\w+', arxiv),
)

for tag in tags_list:
    destination = next(
        (bucket for pattern, bucket in classification_rules if re.match(pattern, tag)),
        misc,
    )
    destination.add(tag)


In [None]:
# Report how many distinct tags ended up in each of the four sets.
print(f'MSC: {len(msc)}\nCCS: {len(ccs)}\nArXiv: {len(arxiv)}\nMisc: {len(misc)}')

**Contents of misc:**
- 'Artificial intelligence': corresponds to 68Txx
- 'Computer science applications- 97R60 Computer graphics': corresponds to 97R60
- 'Computing methodologies for image processing': corresponds to 68U10
- 'This paper tells us how human can be identified by their Gait cycle\n  using any simple camera': dropped
- 'org': dropped

In [None]:
# Add the code 97R60 to the MSC set by hand.
msc.add('97R60')
# Remove the `misc` name from the namespace. After this statement any cell
# that reads `misc` (including a second run of this cell) raises NameError.
del misc

In [None]:
# Build one single-column frame per classification scheme.
# The sets are sorted first: iteration order of a set of strings changes
# between interpreter runs, so sorting keeps the row order (and therefore every
# displayed output that follows) reproducible.
msc_cleaned = pd.DataFrame(sorted(msc), columns=['msc'])
ccs_cleaned = pd.DataFrame(sorted(ccs), columns=['ccs'])
arxiv_cleaned = pd.DataFrame(sorted(arxiv), columns=['arxiv'])

#### Retrieve Descriptions for Each Tag

In [None]:
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory.
msc_html = r'C:\Users\User\Desktop\ml_paper_recommender_system\paper-recommender\Classification Codes Data\MSC Classification Codes.html'

# Parse the MSC Classification Codes HTML file. The file is opened in binary
# mode so that BeautifulSoup detects the document encoding itself instead of
# depending on the platform's default text encoding.
with open(msc_html, 'rb') as fp:
    msc_soup = BeautifulSoup(fp, 'html.parser')

# Text content of every <li> element, in document order.
msc_tree = [element.text for element in msc_soup.find_all('li')]

# Dictionary of MSC classes: the key is the first line of a main-class entry
# ("NN-xx: ..."), the value is the list of entries that belong to that class.
msc_classes = {}
for cls in msc_tree:
    if re.match(r'^\d\d-xx:', cls):
        msc_classes[cls.split('\n')[0]] = []

# Index the class keys by their two-character prefix so that every entry is
# routed with a dictionary lookup rather than a scan over all classes.
keys_by_prefix = {}
for key in msc_classes:
    keys_by_prefix.setdefault(key[0:2], []).append(key)

# Attach each entry to the class (or classes) sharing its first two characters.
for element in msc_tree:
    for key in keys_by_prefix.get(element[0:2], ()):
        msc_classes[key].append(element)

# Drop the first entry collected for every class.
# NOTE(review): presumably this is the main-class <li> itself, whose text also
# contains all of its nested entries - confirm against the HTML.
for entries in msc_classes.values():
    entries.pop(0)

# Create a dataframe from the dictionary
msc_df = pd.DataFrame(msc_classes.items(), columns=['class', 'sub_classes'])

In [None]:
# Entries whose text starts with digits, a capital letter and "xx" (section
# headers such as "68Txx") are cut down to their first line. The lists stored
# in the frame are modified in place.
section_header = r'\d+\d+[A-Z]xx'
for entries in msc_df['sub_classes']:
    for position, entry in enumerate(entries):
        if re.match(section_header, entry):
            entries[position] = entry.split('\n')[0]

In [None]:
# Prefix every entry with the name of its main class, separated by one space.
# The lists are updated in place, so running this cell a second time adds the
# prefix a second time.
for curr_cls, entries in zip(msc_df['class'], msc_df['sub_classes']):
    for position in range(len(entries)):
        entries[position] = curr_cls + ' ' + entries[position]

In [None]:
# Keep only the part of each MSC tag that precedes the first ':' (tags without
# a colon are left unchanged). The whole column is assigned at once: writing
# through chained indexing (`frame[col][row] = value`) may modify a temporary
# copy and leave the frame untouched.
msc_cleaned['msc'] = msc_cleaned['msc'].map(lambda tag: tag.split(':')[0])

In [None]:
# Add a `description` column with every row set to None; the following cells
# fill it in for the codes they can resolve.
msc_cleaned['description'] = None

In [None]:
# Flatten all class entries once, keeping class order and entry order.
msc_entries = [entry for entries in msc_df['sub_classes'] for entry in entries]

# For every tag, store the LAST entry whose text contains the tag (the same
# result as overwriting on every hit while scanning forwards). Rows without a
# hit keep their current value.
# NOTE(review): `tag in entry` is a plain substring test, so a short tag can
# also hit an entry that merely mentions it - confirm this is acceptable.
for row_label, tag in zip(msc_cleaned.index, msc_cleaned['msc']):
    match = next((entry for entry in reversed(msc_entries) if tag in entry), None)
    if match is not None:
        # `.at` writes straight into the frame; chained indexing
        # (`frame[col][row] = value`) may write to a temporary copy instead.
        msc_cleaned.at[row_label, 'description'] = match

In [None]:
# Collect the MSC codes whose description is still None and report the count.
missing_msc = [
    code
    for code, description in zip(msc_cleaned['msc'], msc_cleaned['description'])
    if description is None
]
print(f'Status:\n{len(missing_msc)} out of {len(msc_cleaned)} tags are missing a description')

In [None]:
# Collect the text of every page of the MSC 2020 PDF that mentions at least
# one of the codes still missing a description.
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory.
reader = PyPDF2.PdfReader(r'C:\Users\User\Desktop\ml_paper_recommender_system\paper-recommender\Classification Codes Data\msc2020.pdf')
missing_corpus = []
for page in reader.pages:
    # Fall back to an empty string if no text could be extracted.
    text = page.extract_text() or ''
    # Store a page once, however many codes it mentions; appending it once per
    # matching code only produced duplicate pages to scan later.
    if any(code in text for code in missing_msc):
        missing_corpus.append(text)

In [None]:
# Replace every page text by its list of lines, updating the list in place.
missing_corpus[:] = [page_text.split('\n') for page_text in missing_corpus]

In [None]:
# Candidate descriptions: one "code: line" string for every line of the
# collected pages that contains a missing code. A set removes repeats.
missing_descriptions = {
    f'{code}: {line}'
    for code in missing_msc
    for text in missing_corpus
    for line in text
    if code in line
}

In [None]:
# Inspected manually because data quality right?
#
# Hand-checked "CODE: description" entries for the MSC codes that were still
# missing a description. Compared with the raw candidates, two cross-reference
# lines were removed because they attached the text of a DIFFERENT code to the
# key ('60B20: 15B52 Random matrices (algebraic aspects) ...' and
# '49Q22: 35Q49 Transport equations ...'); the latter came after the correct
# 49Q22 entry and therefore overwrote it. Exact duplicates (60B20, 62-08) were
# removed as well.
# NOTE(review): the 65M22 description ends with "involving " - it looks
# truncated and should be completed from the source document.
missing_descriptions = [
 '60B20: 60B20 Random matrices (probabilistic aspects)',
 '62H22: 62H22 Probabilistic graphical models',
 '97P30: 97P30 Systems, databases (educational aspects)',
 '05C81: 05C81 Random walks on graphs',
 '62B86: 62B86 Statistical aspects of fuzziness, sufficiency, and information',
 '68T42: 68T42 Agent technology and artificial intelligence',
 '97N80: 97N80 Mathematical software, computer programs (educational aspects)',
 '60L10: 60L10 Signatures and data streams',
 '35Q68: 35Q68 PDEs in connection with computer science',
 '68T07: 68T07 Artificial neural networks and deep learning',
 '60L20: 60L20 Rough paths',
 '62-08: 62-08 Computational methods for problems pertaining to statistics',
 '35Q79: 35Q79 PDEs in connection with classical thermodynamics and heat transfer',
 '15B48: 15B48 Positive matrices and their generalizations; cones of matrices',
 '68W50: 68W50 Evolutionary algorithms, genetic algorithms (computational aspects)',
 '62-XX: 62-XX Statistics',
 '65Zxx: 65Zxx Applications to the sciences',
 '62G86: 62G86 Nonparametric inference and fuzziness',
 '62R40: 62R40 Topological data analysis',
 '05C21: 05C21 Flows in graphs',
 "35J08: 35J08 Green's functions for elliptic equations",
 '91-10: 91-10 Mathematical modeling or simulation for problems pertaining to game theory, economics, and Finance',
 '90-05: 90-05 Experimental work for problems pertaining to operations research and mathematical programming',
 '68W27: 68W27 Online algorithms; streaming algorithms',
 '14T10: 14T10 Foundations of tropical geometry and relations with algebra',
 '62D20: 62D20 Causal inference from observational studies',
 '49Q22: 49Q22 Optimal transportation',
 '35R02: 35R02 PDEs on graphs and networks (ramified or polygonal spaces)',
 '57-08: 57-08 Computational methods for problems pertaining to manifolds and cell complexes',
 '65F55: 65F55 Numerical methods for low-rank matrix approximation; matrix compression',
 '90C17: 90C17 Robustness in mathematical programming',
 '53Z50: 53Z50 Applications of differential geometry to data and computer science',
 '35J47: 35J47 Second-order elliptic systems',
 '62R07: 62R07 Statistical aspects of big data and data science',
 '15A83: 15A83 Matrix completion problems',
 '91G80: 91G80 Financial applications of other theories',
 '54H30: 54H30 Applications of general topology to computer science (e.g., digital topology, image processing)',
 '35K08: 35K08 Heat kernel',
 '90-10: 90-10 Mathematical modeling or simulation for problems pertaining to operations research and mathematical programming',
 '78M32: 78M32 Neural and heuristic methods applied to problems in optics and electromagnetic theory',
 '62A09: 62A09 Graphical methods in statistics',
 '35Q74: 35Q74 PDEs in connection with mechanics of deformable solids',
 '55N31: 55N31 Persistent homology and applications, topological data analysis',
 '68-XX: 68-XX Computer science',
 '65D19: 65D19 Computational issues in computer and robotic vision',
 '68M25: 68M25 Computer security',
 '92-XX: 92-XX Biology and other natural sciences',
 '35-XX: 35-XX Partial differential equations',
 '68Q87: 68Q87 Probability in computer science (algorithm analysis, random structures, phase transitions, etc.)',
 '91G70: 91G70 Statistical methods; risk measures [See also 62P05, 62P20]',
 '62R01: 62R01 Algebraic statistics',
 '49M41: 49M41 PDE constrained optimization (numerical aspects)',
 '57Z25: 57Z25 Relations of manifolds and cell complexes with computer and data science',
 '49-11: 49-11 Research data for problems pertaining to calculus of variations and optimal control',
 '35Q84: 35Q84 Fokker-Planck equations',
 '42-08: 42-08 Computational methods for problems pertaining to harmonic analysis on Euclidean spaces',
 '68T09: 68T09 Computational aspects of data analysis and big data',
 '65M22: 65M22 Numerical solution of discretized equations for initial value and initial-boundary value problems involving ']

# Turn every entry into a [code, description] pair. Splitting at the FIRST ':'
# only keeps a description intact even if it contains a colon itself.
missing_descriptions = [entry.split(':', 1) for entry in missing_descriptions]

# Write each description to the row(s) holding that code. A boolean mask with
# `.loc` updates the frame itself; chained indexing
# (`frame[col][row] = value`) may write to a temporary copy instead.
for code, description in missing_descriptions:
    msc_cleaned.loc[msc_cleaned['msc'] == code, 'description'] = description.strip()
    

In [None]:
# Split and add misplaced string back to ACM codes
print(f'Before: {len(ccs_cleaned)}')
acm_misplaced_string = '10010147.10010257.10010258.10010259.10010263'.split('.')

# Put the ids at the top of the frame, last id first (the order that inserting
# them one at a time at the top produces). Ids that are already present are
# skipped, so running this cell again does not add them twice. The loop
# variable is no longer called `str`, which used to shadow the builtin for the
# rest of the notebook.
existing_codes = set(ccs_cleaned['ccs'])
new_codes = [code for code in reversed(acm_misplaced_string) if code not in existing_codes]
if new_codes:
    ccs_cleaned = pd.concat(
        [pd.DataFrame({'ccs': new_codes}), ccs_cleaned],
        ignore_index=True,
    )
print(f'After: {len(ccs_cleaned)}')

In [None]:
# Codes to re-check, kept as a hard-coded list.
# NOTE(review): presumably a snapshot of the MSC codes that had no description
# at the time of writing - confirm it is still up to date.
rem = ['60-06', '68T05', '91E40', '92B25', '92F99', '35CXX', '97R60', '68-04',
       '68T45', '68T07', '62h30', '97R40', '62-07', '68T04']

# For every code report whether it occurs in `msc_cleaned`, show the
# description of its first matching row, and remember the codes that already
# have one.
known_codes = set(msc_cleaned['msc'])
found_rem = []
for r in rem:
    if r in known_codes:
        print(f'Found: {r}')
        desc = msc_cleaned[msc_cleaned['msc'] == r]['description'].values[0]
        print(f'Description: {desc}')
        if desc is not None:
            found_rem.append(r)
    else:
        print(f'Not found: {r}')

In [None]:
# Keep only the codes that were not recorded in `found_rem`, then display them.
already_described = set(found_rem)
rem = list(set(rem) - already_described)
rem

In [None]:
# TODO: Descriptions should have main class >> sub class >> sub sub class

In [None]:
# Don't append before finishing TODO above
extra_descriptions = [['97R60', '97-XX Mathematics education'],
                      ['92B25', '92-XX Biology and other natural sciences 92Bxx Mathematical biology in general 92B25 Biological rhythms and synchronization'],
                      ['68T04', '68-XX Computer science 68Txx Artificial intelligence 68T05 Learning and adaptive systems in artificial intelligence'],
                      ['97R40', '97-XX Mathematics education'],
                      ['92F99', '92-XX Biology and other natural sciences 92Fxx Other natural sciences (mathematical treatment)'],
                      ['62H30', '62-XX Statistics 62Hxx Multivariate analysis  62H30 Classification and discrimination; cluster analysis (statistical aspects); mixture models'],
                      ['35CXX', '35Cxx Representations of solutions to partial differential equations']]

# `extend` adds every [code, description] pair as its own entry, the same shape
# as the pairs already in the list; `append` would store all seven pairs as one
# nested element. Pairs that are already present are skipped so that running
# this cell again does not duplicate them.
pairs_to_add = [pair for pair in extra_descriptions if pair not in missing_descriptions]
missing_descriptions.extend(pairs_to_add)

In [None]:
# Display the current contents of `missing_descriptions` for inspection.
missing_descriptions