In [3]:
import pandas as pd 
import numpy as np
import ast
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
import PyPDF2 # type: ignore
from sentence_transformers import SentenceTransformer, util # type: ignore

In [4]:
model = SentenceTransformer('msmarco-distilbert-base-tas-b')

#### Embedding Demo

In [5]:
def clean_categories(categories):
    tags = [tag.strip() for tag in ast.literal_eval(categories)]
    return tags

In [6]:
df = pd.read_csv(r'arxiv-paper-abstracts\arxiv_data_210930-054931.csv')
categories = pd.DataFrame(list(df['terms'].unique()),columns=['categories'])
categories['clean_categories'] = categories['categories'].apply(clean_categories)
tags = set()
for index, row in categories.iterrows():
    categories = row['clean_categories']
    for category in categories:
        tags.add(category)
tags_df = pd.DataFrame(list(tags), columns=['tags'])
tags = set()
for index, row in tags_df.iterrows():
    categories_str = row['tags']
    categories_list = [category.strip() for category in categories_str.replace(',', ';').split(';')]
    for category in categories_list:
        tags.add(category)
tags_df = pd.DataFrame(list(tags), columns=['tags'])
tags = set()
for index, row in tags_df.iterrows():
    categories_str = row['tags']
    categories_list = [category.strip() for category in categories_str.replace(',', ';').split(';')]
    for category in categories_list:
        if '(' in category and ')' in category:
            subcategories = [subcat.strip() for subcat in category.split() if subcat[0].isdigit()]
            for subcategory in subcategories:
                tags.add(subcategory)
        else:
            tags.add(category)
tags.discard('')

In [7]:
abstracts = df['abstracts']

In [10]:
abstracts_embedding = []
for abstract in tqdm(abstracts):
    abstracts_embedding.append(model.encode(abstract))

 18%|█▊        | 10148/56181 [53:17<7:31:01,  1.70it/s]

In [None]:
query = 'computer vision uses in mathematics'

In [None]:
query_emb = model.encode(query)


### Tag Cleaning and Embeddings

In [4]:
tags_list = []
for tag in tags:
    tag = tag.replace('(Primary)', '')
    tag = tag.replace('Primary', '')
    tag = tag.replace('(Secondary)', '')
    tag = tag.replace('(secondary)', '')
    tag = tag.replace('secondary', '')
    tag = tag.replace('Secondary', '')
    tag = tag.replace('and ', '')
    tags_list.append(tag.strip())
print(tags_list)

['62J99', 'J.2', '46N30', '62', '62G99', '2010', '92B25 92F99', 'F.2.1', '68T05 91E40', '90C40', '65T60', '68Q42', '65L07', '62-XX', 'cs.CC', '10010147.10010257.10010258.10010259.10010263', '62G20', 'physics.ins-det', 'cs.DS', '62M40', 'cs.CV', '91Bxx', '93B35', '60J27', '91A20', 'nucl-ex', '65C50', '60K35', '34H05', '91A80', 'J.7', '68T07', 'physics.acc-ph', '62B10', '57Z25', '92', '68T37', '60G99', 'G.2.6', '82C32', 'math.GN', 'H.3.4', '15B48', '41A65', '65Z05', 'econ.TH', 'H.5', 'cs.SY', 'econ.GN', 'I.3.5', 'E.1', '65N06', '94', '54H30', '35J20', '68T45 68T07', 'F.2.2', 'hep-lat', 'cs.MA', '14J60', '60G15', '65F22', '91B10', '81Pxx', '60H99', 'I.4.10', '62P12', 'adap-org', '91A06', '46M40', 'q-fin.TR', '49', '91A60', 'H.4', '93E20', '65R32', 'cs.CG', 'nucl-th', '65T50', '05A16', '15A29', '06A15', 'E.0', '91G70', '65J22', 'I.2', '35Q68', '41A05', '49Q10', 'I.4.6', '62R07', 'K.3.m', 'G.1.8', '68W40', '68W27', '35J08', '60G10', '62J10', '11Z05', 'J.0', '76T99', '65Kxx', 'H.2.7', '49M41

In [5]:
# create empty lists for each classification
msc = set()
ccs = set()
arxiv = set()
misc = set()

for tag in tags_list:
    if re.match(r'\d{2}[A-Z]\d{2}|\d{2}[A-Z]-xx|\d{2}\w{2,}', tag):
        msc.add(tag)
    elif re.match(r'\d{2}-\d{2}|\d{2}-xx|\d{2}-XX', tag):
        msc.add(tag)
    elif re.match(r'\d{2}', tag):
        msc.add(tag)
    elif re.match(r'[A-Z]\.\d\.\d', tag):
        ccs.add(tag)
    elif re.match(r'[A-Z]\.\d', tag):
        ccs.add(tag)
    elif re.match(r'[A-Z]\d\.\d', tag):
        ccs.add(tag)
    elif re.match(r'\w+\.\w+', tag):
        arxiv.add(tag)
    elif re.match(r'\w+-\w+\.\w+-\w+', tag):
        arxiv.add(tag)
    elif re.match(r'\w+-\w+\.\w+', tag):
        arxiv.add(tag)
    elif re.match(r'\w+-\w+', tag):
        arxiv.add(tag)
    else:
        misc.add(tag)

In [6]:
print(f'MSC: {len(msc)}\nCCS: {len(ccs)}\nArXiv: {len(arxiv)}\nMisc: {len(misc)}')

MSC: 453
CCS: 162
ArXiv: 149
Misc: 5


**Contents of misc:**
- Artificial intelligence': corresponds to 68Txx
- Computer science applications- 97R60 Computer graphics': corresponds to 97R60
- Computing methodologies for image processing': corresponds to 68U10
- This paper tells us how human can be identified by their Gait cycle\n  using any simple camera': dropped
- org: dropped

In [7]:
msc.add('97R60')
del misc

In [8]:
msc_cleaned = pd.DataFrame(list(msc), columns=['msc'])
ccs_cleaned = pd.DataFrame(list(ccs), columns=['ccs'])
arxiv_cleaned = pd.DataFrame(list(arxiv), columns=['arxiv'])

#### Retrieve Descriptions for Each Tag

In [9]:
msc_html = r'C:\Users\User\Desktop\ml_paper_recommender_system\paper-recommender\Classification Codes Data\MSC Classification Codes.html'

# Parse MSC Classification Codes html file
with open(msc_html) as fp:
    msc_soup = BeautifulSoup(fp, 'html.parser')

# Extract MSC Classification Codes
msc_tree = [element.text for element in msc_soup.find_all('li')]

# Define Dictionary for MSC Classification Codes where the key is the main class and the value is a list of sub-classes
msc_classes = {}

# Populate the dictionary
for cls in msc_tree:
    if re.match(r'^\d\d-xx:', cls):
        msc_classes[cls.split('\n')[0]] = []

for element in msc_tree:
    for cls in msc_classes.keys():
        if cls[0:2] == element[0:2]:
            msc_classes[cls].append(element)

#for cls in msc_classes.keys():
#    msc_classes[cls].pop(0)

# Create a dataframe from the dictionary
msc_df = pd.DataFrame(msc_classes.items(),columns=['class','sub_classes'])

In [10]:
cls = []
cls_str = []
sub_cls = []
sub_cls_str = []
sub_sub_cls = []
sub_sub_cls_str = []
for idx, sub_class in enumerate(msc_df['sub_classes'].values):
    for sub_sub in sub_class:
        if re.match(r'\d\d-\d\d', sub_sub):
            cls.append(sub_sub.split(':')[0])
            cls_str.append(sub_sub.split(':')[1].strip())
        if re.match(r'\d\d\w\w\w', sub_sub):
            sub_cls.append(sub_sub.split(':')[0])
            sub_cls_str.append(sub_sub.split(':')[1].strip())
        if re.match(r'\d\d-\w+\w+', sub_sub):
            sub_cls.append(sub_sub.split(':')[0])
            sub_cls_str.append(sub_sub.split(':')[1].strip())
        if re.match(r'\d\d\w\d\d', sub_sub):
            sub_sub_cls.append(sub_sub.split(':')[0])
            sub_sub_cls_str.append(sub_sub.split(':')[1].strip())
print(f'Classes: {len(cls)} {len(cls_str)}, Sub-Classes: {len(sub_cls)} {len(sub_cls_str)}, Sub-Sub-Classes: {len(sub_sub_cls)} {len(sub_sub_cls_str)}')

cls_comb = zip(cls, cls_str)
cls_comb = list(cls_comb)
sub_cls_comb = zip(sub_cls, sub_cls_str)
sub_cls_comb = list(sub_cls_comb)
sub_sub_cls_comb = zip(sub_sub_cls, sub_sub_cls_str)
sub_sub_cls_comb = list(sub_sub_cls_comb)
codes_comb = cls_comb + sub_cls_comb + sub_sub_cls_comb
codes_comb = list(codes_comb)
msc_index = pd.DataFrame(codes_comb, columns=['code', 'description'])

Classes: 397 397, Sub-Classes: 5531 5531, Sub-Sub-Classes: 4647 4647


In [11]:
msc_cleaned

Unnamed: 0,msc
0,62J99
1,46N30
2,62
3,62G99
4,2010
...,...
449,68W25
450,62M02
451,65R30
452,42B35


In [12]:
dataset_codes = list(msc_cleaned.msc.values)
missing_codes = []
for code in dataset_codes:
    if code not in list(msc_index.code.values):
        missing_codes.append(code)

for idx, code in enumerate(missing_codes):
    missing_codes[idx] = missing_codes[idx].split(' ')
new_list = []
for code in missing_codes:
    for element in code:
        if not isinstance(element, list):
            new_list.append([element])
missing_codes = [item for sublist in new_list for item in sublist]
for idx, code in enumerate(missing_codes):
    if code not in list(msc_index.code.values):
        missing_codes.remove(code)
print(f'Remaining Missing Codes: {len(missing_codes)}')

Remaining Missing Codes: 47


In [13]:
missing_codes

['2010',
 '92F99',
 '68T05',
 '91E40',
 '62-XX',
 '97R60',
 '57Z25',
 '15B48',
 '54H30',
 '68T45',
 '49',
 '35Q68',
 '68W27',
 '49M41',
 '35CXX',
 '62-07l',
 '62R40',
 '35-XX',
 '62-08',
 '2020:',
 '49N45',
 '6006',
 '05C81',
 '65Zxx',
 '97N80',
 '53Z50',
 '91-10',
 '90C17',
 '62h30',
 '91G80',
 '68:',
 'science',
 '55N31',
 '78M32',
 '35K08',
 '60L10',
 '62G86',
 '97R40',
 '62D20',
 '05C21',
 '68W50',
 '97P30',
 '68',
 '93',
 '42-08',
 '68M25',
 '35J47']

In [78]:
msc_cleaned['description'] = None

In [14]:
correct = []
incorrect = []
for code in missing_codes:
    if re.match(r'\d\d-\d\d', code):
        correct.append(code)
    elif re.match(r'\d\d\w\w\w', code):
        correct.append(code)
    elif re.match(r'\d\d\w\d\d', code):
        correct.append(code)
    elif re.match(r'\d\d-\w\w', code):
        correct.append(code)
    else:
        incorrect.append(code)
incorrect = ['68Txx', '78-00', '60-06', '93-00', '62-00', '90-00']
correct = correct + incorrect
correct

['92F99',
 '68T05',
 '91E40',
 '62-XX',
 '97R60',
 '57Z25',
 '15B48',
 '54H30',
 '68T45',
 '35Q68',
 '68W27',
 '49M41',
 '35CXX',
 '62-07l',
 '62R40',
 '35-XX',
 '62-08',
 '49N45',
 '05C81',
 '65Zxx',
 '97N80',
 '53Z50',
 '91-10',
 '90C17',
 '62h30',
 '91G80',
 '55N31',
 '78M32',
 '35K08',
 '60L10',
 '62G86',
 '97R40',
 '62D20',
 '05C21',
 '68W50',
 '97P30',
 '42-08',
 '68M25',
 '35J47',
 '68Txx',
 '78-00',
 '60-06',
 '93-00',
 '62-00',
 '90-00']

In [18]:
for code in msc_index[msc_index.code.isin(correct)].code:
    if code in correct:
        correct.remove(code)

In [25]:
correct

['92F99',
 '62-XX',
 '97R60',
 '57Z25',
 '15B48',
 '54H30',
 '35Q68',
 '68W27',
 '49M41',
 '35CXX',
 '62-07l',
 '62R40',
 '35-XX',
 '62-08',
 '05C81',
 '65Zxx',
 '97N80',
 '53Z50',
 '91-10',
 '90C17',
 '62h30',
 '91G80',
 '55N31',
 '78M32',
 '35K08',
 '60L10',
 '62G86',
 '97R40',
 '62D20',
 '05C21',
 '68W50',
 '97P30',
 '42-08',
 '68M25',
 '35J47']

In [21]:
# Extract Description from MSC Classification Codes 2020
reader = PyPDF2.PdfReader(r'C:\Users\User\Desktop\ml_paper_recommender_system\paper-recommender\Classification Codes Data\msc2020.pdf')
missing_corpus = []
for idx, page in enumerate(reader.pages):
    text = page.extract_text()
    for idx, code in enumerate(correct):
        if code in text:
            missing_corpus.append(text)

In [22]:
for idx, text in enumerate(missing_corpus):
    missing_corpus[idx] = text.split('\n')

In [23]:
missing_descriptions = set()
for code in missing_codes:
    for idx, text in enumerate(missing_corpus):
        for line in text:
            if code in line:
                missing_descriptions.add(line)

In [32]:
msc_cleaned

Unnamed: 0,msc
0,62J99
1,46N30
2,62
3,62G99
4,2010
...,...
449,68W25
450,62M02
451,65R30
452,42B35


In [33]:
# Inspected manually because data quality right?
missing_descriptions = [
 '60B20: Random matrices (probabilistic aspects)',
 '62H22: Probabilistic graphical models',
 '97P30: Systems, databases (educational aspects)',
 '05C81: Random walks on graphs',
 '62B86: Statistical aspects of fuzziness, sufficiency, and information',
 '68T42: Agent technology and artificial intelligence',
 '97N80: Mathematical software, computer programs (educational aspects)',
 '60L10: Signatures and data streams',
 '35Q68: PDEs in connection with computer science',
 '68T07: Artificial neural networks and deep learning',
 '60L20: Rough paths',
 '62-08: Computational methods for problems pertaining to statistics',
 '35Q79: PDEs in connection with classical thermodynamics and heat transfer',
 '15B48: Positive matrices and their generalizations; cones of matrices',
 '68W50: Evolutionary algorithms, genetic algorithms (computational aspects)',
 '62-XX: Statistics',
 '65Zxx: Applications to the sciences',
 '62G86: Nonparametric inference and fuzziness',
 '62R40: Topological data analysis',
 '05C21: Flows in graphs',
 "35J08: Green's functions for elliptic equations",
 '91-10: Mathematical modeling or simulation for problems pertaining to game theory, economics, and Finance',
 '90-05: Experimental work for problems pertaining to operations research and mathematical programming',
 '68W27: Online algorithms; streaming algorithms',
 '14T10: Foundations of tropical geometry and relations with algebra',
 '62D20: Causal inference from observational studies',
 '49Q22: Optimal transportation',
 '35R02: PDEs on graphs and networks (ramified or polygonal spaces)',
 '57-08: Computational methods for problems pertaining to manifolds and cell complexes',
 '65F55: Numerical methods for low-rank matrix approximation; matrix compression',
 '90C17: Robustness in mathematical programming',
 '53Z50: Applications of differential geometry to data and computer science',
 '35J47: Second-order elliptic systems',
 '62R07: Statistical aspects of big data and data science',
 '15A83: Matrix completion problems',
 '91G80: Financial applications of other theories',
 '54H30: Applications of general topology to computer science (e.g., digital topology, image processing)',
 '35K08: Heat kernel',
 '90-10: Mathematical modeling or simulation for problems pertaining to operations research and mathematical programming',
 '78M32: Neural and heuristic methods applied to problems in optics and electromagnetic theory',
 '62A09: Graphical methods in statistics',
 '35Q74: PDEs in connection with mechanics of deformable solids',
 '55N31: Persistent homology and applications, topological data analysis',
 '68-XX: Computer science',
 '65D19: Computational issues in computer and robotic vision',
 '68M25: Computer security',
 '62-08: Computational methods for problems pertaining to statistics',
 '92-XX: Biology and other natural sciences',
 '35-XX: Partial differential equations',
 '68Q87: Probability in computer science (algorithm analysis, random structures, phase transitions, etc.)',
 '60B20: Random matrices (algebraic aspects) fFor probabilistic aspects, see 60B20 g',
 '91G70: Statistical methods; risk measures [See also 62P05, 62P20]',
 '62R01: Algebraic statistics',
 '49Q22: Transport equations fFor calculus of variations and optimal control, see 49Q22; for Fluid mechanics, see',
 '60B20: Random matrices (probabilistic aspects)',
 '49M41: PDE constrained optimization (numerical aspects)',
 '57Z25: Relations of manifolds and cell complexes with computer and data science',
 '49-11: Research data for problems pertaining to calculus of variations and optimal control',
 '35Q84: Fokker-Planck equations',
 '42-08: Computational methods for problems pertaining to harmonic analysis on Euclidean spaces',
 '68T09: Computational aspects of data analysis and big data',
 '65M22: Numerical solution of discretized equations for initial value and initial-boundary value problems involving ']

for idx, desc in enumerate(missing_descriptions):
    missing_descriptions[idx] = desc.split(':')

In [41]:
msc_index

Unnamed: 0,code,description
0,00-01,"Instructional exposition (textbooks, tutorial ..."
1,00-02,"Research exposition (monographs, survey articles)"
2,01-00,"General reference works (handbooks, dictionari..."
3,01-01,"Instructional exposition (textbooks, tutorial ..."
4,01-02,"Research exposition (monographs, survey articles)"
...,...,...
10570,97U50,Computer assisted instruction and programmed i...
10571,97U60,Manipulative materials and their use in the cl...
10572,97U70,"Technological tools (computers, calculators, s..."
10573,97U80,Audiovisual media and their use in instruction


In [42]:
for desc in missing_descriptions:
    msc_index.loc[-1] = desc
    msc_index.index = msc_index.index + 1
    msc_index = msc_index.sort_index()

In [44]:
for desc in missing_descriptions:
    if desc[0] in msc_index.code.values:
        pass
    else:
        print(desc[0])

In [48]:
# remaining = []
for code in msc_cleaned.values:
    if code in msc_index.code.values:
        pass
    else:
        print(code)

['62']
['2010']
['92B25 92F99']
['68T05 91E40']
['10010147.10010257.10010258.10010259.10010263']
['97R60']
['92']
['94']
['68T45 68T07']
['49']
['35CXX']
['62-07l']
['2020: 49N45']
['6006']
['90']
['68T04']
['62h30']
['00']
['68: computer science']
['68Tx']
['97R40']
['60']
['68']
['78']
['93']
['6804']


In [49]:
# Split and add misplaced string back to ACM codes
print(f'Before: {len(ccs_cleaned)}')
acm_misplaced_string = '10010147.10010257.10010258.10010259.10010263'.split('.')
for str in acm_misplaced_string:
    ccs_cleaned.loc[-1] = str
    ccs_cleaned.index = ccs_cleaned.index + 1
    ccs_cleaned = ccs_cleaned.sort_index()
print(f'After: {len(ccs_cleaned)}')

Before: 162
After: 167


In [25]:
rem = msc_cleaned[msc_cleaned['description'].isnull()]['msc'].values
rem = ['60-06', '68T05', '91E40', '92B25', '92F99', '35CXX', '97R60', '68-04',
       '68T45', '68T07', '62h30', '97R40', '62-07', '68T04']
found_rem = []
for r in rem:
    desc = msc_cleaned['description'][idx]
    if r in msc_cleaned['msc'].values:
        print(f'Found: {r}')
        desc = msc_cleaned[msc_cleaned['msc'] == r]['description'].values[0]
        print(f'Description: {desc}')
        if desc is not None:
            found_rem.append(r)
    else:
        print(f'Not found: {r}')

Found: 60-06
Description: None
Found: 68T05
Description: None
Found: 91E40
Description: None
Not found: 92B25
Not found: 92F99
Found: 35CXX
Description: None
Found: 97R60
Description: None
Found: 68-04
Description: None
Found: 68T45
Description: None
Found: 68T07
Description: Artificial neural networks and deep learning
Found: 62h30
Description: None
Found: 97R40
Description: None
Found: 62-07
Description: None
Found: 68T04
Description: None


In [26]:
rem = set(rem) - set(found_rem)
rem = list(rem)
rem

['97R40',
 '68-04',
 '68T04',
 '35CXX',
 '97R60',
 '62-07',
 '91E40',
 '62h30',
 '68T45',
 '68T05',
 '92B25',
 '92F99',
 '60-06']

In [None]:
# Don't append before finishing TODO above
missing_descriptions.append([['97R60', 'Mathematics education'],
                             ['68-04', 'Mathematics education'], 
                            ['92B25', 'Biology and other natural sciences 92Bxx Mathematical biology in general 92B25 Biological rhythms and synchronization'],
                            ['68T04', 'Learning and adaptive systems in artificial intelligence'],
                            ['97R40', 'Mathematics education'],
                            ['92F99', 'Other natural sciences (mathematical treatment)'],
                            ['62H30', 'Classification and discrimination; cluster analysis (statistical aspects); mixture models'],
                            ['35CXX', 'Representations of solutions to partial differential equations']])

In [27]:
missing_descriptions

[['60B20', ' Random matrices (probabilistic aspects)'],
 ['62H22', ' Probabilistic graphical models'],
 ['97P30', ' Systems, databases (educational aspects)'],
 ['05C81', ' Random walks on graphs'],
 ['62B86', ' Statistical aspects of fuzziness, sufficiency, and information'],
 ['68T42', ' Agent technology and artificial intelligence'],
 ['97N80', ' Mathematical software, computer programs (educational aspects)'],
 ['60L10', ' Signatures and data streams'],
 ['35Q68', ' PDEs in connection with computer science'],
 ['68T07', ' Artificial neural networks and deep learning'],
 ['60L20', ' Rough paths'],
 ['62-08', ' Computational methods for problems pertaining to statistics'],
 ['35Q79',
  ' PDEs in connection with classical thermodynamics and heat transfer'],
 ['15B48', ' Positive matrices and their generalizations; cones of matrices'],
 ['68W50',
  ' Evolutionary algorithms, genetic algorithms (computational aspects)'],
 ['62-XX', ' Statistics'],
 ['65Zxx', ' Applications to the scienc