In [109]:
import pandas as pd 
import numpy as np
import ast
import re
from bs4 import BeautifulSoup
import PyPDF2
from sentence_transformers import SentenceTransformer, util

In [2]:
model = SentenceTransformer('all-mpnet-base-v2')

#### Tags Text Cleaning

In [3]:
def clean_categories(categories):
    tags = [tag.strip() for tag in ast.literal_eval(categories)]
    return tags

In [4]:
df = pd.read_csv(r'arxiv-paper-abstracts\arxiv_data_210930-054931.csv')
categories = pd.DataFrame(list(df['terms'].unique()),columns=['categories'])
categories['clean_categories'] = categories['categories'].apply(clean_categories)
tags = set()
for index, row in categories.iterrows():
    categories = row['clean_categories']
    for category in categories:
        tags.add(category)
tags_df = pd.DataFrame(list(tags), columns=['tags'])
tags = set()
for index, row in tags_df.iterrows():
    categories_str = row['tags']
    categories_list = [category.strip() for category in categories_str.replace(',', ';').split(';')]
    for category in categories_list:
        tags.add(category)
tags_df = pd.DataFrame(list(tags), columns=['tags'])
tags = set()
for index, row in tags_df.iterrows():
    categories_str = row['tags']
    categories_list = [category.strip() for category in categories_str.replace(',', ';').split(';')]
    for category in categories_list:
        if '(' in category and ')' in category:
            subcategories = [subcat.strip() for subcat in category.split() if subcat[0].isdigit()]
            for subcategory in subcategories:
                tags.add(subcategory)
        else:
            tags.add(category)
tags.discard('')


In [5]:
tags_list = []
for tag in tags:
    tag = tag.replace('(Primary)', '')
    tag = tag.replace('Primary', '')
    tag = tag.replace('(Secondary)', '')
    tag = tag.replace('(secondary)', '')
    tag = tag.replace('secondary', '')
    tag = tag.replace('Secondary', '')
    tag = tag.replace('and ', '')
    tags_list.append(tag.strip())
print(tags_list)
    

['I.4.10', '93B30', '62J02', '60C05', '35J08', '62J99', 'B.7.1', '68W99', '68U35', '78', '49-06', '62H17', '46E22', 'D.3.4', 'I.2.8', '65D10', 'I.2.2', 'cond-mat', '92', 'J.5', '90C05', 'cond-mat.other', 'q-bio.GN', '68P30', 'hep-ex', '68W50', '60G35', '41A52', 'astro-ph.SR', 'H.2.8', '15B48', 'physics.pop-ph', '68: computer science', '62F03', '62G08', '35J47', '65Zxx', '62F40', 'C.2', 'math.ST', 'cs.GT', 'math.AC', '68Q25', 'H.3.4', '60E05', 'physics.geo-ph', '68T05 91E40', '49M41', 'I.2.10', '60', '60L20', '90C26', '60H30', 'cond-mat.soft', '60G15', '47A60', '90C90', 'J.m', 'I.2', '55U10', '62h30', 'math.CO', '34A99', 'Computer science applications- 97R60 Computer graphics', '05C60', 'cs.OH', 'nlin.CG', 'I.2.1', '62F35', 'D.2.11', '91G80', '74S05', '33E10', '92D20', '68W27', '68N30', 'math.RT', 'org', '34C20', 'hep-ph', '90-06', '74B05', '49L20', 'J.0', '00', '68T05', 'I.2.10.f', '54H30', 'F.4.2', 'I.4.9', '62-08', 'q-fin.TR', '60-06', '68-01', '97C30', '30C40', '60G55', '65T60', '90

In [6]:
# create empty lists for each classification
msc = set()
ccs = set()
arxiv = set()
misc = set()

for tag in tags_list:
    if re.match(r'\d{2}[A-Z]\d{2}|\d{2}[A-Z]-xx|\d{2}\w{2,}', tag):
        msc.add(tag)
    elif re.match(r'\d{2}-\d{2}|\d{2}-xx|\d{2}-XX', tag):
        msc.add(tag)
    elif re.match(r'\d{2}', tag):
        msc.add(tag)
    elif re.match(r'[A-Z]\.\d\.\d', tag):
        ccs.add(tag)
    elif re.match(r'[A-Z]\.\d', tag):
        ccs.add(tag)
    elif re.match(r'[A-Z]\d\.\d', tag):
        ccs.add(tag)
    elif re.match(r'\w+\.\w+', tag):
        arxiv.add(tag)
    elif re.match(r'\w+-\w+\.\w+-\w+', tag):
        arxiv.add(tag)
    elif re.match(r'\w+-\w+\.\w+', tag):
        arxiv.add(tag)
    elif re.match(r'\w+-\w+', tag):
        arxiv.add(tag)
    else:
        misc.add(tag)


In [7]:
print(f'MSC: {len(msc)}\nCCS: {len(ccs)}\nArXiv: {len(arxiv)}\nMisc: {len(misc)}')

MSC: 453
CCS: 162
ArXiv: 149
Misc: 5


**Contents of misc:**
- Artificial intelligence': corresponds to 68Txx
- Computer science applications- 97R60 Computer graphics': corresponds to 97R60
- Computing methodologies for image processing': corresponds to 68U10
- This paper tells us how human can be identified by their Gait cycle\n  using any simple camera': dropped
- org: dropped

In [8]:
msc.add('97R60')
del misc

In [9]:
msc_cleaned = pd.DataFrame(list(msc), columns=['msc'])
ccs_cleaned = pd.DataFrame(list(ccs), columns=['ccs'])
arxiv_cleaned = pd.DataFrame(list(arxiv), columns=['arxiv'])

#### Retrieve Descriptions for Each Tag

In [40]:
msc_html = r'C:\Users\User\Desktop\ml_paper_recommender_system\paper-recommender\MSC Classification Codes.html'

# Parse MSC Classification Codes html file
with open(msc_html) as fp:
    msc_soup = BeautifulSoup(fp, 'html.parser')

# Extract MSC Classification Codes
msc_tree = [element.text for element in msc_soup.find_all('li')]

# Define Dictionary for MSC Classification Codes where the key is the main class and the value is a list of sub-classes
msc_classes = {}

# Populate the dictionary
for cls in msc_tree:
    if re.match(r'^\d\d-xx:', cls):
        msc_classes[cls.split('\n')[0]] = []

for element in msc_tree:
    for cls in msc_classes.keys():
        if cls[0:2] == element[0:2]:
            msc_classes[cls].append(element)

for cls in msc_classes.keys():
    msc_classes[cls].pop(0)

# Create a dataframe from the dictionary
msc_df = pd.DataFrame(msc_classes.items(),columns=['class','sub_classes'])

In [41]:
for i, subclass in enumerate(msc_df['sub_classes']):
    for idx, sub in enumerate(subclass):
        if re.match(r'\d+\d+[A-Z]xx', sub):
            msc_df['sub_classes'][i][idx] = sub.split('\n')[0]

In [42]:
for i, subclass in enumerate(msc_df['sub_classes']):
    curr_cls = msc_df['class'][i]
    for idx, sub in enumerate(subclass):
        msc_df['sub_classes'][i][idx] = curr_cls + ' ' + msc_df['sub_classes'][i][idx]

In [60]:
for idx, tag in enumerate(msc_cleaned['msc']):
    if ':' in tag:
        msc_cleaned['msc'][idx] = tag.split(':')[0]

In [54]:
target_string = "example"
def check_substring(string):
    return target_string in string
check_substring("this is an example")

True

In [72]:
len(msc_df['sub_classes'][8])

100

In [99]:
msc_cleaned['description'] = None

In [100]:
for i, tag in enumerate(msc_cleaned['msc']):
    for j, subclass in enumerate(msc_df['sub_classes']):
        for k, cls in enumerate(subclass):
            if tag in cls:
                msc_cleaned['description'][i] = cls

In [108]:
missing_msc = []
for idx, description in enumerate(msc_cleaned['description']):
    if description is None:
        missing_msc.append(msc_cleaned['msc'][idx])
print(f'Status:\n{len(missing_msc)} out of {len(msc_cleaned)} tags are missing a description')

Status
72 out of 454 tags are missing a description


In [117]:
print(missing_msc)

['35J08', '49-11', '68W50', '15B48', '35J47', '65Zxx', '49M41', '68T05 91E40', '60L20', '62h30', '91G80', '68W27', '54H30', '62-08', '90-10', '15A83', '97P30', '90C17', '62H22', '91G70', '68T04', '2010', '62D20', '62A09', '05C81', '65F55', '65D19', '53Z50', '68Q87', '62-07l', '68T45 68T07', '2020', '6804', '68T42', '55N31', '35Q84', '62R07', '97N80', '68T07', '62-XX', '42-08', '90-05', '60L10', '49Q22', '35Q79', '35Q74', '78M32', '62G86', '60B20', '92B25 92F99', '91-10', '35-XX', '57Z25', '14T10', '68-XX', '62B86', '68M25', '97R40', '68T09', '92-XX', '35CXX', '35K08', '05C21', '10010147.10010257.10010258.10010259.10010263', '97R60', '62R40', '35R02', '62R01', '6006', '57-08', '35Q68', '65M22']


In [119]:
# Extract Description from MSC Classification Codes 2020
reader = PyPDF2.PdfReader(r'C:\Users\User\Desktop\ml_paper_recommender_system\paper-recommender\msc2020.pdf')
missing_corpus = []
for idx, page in enumerate(reader.pages):
    text = page.extract_text()
    for idx, code in enumerate(missing_msc):
        if code in text:
            missing_corpus.append(text)
            print(f'Page {idx+1}: {code}')

Page 32: 2020
Page 32: 2020
Page 32: 2020
Page 55: 68-XX
Page 63: 05C21
Page 25: 05C81
Page 49: 60B20
Page 54: 14T10
Page 16: 15A83
Page 4: 15B48
Page 49: 60B20
Page 52: 35-XX
Page 52: 35-XX
Page 1: 35J08
Page 5: 35J47
Page 62: 35K08
Page 44: 49Q22
Page 46: 35Q74
Page 71: 35Q68
Page 36: 35Q84
Page 45: 35Q79
Page 67: 35R02
Page 60: 92-XX
Page 41: 42-08
Page 52: 35-XX
Page 2: 49-11
Page 7: 49M41
Page 44: 49Q22
Page 28: 53Z50
Page 13: 54H30
Page 35: 55N31
Page 59: 68T09
Page 66: 62R40
Page 70: 57-08
Page 53: 57Z25
Page 52: 35-XX
Page 40: 62-XX
Page 52: 35-XX
Page 60: 92-XX
Page 49: 60B20
Page 9: 60L20
Page 43: 60L10
Page 60: 92-XX
Page 14: 62-08
Page 24: 62A09
Page 40: 62-XX
Page 56: 62B86
Page 23: 62D20
Page 19: 62H22
Page 48: 62G86
Page 35: 55N31
Page 37: 62R07
Page 59: 68T09
Page 60: 92-XX
Page 66: 62R40
Page 68: 62R01
Page 14: 62-08
Page 26: 65F55
Page 27: 65D19
Page 72: 65M22
Page 6: 65Zxx
Page 55: 68-XX
Page 57: 68M25
Page 29: 68Q87
Page 34: 68T42
Page 35: 55N31
Page 37: 62R07
Page 

In [121]:
for idx, text in enumerate(missing_corpus):
    missing_corpus[idx] = text.split('\n')


In [130]:
missing_descriptions = set()
for code in missing_msc:
    for idx, text in enumerate(missing_corpus):
        for line in text:
            if code in line:
                desc = f'{code}: {line}'
                missing_descriptions.add(desc)

In [136]:
missing_descriptions = [
 '60B20: 60B20 Random matrices (probabilistic aspects) fFor algebraic aspects, see 15B52 g',
 '62H22: 62H22 Probabilistic graphical models',
 '97P30: 97P30 Systems, databases (educational aspects)',
 '05C81: 05C81 Random walks on graphs',
 '62B86: 62B86 Statistical aspects of fuzziness, sufficiency, and information',
 '68T42: 68T42 Agent technology and artificial intelligence',
 '97N80: 97N80 Mathematical software, computer programs (educational aspects)',
 '60L10: 60L10 Signatures and data streams',
 '35Q68: 35Q68 PDEs in connection with computer science',
 '68T07: 68T07 Artificial neural networks and deep learning',
 '60L20: 60L20 Rough paths',
 '62-08: 62-08 Computational methods for problems pertaining to statistics',
 '35Q79: 35Q79 PDEs in connection with classical thermodynamics and heat transfer',
 '15B48: 15B48 Positive matrices and their generalizations; cones of matrices',
 '68W50: 68W50 Evolutionary algorithms, genetic algorithms (computational aspects) [See also 68T05, 68T20, 90C59]',
 '62-XX: 62-XX Statistics',
 '65Zxx: 65Zxx Applications to the sciences',
 '62G86: 62G86 Nonparametric inference and fuzziness',
 '62R40: 62R40 Topological data analysis [See also 55N31]',
 '05C21: 05C21 Flows in graphs',
 "35J08: 35J08 Green's functions for elliptic equations",
 '91-10: 91-10 Mathematical modeling or simulation for problems pertaining to game theory, economics, and \x0cnance',
 '90-05: 90-05 Experimental work for problems pertaining to operations research and mathematical programming',
 '68W27: 68W27 Online algorithms; streaming algorithms',
 '14T10: 14T10 Foundations of tropical geometry and relations with algebra fFor algebraic aspects, see 15A80 g',
 '62D20: 62D20 Causal inference from observational studies',
 '49Q22: 49Q22 Optimal transportation [See also 90B06]',
 '35R02: 35R02 PDEs on graphs and networks (rami\x0ced or polygonal spaces)',
 '57-08: 57-08 Computational methods for problems pertaining to manifolds and cell complexes',
 '65F55: 65F55 Numerical methods for low-rank matrix approximation; matrix compression',
 '90C17: 90C17 Robustness in mathematical programming',
 '53Z50: 53Z50 Applications of di\x0berential geometry to data and computer science',
 '35J47: 35J47 Second-order elliptic systems',
 '62R07: 62R07 Statistical aspects of big data and data science fFor computer science aspects, see 68T09; for information-',
 '15A83: 15A83 Matrix completion problems',
 '91G80: 91G80 Financial applications of other theories [See also 35Q91, 37N40, 49N90, 60J70, 60K10, 60H30, 93E20]',
 '54H30: 54H30 Applications of general topology to computer science (e.g., digital topology, image processing) [See also',
 '35K08: 35K08 Heat kernel',
 '90-10: 90-10 Mathematical modeling or simulation for problems pertaining to operations research and mathematical pro-',
 '78M32: 78M32 Neural and heuristic methods applied to problems in optics and electromagnetic theory',
 '62A09: 62A09 Graphical methods in statistics',
 '35Q74: 35Q74 PDEs in connection with mechanics of deformable solids',
 '55N31: 68T09 Computational aspects of data analysis and big data [See also 62R07] fFor homological aspects, see 55N31 g',
 '68-XX: 68-XX Computer science fFor papers containing software, source code,',
 '62R07: 68T09 Computational aspects of data analysis and big data [See also 62R07] fFor homological aspects, see 55N31 g',
 '65D19: 65D19 Computational issues in computer and robotic vision',
 '68M25: 68M25 Computer security',
 '62-08: 65C20 Probabilistic models, generic numerical methods in probability and statistics [See also 60-08, 62-08]',
 '55N31: 55N31 Persistent homology and applications, topological data analysis [See also 62R40, 68T09]',
 '92-XX: 92-XX Biology and other natural sciences',
 '35-XX: 35-XX Partial differential equations',
 '68Q87: 68Q87 Probability in computer science (algorithm analysis, random structures, phase transitions, etc.) [See also',
 '60B20: 15B52 Random matrices (algebraic aspects) fFor probabilistic aspects, see 60B20 g',
 '91G70: 91G70 Statistical methods; risk measures [See also 62P05, 62P20]',
 '62R01: 62R01 Algebraic statistics',
 '49Q22: 35Q49 Transport equations fFor calculus of variations and optimal control, see 49Q22; for Fluid mechanics, see',
 '60B20: 05C80 Random graphs (graph-theoretic aspects) [See also 60B20]',
 '68T09: 94A16 Informational aspects of data analysis and big data [See also 62R07, 68T09] fFor homological aspects, see',
 '49M41: 49M41 PDE constrained optimization (numerical aspects)',
 '57Z25: 57Z25 Relations of manifolds and cell complexes with computer and data science',
 '49-11: 49-11 Research data for problems pertaining to calculus of variations and optimal control',
 '35Q84: 35Q84 Fokker-Planck equations For Fluid mechanics, see 76X05, 76W05; for statistical mechanics, see 82C31',
 '42-08: 42-08 Computational methods for problems pertaining to harmonic analysis on Euclidean spaces',
 '68T09: 68T09 Computational aspects of data analysis and big data [See also 62R07] For homological aspects, see 55N31',
 '65M22: 65M22 Numerical solution of discretized equations for initial value and initial-boundary value problems involving']

for idx, desc in enumerate(missing_descriptions):
    missing_descriptions[idx] = desc.split(':')
missing_descriptions


[['60B20',
  ' 60B20 Random matrices (probabilistic aspects) fFor algebraic aspects, see 15B52 g'],
 ['62H22', ' 62H22 Probabilistic graphical models'],
 ['97P30', ' 97P30 Systems, databases (educational aspects)'],
 ['05C81', ' 05C81 Random walks on graphs'],
 ['62B86',
  ' 62B86 Statistical aspects of fuzziness, sufficiency, and information'],
 ['68T42', ' 68T42 Agent technology and artificial intelligence'],
 ['97N80',
  ' 97N80 Mathematical software, computer programs (educational aspects)'],
 ['60L10', ' 60L10 Signatures and data streams'],
 ['35Q68', ' 35Q68 PDEs in connection with computer science'],
 ['68T07', ' 68T07 Artificial neural networks and deep learning'],
 ['60L20', ' 60L20 Rough paths'],
 ['62-08',
  ' 62-08 Computational methods for problems pertaining to statistics'],
 ['35Q79',
  ' 35Q79 PDEs in connection with classical thermodynamics and heat transfer'],
 ['15B48',
  ' 15B48 Positive matrices and their generalizations; cones of matrices'],
 ['68W50',
  ' 68W50 E

[['60B20',
  ' 60B20 Random matrices (probabilistic aspects) fFor algebraic aspects, see 15B52 g'],
 ['62H22', ' 62H22 Probabilistic graphical models'],
 ['97P30', ' 97P30 Systems, databases (educational aspects)'],
 ['05C81', ' 05C81 Random walks on graphs'],
 ['62B86',
  ' 62B86 Statistical aspects of fuzziness, sufficiency, and information'],
 ['68T42', ' 68T42 Agent technology and artificial intelligence'],
 ['97N80',
  ' 97N80 Mathematical software, computer programs (educational aspects)'],
 ['60L10', ' 60L10 Signatures and data streams'],
 ['35Q68', ' 35Q68 PDEs in connection with computer science'],
 ['68T07', ' 68T07 Artificial neural networks and deep learning'],
 ['60L20', ' 60L20 Rough paths'],
 ['62-08',
  ' 62-08 Computational methods for problems pertaining to statistics'],
 ['35Q79',
  ' 35Q79 PDEs in connection with classical thermodynamics and heat transfer'],
 ['15B48',
  ' 15B48 Positive matrices and their generalizations; cones of matrices'],
 ['68W50',
  ' 68W50 E