In [15]:
import pandas as pd 
import numpy as np
import ast
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

In [16]:
# Forward slashes are accepted by open() on Windows as well as on POSIX
# systems, whereas a backslash is a path separator on Windows only.
path = 'Classification Codes Data/msc_codes.txt'

In [17]:
# Load the file at `path` as a list of lines; each line keeps its trailing
# newline, which is removed later when the entries are parsed.
with open(path, mode='r', encoding='utf8') as handle:
    msc = list(handle)

In [18]:
# Split every raw line into its code (the text before the first space) and
# its description (everything after that space, with the newline removed).
codes = []
descriptions = []
for entry in msc:
    # Blank or whitespace-only lines (e.g. an empty line at the end of the
    # file) carry no record; skipping them avoids a failed unpack or a row
    # with an empty code.
    if not entry.strip():
        continue
    code, description = entry.split(' ', 1)
    codes.append(code)
    descriptions.append(description.replace('\n', ''))

# Checked once after the loop: the two lists must have one item per entry.
assert len(codes) == len(descriptions)

In [19]:
def remove_brackets(text):
    """Strip every ``[...]`` and ``{...}`` group from ``text``.

    Each bracketed group is removed on its own, so text that sits between
    two separate groups is preserved (a single greedy match from the first
    opener to the last closer would delete it as well). Nested groups are
    handled by removing the innermost groups first and repeating until
    nothing more can be removed.

    Surrounding whitespace is left untouched; unbalanced brackets are kept
    as they are. Empty groups (``[]`` / ``{}``) are removed too.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without its bracketed groups.
    """
    # Matches only groups that contain no further bracket of the same kind,
    # i.e. the innermost ones.
    pattern = re.compile(r'\[[^\[\]]*\]|\{[^{}]*\}')
    previous = None
    # Every pass that changes the text shortens it, so this terminates.
    while previous != text:
        previous = text
        text = pattern.sub('', text)
    return text

In [20]:
# Clean every description; slice assignment keeps the same list object, so
# the list is still updated in place.
descriptions[:] = [remove_brackets(text) for text in descriptions]

In [21]:
# One row per parsed entry, with the code and its cleaned description.
df = pd.DataFrame(data=dict(code=codes, description=descriptions))

In [22]:
def extract_identifier(code):
    """Cut ``code`` into three consecutive pieces.

    The pieces are characters 0-1, character 2 and characters 3-4. Slicing
    is used, so a code shorter than five characters yields empty strings
    for the missing pieces and anything past the fifth character is
    ignored.

    Returns
    -------
    list
        ``[first_two, third, fourth_and_fifth]``.
    """
    boundaries = ((0, 2), (2, 3), (3, 5))
    return [code[start:stop] for start, stop in boundaries]
# Store the three-part split of every code next to it.
df['identifier'] = df['code'].map(extract_identifier)

In [23]:
def heirarchy(identifier):
    """Return the level (1-4) of a three-part identifier.

    ``identifier`` is indexed as ``[leading, middle, trailing]``.

    * 1 -- leading part is two digits and either both other parts are
      empty or the trailing part is ``'XX'``.
    * 2 -- leading part is two digits and the middle part is ``'-'``.
    * 3 -- leading part is two digits, the middle part is a single
      uppercase letter and the trailing part is ``'xx'``.
    * 4 -- everything else, including identifiers whose leading part is
      not two digits.
    """
    # Every rule for levels 1-3 requires a two-digit leading part, so
    # anything else falls straight through to the catch-all level.
    if not re.match(r'\b^[0-9][0-9]\b', identifier[0]):
        return 4

    middle, trailing = identifier[1], identifier[2]

    if (middle == '' and trailing == '') or trailing == 'XX':
        return 1
    if middle == '-':
        return 2
    if re.match(r'\b^[A-Z]\b', middle) and re.match(r'\b^xx\b', trailing):
        return 3
    return 4
# Level of every row, derived from its identifier and stored as an integer.
df['heirarchy'] = df['identifier'].map(heirarchy).astype(int)

In [24]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,code,description,identifier,heirarchy
0,0,General and overarching topics; collections,"[00, , ]",1
1,1,History and biography,"[01, , ]",1
2,3,Mathematical logic and foundations,"[03, , ]",1
3,5,Combinatorics,"[05, , ]",1
4,45,Integral equations,"[45, , ]",1


In [25]:
# One sub-frame per level, from level 1 (cls) down to level 4 (subsubsubcls).
cls, subcls, subsubcls, subsubsubcls = (
    df[df['heirarchy'] == level] for level in (1, 2, 3, 4)
)

In [72]:
def construct_full_text(identifier):
    """Build a labelled description of a code together with its ancestors.

    The parts of ``identifier`` are joined into a code, which is looked up
    in the module-level ``df``; the level reported by ``heirarchy`` decides
    which ancestor descriptions are prepended:

    * level 1: ``Class: <own description>``
    * level 2 or 3: ``Class: <description of NN-XX> Subclass: <own>``
    * level 4: ``Class: <description of NN-XX> Subclass: <description of
      NNAxx> Subsubclass: <own>``

    Parameters
    ----------
    identifier : sequence of str
        The parts of a code, e.g. ``['97', 'K', 'xx']``.

    Returns
    -------
    str
        The labelled text, stripped of surrounding whitespace.

    Raises
    ------
    ValueError
        If the joined code is not present in ``df['code']``.
    IndexError
        If a required ancestor code has no row in ``df``.
    """
    code = ''.join(identifier)

    if code not in df['code'].values:
        raise ValueError('Code not found in the dataset')

    def describe(lookup_code):
        # Description of the first row whose code matches exactly.
        return df.loc[df['code'] == lookup_code, 'description'].values[0].strip()

    # Classify once instead of re-running the regex checks for every branch.
    level = heirarchy(identifier)
    class_code = code[:2] + '-XX'

    full_text = ''
    if level == 1:
        full_text = f"Class: {describe(code)}"
    elif level in (2, 3):
        full_text = f"Class: {describe(class_code)} Subclass: {describe(code)}"
    elif level == 4:
        # The original text began with a duplicated "Class: Class:" label.
        full_text = (
            f"Class: {describe(class_code)} "
            f"Subclass: {describe(code[:3] + 'xx')} "
            f"Subsubclass: {describe(code)}"
        )

    return full_text.strip()

In [74]:
# Example: full text for the identifier parts '97', 'K', 'xx'.
print(construct_full_text(identifier=['97', 'K', 'xx']))

Class: Mathematics education Subclass: Education of combinatorics, graph theory, probability theory, and statistics


In [71]:
# Show all rows whose level is 3.
df.loc[df['heirarchy'] == 3]

Unnamed: 0,code,description,identifier,heirarchy
66,00Axx,General and miscellaneous specific topics,"[00, A, xx]",3
88,00Bxx,Conference proceedings and collections of arti...,"[00, B, xx]",3
105,01Axx,History of mathematics and mathematicians,"[01, A, xx]",3
147,03Axx,Philosophical aspects of logic and foundations,"[03, A, xx]",3
151,03Bxx,General logic,"[03, B, xx]",3
...,...,...,...,...
6621,97Kxx,"Education of combinatorics, graph theory, prob...","[97, K, xx]",3
6631,97Mxx,Education of mathematical modeling and applica...,"[97, M, xx]",3
6641,97Nxx,Education of numerical mathematics,"[97, N, xx]",3
6651,97Pxx,Computer science (educational aspects),"[97, P, xx]",3
