In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
soup = BeautifulSoup(open('classification_codes_data\ACM Classification Codes.html'), 'html.parser')
classification_codes = soup.find_all('li', id=lambda value: value and value.startswith('code:'))
table = []
unique_codes = set()
for code in classification_codes:
    code_description = code.get_text().split('\n')
    for description in code_description:
        unique_codes.add(description)


In [3]:
classes = []
description = []
for code in unique_codes:
    classes.append(code.split(':')[0])
    description.append(code.split(':')[1])
    assert len(classes) == len(description)

df = pd.DataFrame({'class': classes, 'description': description})

In [4]:
def clean_code(code):
    code = code.split('.')
    return code

df['identifier'] = df['class'].apply(clean_code)

In [5]:
print(df.values)

[['F.4' ' MATHEMATICAL LOGIC AND FORMAL LANGUAGES' list(['F', '4'])]
 ['D.4.1' ' Process Management' list(['D', '4', '1'])]
 ['C.0' ' GENERAL' list(['C', '0'])]
 ...
 ['K.1' ' THE COMPUTER INDUSTRY' list(['K', '1'])]
 ['G.1.m' ' Miscellaneous' list(['G', '1', 'm'])]
 ['H.3.5' ' Online Information Services' list(['H', '3', '5'])]]


In [9]:
#df.to_csv('acm_classification.csv', index=False)

In [6]:
def heirarchy(identifier):
    if (len(identifier) == 1):
        identifier.append('')
        identifier.append('')
    if (len(identifier) == 2):
        identifier.append('')
    
    if (identifier[0] != '' and identifier[1] == '' and identifier[2] == ''):
        return 1
    if (identifier[0] != '' and identifier[1] != '' and identifier[2] == ''):
        return 2
    if (identifier[0] != '' and identifier[1] != '' and identifier[2] != ''):
        return 3

df['heirarchy'] = df.identifier.apply(heirarchy)

In [7]:
def construct_full_text(identifier):

    if isinstance(identifier, str):
        identifier = identifier.split('.')

    code = '.'.join(identifier)

    if code not in df['class'].values:
        raise ValueError('Code not found in the dataset')
    
    if heirarchy(identifier) == 1:
        full_text = f"Class: {df[df['class'] == code]['description'].values[0].strip().capitalize()}"
    
    if heirarchy(identifier) == 2:
        cls = f"{identifier[0]}."
        full_text = f"Class: {df[df['class']==cls]['description'].values[0].strip().capitalize()} Subclass: {df[df['class'] == code]['description'].values[0].strip().capitalize()}"
    
    if heirarchy(identifier) == 3:
        cls = f"{identifier[0]}."
        subcls = f"{identifier[0]}.{identifier[1]}"
        full_text = f"Class: {df[df['class']==cls]['description'].values[0].strip().capitalize()} Subclass: {df[df['class'] == subcls]['description'].values[0].strip().capitalize()} Subsubclass: {df[df['class'] == code]['description'].values[0].strip().capitalize()}"

    return full_text

In [8]:
df['full_text'] = df['class'].apply(construct_full_text)

In [10]:
df.drop(['description','identifier','heirarchy'], axis=1, inplace=True)

In [11]:
df.to_csv('acm_classification.csv', index=False)