In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [33]:
soup = BeautifulSoup(open('classification_codes_data\ACM Classification Codes.html'), 'html.parser')
classification_codes = soup.find_all('li', id=lambda value: value and value.startswith('code:'))
table = []
unique_codes = set()
for code in classification_codes:
    code_description = code.get_text().split('\n')
    for description in code_description:
        unique_codes.add(description)


In [34]:
classes = []
description = []
for code in unique_codes:
    classes.append(code.split(':')[0])
    description.append(code.split(':')[1])
    assert len(classes) == len(description)

df = pd.DataFrame({'class': classes, 'description': description})

In [35]:
def clean_code(code):
    code = code.split('.')
    return code

df['identifier'] = df['class'].apply(clean_code)

In [36]:
print(df.values)

[['H.2.5' ' Heterogeneous Databases' list(['H', '2', '5'])]
 ['G.2.m' ' Miscellaneous' list(['G', '2', 'm'])]
 ['H.1.0' ' General' list(['H', '1', '0'])]
 ...
 ['I.2.7' ' Natural Language Processing' list(['I', '2', '7'])]
 ['C.5.1' " Large and Medium (``Mainframe'') Computers"
  list(['C', '5', '1'])]
 ['F.4.1' ' Mathematical Logic' list(['F', '4', '1'])]]


In [9]:
#df.to_csv('acm_classification.csv', index=False)

In [40]:
def heirarchy(identifier):
    if (len(identifier) == 1):
        identifier.append('')
        identifier.append('')
    if (len(identifier) == 2):
        identifier.append('')
    
    if (identifier[0] != '' and identifier[1] == '' and identifier[2] == ''):
        return 1
    if (identifier[0] != '' and identifier[1] != '' and identifier[2] == ''):
        return 2
    if (identifier[0] != '' and identifier[1] != '' and identifier[2] != ''):
        return 3

df['heirarchy'] = df.identifier.apply(heirarchy)

In [75]:
def construct_full_text(identifier):

    if isinstance(identifier, str):
        identifier = identifier.split('.')

    code = '.'.join(identifier)

    if code not in df['class'].values:
        raise ValueError('Code not found in the dataset')
    
    if heirarchy(identifier) == 1:
        full_text = f"Class: {df[df['class'] == code]['description'].values[0].strip().capitalize()}"
    
    if heirarchy(identifier) == 2:
        cls = f"{identifier[0]}."
        full_text = f"Class: {df[df['class']==cls]['description'].values[0].strip().capitalize()} Subclass: {df[df['class'] == code]['description'].values[0].strip().capitalize()}"
    
    if heirarchy(identifier) == 3:
        cls = f"{identifier[0]}."
        subcls = f"{identifier[0]}.{identifier[1]}"
        full_text = f"Class: {df[df['class']==cls]['description'].values[0].strip().capitalize()} Subclass: {df[df['class'] == subcls]['description'].values[0].strip().capitalize()} Subsubclass: {df[df['class'] == code]['description'].values[0].strip().capitalize()}"

    return full_text

In [74]:
df['full_text'] = df['class'].apply(construct_full_text)

In [77]:
for row in df.iterrows():
    print(row[1]['full_text'])

Class: Information systems Subclass: Database management Subsubclass: Heterogeneous databases
Class: Mathematics of computing Subclass: Discrete mathematics Subsubclass: Miscellaneous
Class: Information systems Subclass: Models and principles Subsubclass: General
Class: Theory of computation Subclass: Logics and meanings of programs
Class: Software Subclass: Programming techniques Subsubclass: Concurrent programming
Class: Hardware Subclass: Input/output and data communications Subsubclass: Reliability, testing, and fault-tolerance
Class: Software Subclass: Software engineering Subsubclass: Requirements/specifications
Class: Computer systems organization Subclass: Processor architectures Subsubclass: Multiple data stream architectures (multiprocessors)
Class: Software Subclass: Software engineering Subsubclass: Distribution, maintenance, and enhancement
Class: Computing methodologies Subclass: Symbolic and algebraic manipulation Subsubclass: Algorithms
Class: Computing methodologies Su