This notebook explores the possible ways of parsing CPC titles into a title–hyponym structure.

# Data preprocessing

In [1]:
from anytree import Node, RenderTree
import re 

import pandas as pd

In [5]:
# Maps a `max_level` value accepted by `read_label_file` to the highest `lvl`
# value that is kept. Keys 1, 3 and 4 mirror the title lengths that
# `convert_to_int` maps to -3, -2 and -1.
dict_lvl = {
    1: -3,
    3: -2,
    4: -1,
    6: 0,
}

def convert_to_int(x):
    """Return the integer level of one row of the label table.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'lvl'`` and ``'title'``.

    Returns
    -------
    int or None
        ``int(x['lvl'])`` when that conversion succeeds. If it raises
        ``ValueError``, the level is derived from the length of the title:
        1 character -> -3, 3 characters -> -2, 4 characters -> -1.
        Any other title length yields ``None``.
    """
    raw_level = x['lvl']
    try:
        return int(raw_level)
    except ValueError:
        # No usable numeric level: fall back to the length of the title.
        level_by_title_length = {1: -3, 3: -2, 4: -1}
        return level_by_title_length.get(len(x['title']))

def clean_descr(description):
    """Strip curly braces and parenthesised groups from a description.

    1. Remove '{' and '}' (the text between them is kept).
    2. Remove every balanced parenthesised group together with one directly
       preceding space, e.g. the reference list in
       "... (preserving A23B; obtaining protein compositions for
       foodstuffs A23J1/00;)".

    Groups are removed one by one, innermost first, so text located between
    two separate groups is preserved and nested groups are removed entirely.
    (A single greedy pattern from the first '(' to the last ')' would also
    delete everything in between.)

    Parameters
    ----------
    description : str
        Raw description text.

    Returns
    -------
    str
        The cleaned description. A '(' without a matching ')' is left as is.
    """
    description = description.replace('{', '').replace('}', '')

    # Each pass removes the groups that contain no further parentheses;
    # repeating until nothing changes peels nested groups from the inside out.
    previous = None
    while previous != description:
        previous = description
        description = re.sub(r'\ ?\([^()]*\)', '', description)
    return description

def read_label_file(file_name, max_level=4):
    """Load a tab-separated label file into a cleaned DataFrame.

    The file is read without a header row; its three columns are named
    ``title``, ``lvl`` and ``description``. ``lvl`` is converted with
    ``convert_to_int`` and ``description`` is cleaned with ``clean_descr``.

    Parameters
    ----------
    file_name : str or path-like
        Location of the tab-separated file.
    max_level : int, default 4
        When it is one of the keys of ``dict_lvl`` (1, 3, 4 or 6), only rows
        with ``lvl <= dict_lvl[max_level]`` are kept. Any other value
        disables the filtering.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``lvl``, ``description`` with a fresh 0..n-1 index.
    """
    df = pd.read_csv(
        file_name,
        header=None,
        sep='\t',
        dtype=object,
        names=['title', 'lvl', 'description'],
    )
    df['lvl'] = df.apply(convert_to_int, axis=1)

    if max_level in dict_lvl:
        # .copy() detaches the filtered rows from the original frame so the
        # column assignment below does not raise SettingWithCopyWarning.
        df = df[df['lvl'] <= dict_lvl[max_level]].copy()

    df['description'] = df['description'].apply(clean_descr)
    return df.reset_index(drop=True)

def find_father(df, child_lvl):
    """Return the title of the last row of ``df`` that sits one level above.

    Rows are scanned from the bottom of ``df`` upwards; the first row whose
    ``lvl + 1`` equals ``child_lvl`` wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``title`` and ``lvl``; callers pass the rows that
        precede the child.
    child_lvl : int
        Level of the row whose parent is wanted.

    Returns
    -------
    The ``title`` value of the matching row, or ``None`` if no row matches.
    """
    bottom_up = df.iloc[::-1].itertuples(index=False)
    return next(
        (record.title for record in bottom_up if record.lvl + 1 == child_lvl),
        None,
    )

def build_tree(df):
    """Build an anytree hierarchy from a label table and return its root.

    The first row becomes the root. Every following row is attached to the
    closest preceding row whose ``lvl`` is exactly one less than its own
    (the same parent rule as ``find_father``). Each node is named after the
    row's ``description``.

    The table is walked once while remembering the most recent node seen at
    every level, so the cost is linear in the number of rows and does not
    depend on the DataFrame having a 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``title``, ``lvl`` and ``description``; rows must
        be ordered so that each parent precedes its children.

    Returns
    -------
    anytree.Node
        The root node.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    KeyError
        If a row has no preceding row exactly one level above it.
    """
    if df.empty:
        raise ValueError('cannot build a tree from an empty DataFrame')

    rows = df.itertuples(index=False)
    root_row = next(rows)
    root = Node(root_row.description)

    # Most recent node seen at each level; the parent of a row is always the
    # latest node registered one level above it.
    latest_node_at = {root_row.lvl: root}

    for row in rows:
        parent = latest_node_at.get(row.lvl - 1)
        if parent is None:
            raise KeyError(
                'no preceding row one level above %r (lvl=%r)' % (row.title, row.lvl)
            )
        latest_node_at[row.lvl] = Node(row.description, parent=parent)
    return root

In [6]:
# Load the title file of every section A-H through read_label_file (default
# max_level). One path template replaces eight hand-written file names.
CPC_TITLE_PATH = 'cpc-titles/cpc-section-{section}_20220201.txt'
CPC_SECTIONS = 'ABCDEFGH'

section_dfs = {
    section: read_label_file(CPC_TITLE_PATH.format(section=section))
    for section in CPC_SECTIONS
}

# Keep the per-section names that the rest of the notebook refers to.
A_df, B_df, C_df, D_df, E_df, F_df, G_df, H_df = (
    section_dfs[section] for section in CPC_SECTIONS
)

In [14]:
# Print the tree built from F_df, one node per line: the branch prefix
# produced by RenderTree followed by the node name.
for pre, fill, node in RenderTree(build_tree(F_df)):
    print(pre, node.name, sep='')

MECHANICAL ENGINEERING; LIGHTING; HEATING; WEAPONS; BLASTING
├── MACHINES OR ENGINES IN GENERAL; ENGINE PLANTS IN GENERAL; STEAM ENGINES
│   ├── MACHINES OR ENGINES, IN GENERAL OR OF POSITIVE-DISPLACEMENT TYPE, e.g. STEAM ENGINES
│   ├── ROTARY-PISTON OR OSCILLATING-PISTON MACHINES OR ENGINES
│   ├── NON-POSITIVE DISPLACEMENT MACHINES OR ENGINES, e.g. STEAM TURBINES
│   ├── STEAM ENGINE PLANTS; STEAM ACCUMULATORS; ENGINE PLANTS NOT OTHERWISE PROVIDED FOR; ENGINES USING SPECIAL WORKING FLUIDS OR CYCLES
│   ├── CYCLICALLY OPERATING VALVES FOR MACHINES OR ENGINES
│   ├── LUBRICATING OF MACHINES OR ENGINES IN GENERAL; LUBRICATING INTERNAL COMBUSTION ENGINES; CRANKCASE VENTILATING
│   ├── GAS-FLOW SILENCERS OR EXHAUST APPARATUS FOR MACHINES OR ENGINES IN GENERAL; GAS-FLOW SILENCERS OR EXHAUST APPARATUS FOR INTERNAL COMBUSTION ENGINES
│   └── COOLING OF MACHINES OR ENGINES IN GENERAL; COOLING OF INTERNAL-COMBUSTION ENGINES
├── COMBUSTION ENGINES; HOT-GAS OR COMBUSTION-PRODUCT ENGINE PLANTS
│

## Parsing rules

In [None]:
# PHYSICS
# ├── MEASURING; TESTING
# │   ├── MEASURING LENGTH, THICKNESS OR SIMILAR LINEAR DIMENSIONS; MEASURING ANGLES; MEASURING AREAS; MEASURING IRREGULARITIES OF SURFACES OR CONTOURS

### split titles by semicolon

NOTE: we prefer not to split the content on "OR", because the parts connected by "OR" together form one overall concept or one plausible event.

### Examples with e.g.

In [None]:
# ├── PHOTOGRAPHY; CINEMATOGRAPHY; ANALOGOUS TECHNIQUES USING WAVES OTHER THAN OPTICAL WAVES; ELECTROGRAPHY; HOLOGRAPHY
# │   ├── APPARATUS OR ARRANGEMENTS FOR TAKING PHOTOGRAPHS OR FOR PROJECTING OR VIEWING THEM; APPARATUS OR ARRANGEMENTS EMPLOYING ANALOGOUS TECHNIQUES USING WAVES OTHER THAN OPTICAL WAVES; ACCESSORIES THEREFOR
# │   ├── PHOTOMECHANICAL PRODUCTION OF TEXTURED OR PATTERNED SURFACES, e.g. FOR PRINTING, FOR PROCESSING OF SEMICONDUCTOR DEVICES; MATERIALS THEREFOR; ORIGINALS THEREFOR; APPARATUS SPECIALLY ADAPTED THEREFOR;

In [11]:
# ├── MEASURING; TESTING
# │   ├── SCANNING-PROBE TECHNIQUES OR APPARATUS; APPLICATIONS OF SCANNING-PROBE TECHNIQUES, e.g. SCANNING PROBE MICROSCOPY [SPM]

In [None]:
# ├── MUSICAL INSTRUMENTS; ACOUSTICS
# │   ├── REPRESENTATION OF MUSIC; RECORDING MUSIC IN NOTATION FORM; ACCESSORIES FOR MUSIC OR MUSICAL INSTRUMENTS NOT OTHERWISE PROVIDED FOR, e.g. SUPPORTS

1. "e.g." followed by examples that start with a preposition ==> add the "e.g." content after the main title as hyponym nodes? (open question)
2. The contents of [] are usually abbreviations 

NOTE: an "e.g." clause belongs only to the closest part of the title (after splitting on ";") and acts as a supplement to that part.

### Remove trash titles

NOTE: these are titles whose content is "SUBJECT MATTER NOT OTHERWISE PROVIDED FOR IN THIS SECTION".

In [None]:
# ├── NUCLEAR PHYSICS; NUCLEAR ENGINEERING
# │   └── TECHNIQUES FOR HANDLING PARTICLES OR IONISING RADIATION NOT OTHERWISE PROVIDED FOR; IRRADIATION DEVICES; GAMMA RAY OR X-RAY MICROSCOPES
# └── SUBJECT MATTER NOT OTHERWISE PROVIDED FOR IN THIS SECTION

## problems to solve:

In [None]:
# ├── CONTROLLING; REGULATING
# │   ├── CONTROL OR REGULATING SYSTEMS IN GENERAL; FUNCTIONAL ELEMENTS OF SUCH SYSTEMS; MONITORING OR TESTING ARRANGEMENTS FOR SUCH SYSTEMS OR ELEMENTS

Some tricky keywords, like "SUCH".

In [12]:
# ├── EDUCATION; CRYPTOGRAPHY; DISPLAY; ADVERTISING; SEALS
# │   ├── DISPLAYING; ADVERTISING; SIGNS; LABELS OR NAME-PLATES; SEALS

Repeated words between a title and its hypernym:

e.g. "SEALS" in the previous example.

In [None]:
# ├── ENGINEERING ELEMENTS AND UNITS; GENERAL MEASURES FOR PRODUCING AND MAINTAINING EFFECTIVE FUNCTIONING OF MACHINES OR INSTALLATIONS; THERMAL INSULATION IN GENERAL
# │   ├── DEVICES FOR FASTENING OR SECURING CONSTRUCTIONAL ELEMENTS OR MACHINE PARTS TOGETHER, e.g. NAILS, BOLTS, CIRCLIPS, CLAMPS, CLIPS, WEDGES, JOINTS OR JOINTING

In [None]:
# │   ├── INDEXING SCHEME ASSOCIATED WITH SUBCLASSES F21K, F21L, F21S and F21V, RELATING TO USES OR APPLICATIONS OF LIGHTING DEVICES OR SYSTEMS

In [None]:
# ├── WEAPONS
# │   ├── FUNCTIONAL FEATURES OR DETAILS COMMON TO BOTH SMALLARMS AND ORDNANCE, e.g. CANNONS; MOUNTINGS FOR SMALLARMS OR ORDNANCE
# │   ├── APPARATUS FOR LAUNCHING PROJECTILES OR MISSILES FROM BARRELS, e.g. CANNONS