This notebook explores the possible ways of parsing CPC titles into a title–hyponym structure.

# Data preprocessing

In [2]:
from anytree import Node, RenderTree
import re 
import pandas as pd

In [11]:
# Default `max_level` used by read_label_file(); must be a key of dict_lvl
# for the level filter to be applied.
TARGET_LEVEL = 6

# Maps a `max_level` value to the largest `lvl` value that is kept.
# Keys 1, 3 and 4 are the title lengths that convert_to_int() maps to
# -3, -2 and -1 respectively.
dict_lvl = {
    1: -3,
    3: -2,
    4: -1,
    6: 0,
}

def convert_to_int(x):
    """
    Return the hierarchy level of one row of the label table.

    `x` is a row-like mapping with 'lvl' and 'title' entries. If 'lvl' can be
    converted with int(), that integer is returned. If int() raises
    ValueError, the level is derived from the length of 'title' instead:
    1 character -> -3, 3 characters -> -2, 4 characters -> -1.
    Any other title length yields None.
    """
    raw_level = x['lvl']
    try:
        return int(raw_level)
    except ValueError:
        # No usable numeric level: fall back to the length of the title.
        level_by_title_length = {1: -3, 3: -2, 4: -1}
        return level_by_title_length.get(len(x['title']))

def clean_descr(description):
    """
    Normalise a title description.

    1. remove '{' and '}' (the text between them is kept)
    2. remove parenthesised text, such as the references in
       "... (preserving A23B; obtaining protein compositions for foodstuffs A23J1/00)",
       together with one space directly in front of the opening parenthesis.
       Every parenthesised group is removed, whether or not it contains a
       CPC code.
    3. remove "i.e." and everything after it on the same line, including
       the spaces/commas directly in front of it
    4. lower-case the result

    Parenthesised groups are removed one at a time, so text that sits
    between two separate groups is preserved, e.g.
    "Ploughs (A01B3/00); Harrows (A01B19/00)" -> "ploughs; harrows".
    A single greedy match from the first "(" to the last ")" would also
    delete "; Harrows".

    :param description: the raw description string
    :return: the cleaned, lower-cased description
    """
    description = description.replace('{', '').replace('}', '')

    # [^()]* only matches innermost groups; repeating until nothing is left
    # to remove also strips nested parentheses from the inside out.
    paren_group = re.compile(r' ?\([^()]*\)')
    removed = 1
    while removed:
        description, removed = paren_group.subn('', description)

    description = re.sub(r'[ ,]*i\.e\..*', '', description)
    return description.lower()

def has_cpc(description):
    """
    4. tell whether a title description contains something shaped like a CPC code

    The pattern is searched anywhere in the string: one letter, two digits,
    an optional letter, then optional digits, slashes and digits. Upper and
    lower case letters are both accepted.

    e.g. "having alternatively specified atoms bound to the phosphorus atom
    and not covered by a single one of groups A01N57/10, A01N57/18,
    A01N57/26, A01N57/34" -> True
    """
    cpc_like = re.compile(r'[A-Za-z]{1}[0-9]{2}[A-Za-z]{0,1}[0-9]*[\/]*[0-9]*')
    return cpc_like.search(description) is not None

def is_trash_title(description):
    """
    5. tell whether a title description is a "trash" (catch-all) title

    The check is case-insensitive. A description is trash when it
    - contains both "subject matter not" and "provided for in", or
    - contains "covered by", " other " and one of " subclass " / " groups ", or
    - contains "dummy group".
    """
    text = description.lower()

    not_provided_for = "subject matter not" in text and "provided for in" in text
    residual_group = (
        "covered by" in text
        and (" subclass " in text or " groups " in text)
        and " other " in text
    )
    dummy_group = "dummy group" in text

    return not_provided_for or residual_group or dummy_group

def next_same_lvl_index(subdf, lvl):
    """
    Return the index label of the last row of the subtree that starts at the
    first row of `subdf`.

    The subtree consists of the first row plus all directly following rows
    whose 'lvl' is greater than `lvl`. It ends just before the first later
    row whose 'lvl' is less than or equal to `lvl` (a sibling OR a row
    higher up in the hierarchy). If no such row exists, the subtree runs to
    the end of `subdf`.

    Looking only for the next row with exactly the same level is not enough:
    a shallower row may come first and must not be counted as part of the
    subtree, and a node that is the last of its level still owns the deeper
    rows that follow it.

    :param subdf: dataframe slice with a 'lvl' column whose first row is the subtree root
    :param lvl: level of that first row
    :return: index label of the last row belonging to the subtree
    :raises IndexError: if `subdf` is empty
    """
    # ends_subtree[k] describes the row at position k + 1 of subdf.
    ends_subtree = subdf['lvl'].to_numpy()[1:] <= lvl
    if ends_subtree.any():
        # The first True sits at position k, i.e. row k + 1 closes the
        # subtree, so row k is the last row that still belongs to it.
        return subdf.index[int(ends_subtree.argmax())]
    return subdf.index[-1]

def rm_title_with_subtree(dataframe):
    """
    6. drop every flagged title together with the titles of its subtree

    A row is flagged when has_cpc() or is_trash_title() is true for its
    'description'. If the flagged row is the last row, or the next row's
    'lvl' is not greater than its own, only that row is dropped. Otherwise
    next_same_lvl_index() supplies the last index of the subtree and the
    whole range is dropped.

    :param dataframe: frame with 'description' and 'lvl' columns
    :return: a new dataframe without the dropped rows, re-indexed from 0
    :raises ValueError: if the reported subtree end lies before the flagged row
    """
    # Renumber rows 0..n-1 so index labels can be used as positions below.
    dataframe = dataframe.reset_index(drop=True)
    descriptions = dataframe['description']
    flagged = dataframe[descriptions.apply(has_cpc) | descriptions.apply(is_trash_title)].index
    last_row = dataframe.shape[0] - 1

    idx_to_drop = []
    for start in flagged:
        start_lvl = dataframe.iloc[start]['lvl']

        # No following row, or the following row is not deeper: nothing
        # hangs below this title.
        if start == last_row or start_lvl >= dataframe.iloc[start + 1]['lvl']:
            idx_to_drop.append(start)
            continue

        end = next_same_lvl_index(dataframe[start:], start_lvl)
        if end >= start:
            # range(start, start + 1) covers the single-row case as well.
            idx_to_drop.extend(range(start, end + 1))
        else:
            raise ValueError("Problem with subtree index!")

    return dataframe.drop(idx_to_drop).reset_index(drop=True)

def read_label_file(file_name, max_level=TARGET_LEVEL):
    """
    Read a label file and return the cleaned title table.

    The file is read as tab-separated text without a header row; its three
    columns are named 'title', 'lvl' and 'description' and are loaded as
    strings.

    Processing steps:
    1. 'lvl' is converted to an integer with convert_to_int().
    2. If `max_level` is a key of dict_lvl, only rows with
       lvl <= dict_lvl[max_level] are kept; for any other value no level
       filter is applied.
    3. Descriptions are normalised with clean_descr().
    4. Flagged titles and their subtrees are removed with rm_title_with_subtree().
    5. Rows that still contain missing values are dropped and the index is reset.

    :param file_name: path of the label file
    :param max_level: level selector, see step 2
    :return: dataframe with columns 'title', 'lvl', 'description'
    """
    df = pd.read_csv(file_name, header=None, sep='\t', dtype=object, names=['title', 'lvl', 'description'])
    df['lvl'] = df.apply(convert_to_int, axis=1)

    # Use the keys of dict_lvl rather than a second hard-coded list, so the
    # two cannot drift apart.
    if max_level in dict_lvl:
        # .copy() makes the filtered frame independent, so the column
        # assignment below does not trigger SettingWithCopyWarning.
        df = df[df['lvl'] <= dict_lvl[max_level]].copy()

    df['description'] = df['description'].apply(clean_descr)
    df = rm_title_with_subtree(df)
    return df.dropna().reset_index(drop=True)

def find_father(df, child_lvl):
    """
    Return the title of the father of a node at level `child_lvl`.

    The father is the last row of `df` whose 'lvl' is exactly one less than
    `child_lvl`. Returns None when `df` has no such row.
    """
    candidates = df[df['lvl'] + 1 == child_lvl]
    if candidates.empty:
        return None
    # Rows are in document order, so the last candidate is the closest one.
    return candidates['title'].iloc[-1]

def build_tree(df):
    """
    Build an anytree hierarchy from a title table and return its root node.

    The first row of `df` becomes the root. Every following row becomes a
    Node named after its 'description' and is attached to the most recent
    preceding row whose 'lvl' is exactly one less than its own.

    Rows are processed by position in a single pass, so the result does not
    depend on the index labels of `df`, and the preceding rows are not
    rescanned for every node.

    :param df: dataframe with 'title', 'lvl' and 'description' columns, in document order
    :return: the root Node
    :raises KeyError: if `df` is empty, or if a row has no preceding row one level above it
    """
    records = list(zip(df['title'], df['lvl'], df['description']))
    if not records:
        raise KeyError("cannot build a tree from an empty dataframe")

    _root_title, root_lvl, root_desc = records[0]
    root = Node(root_desc)

    # Most recently created node for each level; the father of a row is the
    # entry stored for (row level - 1).
    latest_at_level = {root_lvl: root}

    for child_title, child_lvl, child_desc in records[1:]:
        father = latest_at_level.get(child_lvl - 1)
        if father is None:
            raise KeyError(
                "no row at level %s precedes %r (level %s)"
                % (child_lvl - 1, child_title, child_lvl)
            )
        latest_at_level[child_lvl] = Node(child_desc, parent=father)

    return root

In [12]:
# Load the label file of every section A-H. The files share one naming
# pattern, so the paths are generated from a template instead of being
# written out eight times.
CPC_TITLES_TEMPLATE = 'cpc-titles/cpc-section-{section}_20220201.txt'
section_dfs = {
    section: read_label_file(CPC_TITLES_TEMPLATE.format(section=section))
    for section in 'ABCDEFGH'
}

# Keep the per-section names that the following cells refer to.
A_df, B_df, C_df, D_df, E_df, F_df, G_df, H_df = (
    section_dfs[section] for section in 'ABCDEFGH'
)

In [13]:
# Display the parsed section-A table (columns: title, lvl, description).
A_df

Unnamed: 0,title,lvl,description
0,A,-3,human necessities
1,A01,-2,agriculture; forestry; animal husbandry; hunti...
2,A01B,-1,soil working in agriculture or forestry; parts...
3,A01B1/00,0,hand tools
4,A01B3/00,0,ploughs with fixed plough-shares
...,...,...,...
1274,A63J21/00,0,conjuring appliances; auxiliary apparatus for ...
1275,A63J25/00,0,equipment specially adapted for cinemas
1276,A63K,-1,racing; riding sports; equipment or accessorie...
1277,A63K1/00,0,race-courses; race-tracks


In [14]:
# Build the section-A tree once, then print one line per node: the
# box-drawing prefix supplied by RenderTree followed by the node's name
# (the title description).
section_a_tree = build_tree(A_df)
for pre, fill, node in RenderTree(section_a_tree):
    print("%s%s" % (pre, node.name))

human necessities
├── agriculture; forestry; animal husbandry; hunting; trapping; fishing
│   ├── soil working in agriculture or forestry; parts, details, or accessories of agricultural machines or implements, in general
│   │   ├── hand tools
│   │   ├── ploughs with fixed plough-shares
│   │   ├── ploughs with rolling non-driven tools, e.g. discs
│   │   ├── disc-like soil-working implements usable either as ploughs or as harrows, or the like
│   │   ├── ploughs with rotary driven tools
│   │   ├── ploughs with oscillating, digging or piercing tools driven or not
│   │   ├── ploughs or like machines for special purposes
│   │   ├── elements, tools, or details of ploughs
│   │   ├── ploughs with special additional arrangements, e.g. means for putting manure under the soil, clod-crushers; means for breaking the subsoil
│   │   ├── harrows with non-rotating tools
│   │   ├── harrows with rotary non-driven tools
│   │   ├── elements, tools, or details of harrows
│   │   ├── harrows with 

## Parsing rules

### 1. split titles by semicolon [x]

NOTE: here, we prefer not to split the content by "OR", because all the parts connected by OR eventually form an overall concept or a plausible event.

In [57]:
# PHYSICS
# ├── MEASURING; TESTING
# │   ├── MEASURING LENGTH, THICKNESS OR SIMILAR LINEAR DIMENSIONS; MEASURING ANGLES; MEASURING AREAS; MEASURING IRREGULARITIES OF SURFACES OR CONTOURS

TODO: if we do not split on "or", will this cause problems for retrieval?
fasttext

### 2. split titles with examples (with e.g.) [x]

In [58]:
# ├── PHOTOGRAPHY; CINEMATOGRAPHY; ANALOGOUS TECHNIQUES USING WAVES OTHER THAN OPTICAL WAVES; ELECTROGRAPHY; HOLOGRAPHY
# │   ├── APPARATUS OR ARRANGEMENTS FOR TAKING PHOTOGRAPHS OR FOR PROJECTING OR VIEWING THEM; APPARATUS OR ARRANGEMENTS EMPLOYING ANALOGOUS TECHNIQUES USING WAVES OTHER THAN OPTICAL WAVES; ACCESSORIES THEREFOR
# │   ├── PHOTOMECHANICAL PRODUCTION OF TEXTURED OR PATTERNED SURFACES, e.g. FOR PRINTING, FOR PROCESSING OF SEMICONDUCTOR DEVICES; MATERIALS THEREFOR; ORIGINALS THEREFOR; APPARATUS SPECIALLY ADAPTED THEREFOR;

# ├── MUSICAL INSTRUMENTS; ACOUSTICS
# │   ├── REPRESENTATION OF MUSIC; RECORDING MUSIC IN NOTATION FORM; ACCESSORIES FOR MUSIC OR MUSICAL INSTRUMENTS, e.g. SUPPORTS

In [1]:
# ├── MEASURING; TESTING
# │   ├── SCANNING-PROBE TECHNIQUES OR APPARATUS; APPLICATIONS OF SCANNING-PROBE TECHNIQUES, e.g. SCANNING PROBE MICROSCOPY [SPM]

# Should abbreviations (e.g. [SPM]) be stored in a new, separate column?

e.g. + examples starts with preposition (for) ==> add the e.g. content after the main title as hyponym nodes / v.ed   

    NOTE: an "e.g." clause only supplements the closest preceding part of the title (after splitting on ";").


Construction or design features not otherwise provided for; manufacturing or production;

remove titles with "not provided for" inside

In [70]:
# │   │   │   │   └── Component parts, details, e.g. sealings, lubrication
# │   │   │   │       ├── Cylinders
# │   │   │   │       ├── Actuating or actuated elements
# │   │   │   │       │   └── Actuating or actuated element bearing means or driving or driven axis bearing means

"sealings, lubrication" will be at the same level as "Cylinders"

### 3. with "such" in title [x]

In [None]:
# │       ├── patterns for cutting-out; methods of drafting or marking-out such patterns

In [62]:
# ├── CONTROLLING; REGULATING
# │   ├── CONTROL OR REGULATING SYSTEMS IN GENERAL; FUNCTIONAL ELEMENTS OF SUCH SYSTEMS; MONITORING OR TESTING ARRANGEMENTS FOR SUCH SYSTEMS OR ELEMENTS

remove "in general" at the end of title

In [None]:
    # │   ├── exercising apparatus combining several parts such as ladders, rods, beams, slides

keep titles containing "such as" or "such a", and also those where the noun that "such" refers to has already appeared earlier in the same title

In [None]:
# │   │   │   ├── filters formed by clamping together several filtering elements or parts of such elements

### 4. child nodes should not repeat what the parent node already contains [x]

In [63]:
# ├── EDUCATION; CRYPTOGRAPHY; DISPLAY; ADVERTISING; SEALS
# │   ├── DISPLAYING; ADVERTISING; SIGNS; LABELS OR NAME-PLATES; SEALS

repeated words between a title and its hypernym

SEALS in the previous example

### 5. if title starts with preposition ==> connect with parent node [x]

In [69]:
# │   │   │   │   └── Controlling
# │   │   │   │       ├── by using a valve in a system with several pump or motor chambers, wherein the flow path through the chambers can be changed, e.g. series-parallel
# │   │   │   │       ├── by changing the effective cross sectional piston working surface
# │   │   │   │       ├── by changing the effective piston stroke
# │   │   │   │       │   └── by changing the excentricity of one element relative to another element

TODO: 

1. database schema

2. merge with wikipedia 