# Demo Notebook on U.S. Patent Phrase-to-Phrase Matching 

## Configuration

### Directories

In [1]:
# Folder holding the competition CSV files (train.csv is read from here).
INPUT_DIR = 'INPUT_data/'
# Folder holding the CPC scheme XML files and the CPC title lists.
CPC_DIR = 'CPC_data/'
# Output folder — presumably where artifacts are written; not used in the cells shown here.
OUTPUT_DIR = "./"

### Training configuration

In [2]:
class CFG:
    """Namespace of notebook-wide settings, read as plain class attributes (``CFG.<name>``)."""

    # -- disabled experiment-tracking options, kept for reference --
    # wandb = True
    # competition = 'PPPM'
    # _wandb_kernel = 'nakama'

    # -- run-time switches --
    debug = False
    apex = True
    print_freq = 100
    num_workers = 4

    # -- backbone checkpoint (commented-out alternative: "microsoft/deberta-v3-large") --
    model = "sentence-transformers/bert-base-nli-mean-tokens"

    # -- learning-rate schedule; `scheduler` is one of ['linear', 'cosine'] --
    scheduler = 'cosine'
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0

    # -- training hyper-parameters --
    epochs = 4
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    batch_size = 16
    fc_dropout = 0.2
    target_size = 1
    max_len = 512
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    # -- seeding and cross-validation folds --
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    train = True


# Debug mode overrides the settings above: two epochs, fold 0 only.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0]

## Libraries

General Libraries

In [3]:
import os
import re
import time
import math
import pickle
import random
import warnings
warnings.filterwarnings("ignore")

Scientific Libraries

In [4]:
import scipy as sp
import numpy as np
import pandas as pd
# Raise pandas' display limits so long / wide frames are shown with less truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
#import torch

## Data Loading

### Input Dataset

In [5]:
# ----------------------------------------------------
# Data loading: read train.csv from INPUT_DIR, then
# report its shape and preview the first rows.
# ----------------------------------------------------

train = pd.read_csv(f"{INPUT_DIR}train.csv")
print(f"train.shape: {train.shape}")
display(train.head())

train.shape: (36473, 5)


Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


### CPC Data

In [6]:
# ====================================================
# CPC (Cooperative Patent Classification) Data
# ====================================================

def _cpc_title(code, text):
    """Return the title that follows ``<code><TAB><TAB>`` in a CPC title-list text.

    A capture group is used instead of ``str.lstrip(pattern)``: ``lstrip``
    removes a *set of characters*, so it also ate leading title characters
    that happened to occur in the code (e.g. the "A" of "AGRICULTURE" for A01).

    Raises
    ------
    IndexError
        If no line for ``code`` is present in ``text``.
    """
    matches = re.findall(f'{re.escape(code)}\t\t(.+)', text)
    if not matches:
        raise IndexError(f"no title found for CPC code {code!r}")
    return matches[0]


def get_cpc_texts(file_path=None, write=False):
    """Load or build the mapping from CPC context code to its descriptive text.

    Parameters
    ----------
    file_path : str or None
        Location of a CSV cache (code in the first column, text in the second,
        no header). With ``write=False`` the mapping is read from this file.
        With ``write=True`` the freshly parsed mapping is saved to it.
    write : bool
        When true, parse the raw CPC files under ``CPC_DIR`` and save the
        result to ``file_path``.

    Returns
    -------
    dict
        ``{context_code: "<section title>. <class title>"}`` (e.g. key "A47").
        Titles are returned as found in the title list; no case folding is done.

    Raises
    ------
    ValueError
        If ``write`` is true but ``file_path`` is missing.
    IndexError
        If a section or context code has no entry in its title list.
    """
    if write and not file_path:
        raise ValueError("file_path is required when write=True")

    if file_path and not write:
        # `read_csv(squeeze=True)` was removed in pandas 2.0; squeezing the
        # single value column afterwards is the documented replacement.
        cached = pd.read_csv(file_path, header=None, index_col=0)
        return cached.squeeze("columns").to_dict()

    # Context codes (one letter followed by digits) are taken from the scheme XML file names.
    contexts = sorted({
        code
        for file_name in os.listdir(CPC_DIR + 'CPCSchemeXML202208')
        for code in re.findall(r'[A-Z]\d+', file_name)
    })

    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        # Use CPC_DIR here as well, so both lookups honour the configured folder.
        with open(f'{CPC_DIR}CPCTitleList202208/cpc-section-{cpc}_20220801.txt') as f:
            s = f.read()
        section_title = _cpc_title(cpc, s)
        for context in contexts:
            if context[0] == cpc:
                results[context] = section_title + ". " + _cpc_title(context, s)

    if write:
        # Persist the mapping just built (previously this referenced the
        # global `cpc_texts`, which does not exist on a fresh kernel).
        temp = pd.DataFrame.from_dict(results, orient='index')
        temp.to_csv(file_path, index=True, header=False)

    return results

# Parse the raw CPC files into a {context code: descriptive text} mapping.
cpc_texts = get_cpc_texts()

# Translate each row's context code into its lower-cased description.
train['context_text'] = (
    train['context']
    .map(cpc_texts)
    .apply(lambda title: title.lower())
)
display(train.head())

Unnamed: 0,id,anchor,target,context,score,context_text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,human necessities. furniture; domestic article...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,human necessities. furniture; domestic article...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,human necessities. furniture; domestic article...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,human necessities. furniture; domestic article...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,human necessities. furniture; domestic article...


### Additional Features: Text

In [7]:
# Model input string: anchor, target and context description joined by the literal '[SEP]' marker.
train['text'] = (
    train['anchor']
    + '[SEP]' + train['target']
    + '[SEP]' + train['context_text']
)

### Additional Features: Unique Targets for same Anchor

In [8]:
# targets_for_same_anchor -> every target seen with a given anchor (duplicates kept), indexed by anchor
extra = train.groupby('anchor').target.agg(list).rename('targets_for_same_anchor')

# diff_targets_from_same_anchor -> comma-separated targets that share the row's anchor,
# excluding the row's own target, de-duplicated in first-occurrence order.
# The value is computed straight from `extra`, so no helper column is joined onto `train`
# and none has to be dropped afterwards. The earlier version mixed up three column names
# and rebound `train` to None via `train = train.drop(..., inplace=True)`.
train['diff_targets_from_same_anchor'] = [
    ', '.join(dict.fromkeys(t for t in anchor_targets if t != target))
    for anchor_targets, target in zip(train['anchor'].map(extra), train['target'])
]

### Additional Features: List of targets for the same anchor and context

In [9]:
# same_anchor_context -> List of targets for the same anchor and context
train = train.join(train.groupby(['anchor', 'context']).target.agg(list).rename('same_anchor_context'), on=['anchor', 'context'])
train['same_anchor_context'] = train.apply(lambda x: ', '.join([i for i in x['same_anchor_context'] if i != x['target']]), axis=1)

### Additional Features: List of unique anchors for the same context

In [10]:
# anchor_list -> List of unique anchors for the same context
train = train.join(train.groupby('context').anchor.agg('unique').rename('anchor_list'), on='context')
train['anchor_list'] = train.apply(lambda x:', '.join([i for i in x['anchor_list'] if i != x['anchor']]), axis=1)

In [11]:
# Preview the first five rows, now including the engineered feature columns.
train.head(n=5)

Unnamed: 0,id,anchor,target,context,score,context_text,text,diff_targets_from_same_anchor,same_anchor_context,anchor_list
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,human necessities. furniture; domestic article...,abatement[SEP]abatement of pollution[SEP]human...,"act of abating, active catalyst, eliminating p...","act of abating, active catalyst, eliminating p...","adhesive mounting, automatic coffee, carpet ti..."
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,human necessities. furniture; domestic article...,abatement[SEP]act of abating[SEP]human necessi...,"abatement of pollution, active catalyst, elimi...","abatement of pollution, active catalyst, elimi...","adhesive mounting, automatic coffee, carpet ti..."
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,human necessities. furniture; domestic article...,abatement[SEP]active catalyst[SEP]human necess...,"abatement of pollution, act of abating, elimin...","abatement of pollution, act of abating, elimin...","adhesive mounting, automatic coffee, carpet ti..."
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,human necessities. furniture; domestic article...,abatement[SEP]eliminating process[SEP]human ne...,"abatement of pollution, act of abating, active...","abatement of pollution, act of abating, active...","adhesive mounting, automatic coffee, carpet ti..."
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,human necessities. furniture; domestic article...,abatement[SEP]forest region[SEP]human necessit...,"abatement of pollution, act of abating, active...","abatement of pollution, act of abating, active...","adhesive mounting, automatic coffee, carpet ti..."
