In [1]:
import os
from tqdm import tqdm
import warnings, transformers, logging, torch
import pandas as pd
import numpy as np
import shutil
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Empty string unless KAGGLE_KERNEL_RUN_TYPE is set in the environment; used
# as a truthy flag by CFG to choose the data directory.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

In [3]:
# Presumably read by the Trainer's Weights & Biases integration to turn it off
# (report_to='none' is also passed to TrainingArguments further down).
os.environ["WANDB_DISABLED"] = "true"
# Suppress every Python warning for the rest of the session.
warnings.simplefilter('ignore')
# Disable all logging calls of severity WARNING and below, process-wide.
# NOTE(review): together these also hide tokenizer/Trainer warnings (e.g. about
# over-long sequences or deprecated arguments) — consider scoping them.
logging.disable(logging.WARNING)

In [4]:
class CFG:
    """Notebook-wide configuration: data location, checkpoint name and hyper-parameters."""

    # On Kaggle the competition data sits under ../input. Elsewhere the
    # directory can be overridden with the PATENT_DATA_DIR environment variable
    # so the notebook is not tied to one machine's home directory; the previous
    # hard-coded location remains the default.
    if iskaggle:
        input_path = '../input/us-patent-phrase-to-phrase-matching'
    else:
        input_path = os.environ.get(
            'PATENT_DATA_DIR',
            '/home/bhavik/projects/kaggle-patent-phrase-matching/data',
        )

    # Checkpoint name passed to the tokenizer and model `from_pretrained` calls.
    model_path = 'anferico/bert-for-patents'
    # model_path = 'microsoft/deberta-v3-base'

    learning_rate = 1e-5
    weight_decay = 0.01
    num_fold = 5  # not referenced by the cells shown in this notebook
    epochs = 6
    batch_size = 16  # per-device train batch size; evaluation uses twice this

    # Placeholder only: a later cell overwrites it with the length measured
    # from the tokenized training inputs.
    max_len = 133

In [5]:
# Load the labelled phrase pairs and the CPC code -> title lookup table.
train_raw = pd.read_csv(f"{CFG.input_path}/train.csv")
titles = pd.read_csv(f"{CFG.input_path}/titles.csv")

# Attach the CPC title to each pair. `validate` makes pandas raise if a code
# occurs more than once in `titles` (which would duplicate training rows), and
# the assertion catches contexts without a matching title, which the default
# inner join would otherwise drop silently.
train_df = train_raw.merge(
    titles, left_on='context', right_on='code', validate='many_to_one'
)
assert len(train_df) == len(train_raw), (
    "some training rows have a context that is missing from titles.csv"
)

In [6]:
# Preview the merged frame (training columns plus the joined title columns).
train_df.head(25)

Unnamed: 0,id,anchor,target,context,score,code,title,section,class,subclass,group,main_group
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
5,067203128142739c,abatement,greenhouse gases,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
6,061d17f04be2d1cf,abatement,increased rate,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
7,e1f44e48399a2027,abatement,measurement level,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
8,0a425937a3e86d10,abatement,minimising sounds,A47,0.5,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
9,ef2d4c2e6bbb208d,abatement,mixing core materials,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,


In [7]:
# Marker tokens registered as special tokens so that each one is kept as a
# single token (the tokenized example further down shows '[cpc]' intact).
special_tokens = ['[abstract]', '[claim]', '[summary]', '[invention]', '[cpc]']
tokenizer = AutoTokenizer.from_pretrained(
    CFG.model_path, additional_special_tokens=special_tokens
)

In [8]:
# Build the text fed to the model: anchor and target separated by [SEP],
# followed by the CPC title introduced by the [cpc] marker.
# Other layouts that were tried are kept as comments for reference.
# train_df['input'] = train_df.anchor + '[cpc]' + train_df.title + '[SEP]' + train_df.target  # bert patents
train_df['input'] = (
    train_df['anchor'] + '[SEP]' + train_df['target'] + '[cpc]' + train_df['title']
)  # bert patents (best val score so far)
# train_df['input'] = train_df['title'] + ' ' + train_df['anchor']  # deberta
# train_df['input'] = train_df.title + '[SEP]' + train_df.anchor + '[SEP]' + train_df.target # deberta

BERT for Patents has special tokens that can be used to mark specific parts of a patent in the input. With the `[cpc]` token, the context text can be attached to the phrase pair in either of the following input structures:

[CLS]anchor[cpc]context_text[SEP]target[SEP]

or

[CLS]anchor[SEP]target[cpc]context_text[SEP]

In [9]:
# ====================================================
# Define max_len
# ====================================================
# Tokenize all inputs in one batched call and record each sequence's length
# *without* the tokens the tokenizer adds automatically.
token_lengths = [
    len(ids)
    for ids in tokenizer(train_df['input'].tolist(), add_special_tokens=False)['input_ids']
]

# The tokenizer itself adds only [CLS] and the trailing [SEP] (2 tokens); the
# in-text [SEP] and [cpc] markers are already counted above. The other 2 of
# the +4 are slack, kept so the padded length matches earlier runs.
max_len = max(token_lengths) + 4
CFG.max_len = max_len
max_len

70

In [10]:
# Collect the distinct anchor phrases and shuffle them with a fixed seed so the
# anchor-level train/validation split below is repeatable.
anchors = train_df.anchor.unique()
np.random.seed(42)
np.random.shuffle(anchors)  # shuffles the array in place
anchors[:5]

array(['respective emitter', 'dental aligners', 'committee',
       'form trench isolation', 'pulping apparatus'], dtype=object)

In [11]:
# Hold out a fraction of the distinct anchors (not of the rows) for validation.
val_prop = 0.25
val_sz = int(len(anchors)*val_prop)
# `anchors` has already been shuffled, so its first val_sz entries form the
# validation group.
val_anchors = anchors[:val_sz]

In [12]:
# Boolean row mask: True where the row's anchor belongs to the validation group.
is_val = np.isin(train_df['anchor'], val_anchors)

# Positional row indices of each side of the split.
idxs = np.arange(len(train_df))
val_idxs, trn_idxs = idxs[is_val], idxs[~is_val]
len(val_idxs), len(trn_idxs)

(9418, 27055)

In [13]:
# Rows whose anchor was assigned to the validation group.
val_df = train_df[is_val]
len(val_df)

9418

In [14]:
# Remaining rows (anchors not in the validation group) are used for training.
training_df = train_df[~is_val]
len(training_df)

27055

In [15]:
# Compare the mean score of the two splits as a quick check that they are similar.
training_df.score.mean(), val_df.score.mean()

(0.35914803178710036, 0.37043427479294966)

In [16]:
def corr(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` is reshaped
            to one value per example before the correlation is taken.

    Returns:
        dict with a single key ``'pearson'`` holding the correlation coefficient.
    """
    preds, targets = eval_pred
    # Flatten to 1-D with one entry per example (raises if there is more than
    # one value per example).
    flat_preds = preds.reshape(len(preds))
    pearson_r = np.corrcoef(flat_preds, targets)[0, 1]
    return {'pearson': pearson_r}

In [17]:
# Tokenize the first training input as a sanity check of the input layout.
encoded = tokenizer(training_df.iloc[0]['input'])

In [18]:
# Show the token strings to confirm where [CLS], [SEP] and [cpc] end up.
tokenizer.convert_ids_to_tokens(encoded['input_ids'])

['[CLS]',
 'abatement',
 '[SEP]',
 'abatement',
 'of',
 'pollution',
 '[cpc]',
 'furniture',
 ';',
 'domestic',
 'articles',
 'or',
 'appliances',
 ';',
 'coffee',
 'mills',
 ';',
 'spice',
 'mills',
 ';',
 'suction',
 'cleaners',
 'in',
 'general',
 '[SEP]']

In [19]:
class TrainDataset(Dataset):
    """Dataset yielding one tokenized input text together with its score.

    Args:
        df: frame providing the ``input`` (model input text), ``target`` and
            ``score`` columns.

    Each item is the tokenizer's output for the input text, padded to
    ``CFG.max_len``, plus a ``'label'`` entry holding the score as float32.
    Relies on the notebook-level ``tokenizer`` and ``CFG`` objects.
    """

    def __init__(self, df):
        self.inputs = df['input'].values.astype(str)
        # Not read by __getitem__; kept so the attribute remains available for
        # the two-segment encoding variant, tokenizer(inputs, targets).
        self.targets = df['target'].values.astype(str)
        self.label = df['score'].values

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        encoding = tokenizer(
            self.inputs[item],
            max_length=CFG.max_len,
            padding="max_length",
            # Without truncation a text longer than max_len would come back
            # longer than the padded items and break fixed-size batching.
            truncation=True,
        )
        return {
            **encoding,
            'label': self.label[item].astype(np.float32),
        }

In [20]:
# Wrap the two splits so the Trainer can index them item by item.
train_dataset = TrainDataset(training_df)
val_dataset = TrainDataset(val_df)

In [21]:
# Training configuration; hyper-parameters come from CFG.
args = TrainingArguments(
    'outputs',  # output directory; removed again by the final cell
    save_strategy='no',  # do not write checkpoints
    learning_rate=CFG.learning_rate,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    # Mixed precision requires a CUDA device; fall back to full precision on
    # machines without one instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    evaluation_strategy="epoch",  # run validation after every epoch
    per_device_train_batch_size=CFG.batch_size,
    per_device_eval_batch_size=CFG.batch_size*2,
    num_train_epochs=CFG.epochs,
    weight_decay=CFG.weight_decay,
    report_to='none',
)

# num_labels=1: one output value per example, which corr() flattens to 1-D
# before computing the Pearson correlation.
model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)

model_trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=corr,
)

In [22]:
# Fine-tune for CFG.epochs epochs, reporting eval loss and Pearson correlation
# on the validation split after each epoch.
# NOTE(review): in the recorded run eval_pearson peaked at epoch 4 (0.8265) and
# ended at 0.8235; with save_strategy='no' no earlier checkpoint is kept, so
# only the final-epoch weights remain in memory.
model_trainer.train()

  5%|▍         | 501/10146 [01:33<30:11,  5.32it/s]

{'loss': 0.1195, 'learning_rate': 4.9064039408867e-06, 'epoch': 0.3}


 10%|▉         | 1001/10146 [03:11<29:32,  5.16it/s]

{'loss': 0.0624, 'learning_rate': 9.832512315270937e-06, 'epoch': 0.59}


 15%|█▍        | 1501/10146 [04:47<27:40,  5.21it/s]

{'loss': 0.0439, 'learning_rate': 9.931119284220583e-06, 'epoch': 0.89}


                                                    
 17%|█▋        | 1691/10146 [05:45<29:27,  4.78it/s]

{'eval_loss': 0.029614370316267014, 'eval_pearson': 0.7806832944389764, 'eval_runtime': 21.4173, 'eval_samples_per_second': 439.737, 'eval_steps_per_second': 13.774, 'epoch': 1.0}


 20%|█▉        | 2001/10146 [06:46<26:45,  5.07it/s]   

{'loss': 0.0358, 'learning_rate': 9.716752288261592e-06, 'epoch': 1.18}


 25%|██▍       | 2500/10146 [08:25<25:47,  4.94it/s]

{'loss': 0.0311, 'learning_rate': 9.363141824089246e-06, 'epoch': 1.48}


 30%|██▉       | 3000/10146 [10:06<24:08,  4.93it/s]

{'loss': 0.0284, 'learning_rate': 8.880726843002818e-06, 'epoch': 1.77}


                                                    
 33%|███▎      | 3382/10146 [11:45<24:25,  4.61it/s]

{'eval_loss': 0.026231344789266586, 'eval_pearson': 0.8110260766375992, 'eval_runtime': 22.3184, 'eval_samples_per_second': 421.983, 'eval_steps_per_second': 13.218, 'epoch': 2.0}


 35%|███▍      | 3501/10146 [12:09<21:56,  5.05it/s]   

{'loss': 0.0252, 'learning_rate': 8.283748740559948e-06, 'epoch': 2.07}


 39%|███▉      | 4001/10146 [13:49<20:33,  4.98it/s]

{'loss': 0.0193, 'learning_rate': 7.5898309356623355e-06, 'epoch': 2.37}


 44%|████▍     | 4501/10146 [15:28<18:50,  4.99it/s]

{'loss': 0.0191, 'learning_rate': 6.8194586087697245e-06, 'epoch': 2.66}


 49%|████▉     | 5001/10146 [17:06<16:42,  5.13it/s]

{'loss': 0.0181, 'learning_rate': 5.995373957915085e-06, 'epoch': 2.96}


                                                    
 50%|█████     | 5073/10146 [17:42<17:44,  4.76it/s]

{'eval_loss': 0.022934822365641594, 'eval_pearson': 0.8225436328952992, 'eval_runtime': 21.4159, 'eval_samples_per_second': 439.766, 'eval_steps_per_second': 13.775, 'epoch': 3.0}


 54%|█████▍    | 5501/10146 [19:07<15:39,  4.94it/s]  

{'loss': 0.0147, 'learning_rate': 5.141904825196881e-06, 'epoch': 3.25}


 59%|█████▉    | 6000/10146 [20:47<14:05,  4.90it/s]

{'loss': 0.0118, 'learning_rate': 4.284246513397596e-06, 'epoch': 3.55}


 64%|██████▍   | 6501/10146 [22:25<11:51,  5.12it/s]

{'loss': 0.0123, 'learning_rate': 3.447717994254234e-06, 'epoch': 3.84}


                                                    
 67%|██████▋   | 6764/10146 [23:37<11:31,  4.89it/s]

{'eval_loss': 0.023637427017092705, 'eval_pearson': 0.826451532728362, 'eval_runtime': 21.3588, 'eval_samples_per_second': 440.943, 'eval_steps_per_second': 13.812, 'epoch': 4.0}


 69%|██████▉   | 7001/10146 [24:23<10:12,  5.13it/s]  

{'loss': 0.0108, 'learning_rate': 2.6570144658917406e-06, 'epoch': 4.14}


 74%|███████▍  | 7501/10146 [26:00<08:29,  5.19it/s]

{'loss': 0.0089, 'learning_rate': 1.9354783247068317e-06, 'epoch': 4.44}


 79%|███████▉  | 8001/10146 [27:38<06:54,  5.18it/s]

{'loss': 0.0085, 'learning_rate': 1.3044100733758757e-06, 'epoch': 4.73}


 83%|████████▎ | 8455/10146 [29:06<05:44,  4.91it/s]
 83%|████████▎ | 8455/10146 [29:28<05:44,  4.91it/s]

{'eval_loss': 0.02324233390390873, 'eval_pearson': 0.823762615859203, 'eval_runtime': 21.3773, 'eval_samples_per_second': 440.561, 'eval_steps_per_second': 13.8, 'epoch': 5.0}


 84%|████████▍ | 8501/10146 [29:37<05:18,  5.17it/s]  

{'loss': 0.0084, 'learning_rate': 7.824395077042063e-07, 'epoch': 5.03}


 89%|████████▊ | 9001/10146 [31:14<03:47,  5.04it/s]

{'loss': 0.0072, 'learning_rate': 3.8497574553977293e-07, 'epoch': 5.32}


 94%|█████████▎| 9501/10146 [32:51<02:06,  5.08it/s]

{'loss': 0.007, 'learning_rate': 1.2375233347365622e-07, 'epoch': 5.62}


 99%|█████████▊| 10001/10146 [34:29<00:28,  5.13it/s]

{'loss': 0.0069, 'learning_rate': 6.568716076514325e-09, 'epoch': 5.91}


100%|██████████| 10146/10146 [34:57<00:00,  4.81it/s]
100%|██████████| 10146/10146 [35:19<00:00,  4.79it/s]

{'eval_loss': 0.023296654224395752, 'eval_pearson': 0.8235458416009239, 'eval_runtime': 21.6038, 'eval_samples_per_second': 435.941, 'eval_steps_per_second': 13.655, 'epoch': 6.0}
{'train_runtime': 2119.3048, 'train_samples_per_second': 76.596, 'train_steps_per_second': 4.787, 'train_loss': 0.024693631124806645, 'epoch': 6.0}





TrainOutput(global_step=10146, training_loss=0.024693631124806645, metrics={'train_runtime': 2119.3048, 'train_samples_per_second': 76.596, 'train_steps_per_second': 4.787, 'train_loss': 0.024693631124806645, 'epoch': 6.0})

In [23]:
# Remove the Trainer's output directory. ignore_errors keeps the cell from
# failing when the directory was never created or has already been removed
# (for example when this cell is re-run).
shutil.rmtree('outputs', ignore_errors=True)