In [None]:
import os
from tqdm import tqdm
import warnings, transformers, logging, torch
import pandas as pd
import numpy as np
import shutil
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification

In [None]:
# Value of the KAGGLE_KERNEL_RUN_TYPE environment variable ('' when unset);
# its truthiness selects the Kaggle input path in CFG.
iskaggle = os.getenv('KAGGLE_KERNEL_RUN_TYPE', '')

In [None]:
# Quiet the run: ignore Python warnings, mute log records at WARNING level and
# below, and set the WANDB_DISABLED flag in the process environment.
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
class CFG:
    """Run configuration: data location, backbone checkpoint and training hyperparameters."""

    # On Kaggle the competition data is read from ../input. Elsewhere the
    # location can be overridden with the PATENT_DATA_DIR environment variable
    # instead of editing the notebook; it defaults to the original local path.
    if iskaggle:
        input_path = '../input/us-patent-phrase-to-phrase-matching'
    else:
        input_path = os.environ.get(
            'PATENT_DATA_DIR',
            '/home/bhavik/projects/kaggle-patent-phrase-matching/data',
        )

    # model_path = 'anferico/bert-for-patents'
    model_path = 'microsoft/deberta-v3-large'

    learning_rate = 2e-5
    weight_decay = 0.01
    num_fold = 5  # not referenced by the visible cells
    epochs = 5
    batch_size = 16

    # Placeholder: replaced by CFG.max_len = max_len once input lengths are measured.
    max_len = 133

In [None]:
# Read both CSV files, then attach to every training row the titles.csv row
# whose `code` equals the row's `context`.
titles = pd.read_csv(f"{CFG.input_path}/titles.csv")
train_df = pd.read_csv(f"{CFG.input_path}/train.csv").merge(
    titles,
    left_on='context',
    right_on='code',
)

In [None]:
# Preview the first 25 merged rows.
train_df.iloc[:25]

In [None]:
# Tokenizer matching the configured checkpoint.
# Kept for reference, option used in earlier experiments:
#   additional_special_tokens = ['[abstract]','[claim]','[summary]','[invention]','[cpc]']
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

In [None]:
# Earlier input layouts, kept for reference:
# train_df['input'] = train_df.anchor + '[cpc]' + train_df.title + '[SEP]' + train_df.target  # bert patents
# train_df['input'] = train_df.anchor + '[SEP]' + train_df.target + '[cpc]' + train_df.title  # bert patents (best val score so far)
# train_df['input'] = train_df['title'] + ' ' + train_df['anchor']  # deberta
# Active layout (deberta): title, anchor and target joined by a literal '[SEP]'.
train_df['input'] = (
    train_df['title']
    + '[SEP]'
    + train_df['anchor']
    + '[SEP]'
    + train_df['target']
)

BERT for Patents provides special tokens (e.g. `[abstract]`, `[claim]`, `[summary]`, `[invention]`, `[cpc]`) that mark specific parts of a patent. The input could be mapped to either of the following structures:

[CLS]anchor[cpc]context_text[SEP]target[SEP]

or

[CLS]anchor[SEP]target[cpc]context_text[SEP]

In [None]:
# ====================================================
# Define max_len
# ====================================================
# Number of token ids per input string, tokenized with add_special_tokens=False.
lenghts = [
    len(tokenizer(text, add_special_tokens=False)['input_ids'])
    for text in train_df['input']
]

# Longest input plus 4 slots of headroom (CLS + SEP + SEP + SEP).
max_len = 4 + max(lenghts)
CFG.max_len = max_len
max_len

In [None]:
# Shuffle the distinct anchors in place with a fixed seed so the split below is repeatable.
np.random.seed(42)
anchors = train_df['anchor'].unique()
np.random.shuffle(anchors)
anchors[:5]

In [None]:
# Take the leading quarter of the shuffled anchors as the validation anchors.
val_prop = 0.25
val_sz = int(val_prop * len(anchors))
val_anchors = anchors[:val_sz]

In [None]:
# Boolean mask over train_df rows: True where the row's anchor is a validation anchor.
is_val = np.isin(train_df['anchor'], val_anchors)
idxs = np.arange(train_df.shape[0])
trn_idxs, val_idxs = idxs[~is_val], idxs[is_val]
(len(val_idxs), len(trn_idxs))

In [None]:
# Rows selected by the validation mask.
val_df = train_df.loc[is_val]
val_df.shape[0]

In [None]:
# Remaining rows (mask inverted) form the training split.
training_df = train_df.loc[~is_val]
training_df.shape[0]

In [None]:
# Mean score of the training and validation splits, in that order.
tuple(frame['score'].mean() for frame in (training_df, val_df))

In [None]:
def corr(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: a pair ``(predictions, labels)``. ``predictions`` is
            reshaped to a 1-D array of length ``len(predictions)``.

    Returns:
        dict with the single key ``'pearson'`` holding the off-diagonal entry
        of ``np.corrcoef(predictions, labels)``.
    """
    preds, targets = eval_pred
    flat_preds = preds.reshape(len(preds))
    pearson = np.corrcoef(flat_preds, targets)[0][1]
    return {'pearson': pearson}

In [None]:
# Tokenize the first training input as a sanity check.
encoded = tokenizer(training_df['input'].iloc[0])

In [None]:
# Show the token strings behind the encoded ids.
tokenizer.convert_ids_to_tokens(ids=encoded['input_ids'])

In [None]:
class TrainDataset(Dataset):
    """Dataset yielding one tokenized input plus its float32 label per row.

    Relies on the notebook-level ``tokenizer`` and ``CFG.max_len``.

    Args:
        df: frame providing the 'input', 'target' and 'score' columns.
    """

    def __init__(self, df):
        self.inputs = df['input'].values.astype(str)
        # Only needed by the two-segment variant, tokenizer(inputs, targets);
        # __getitem__ does not read it.
        self.targets = df['target'].values.astype(str)
        self.label = df['score'].values

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Return the tokenizer output for row ``item`` with a 'label' entry added."""
        encoding = tokenizer(
            self.inputs[item],
            max_length=CFG.max_len,
            padding="max_length",
            # Enforce the max_length cap: padding alone only extends short
            # inputs and leaves longer ones at their full length.
            truncation=True,
        )
        # Alternatives used in earlier experiments:
        #   tokenizer(inputs)           # bert-patents
        #   tokenizer(inputs, targets)  # two-segment encoding
        return {**encoding, 'label': np.float32(self.label[item])}

In [None]:
# Wrap the training and validation frames, in that order.
train_dataset, val_dataset = (
    TrainDataset(split) for split in (training_df, val_df)
)

In [None]:
# Training setup: cosine schedule with 10% warmup, evaluation after every epoch,
# no checkpoint saving and no experiment reporting.
args = TrainingArguments(
    'outputs',
    save_strategy='no',
    learning_rate=CFG.learning_rate,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    # Request fp16 mixed precision only when a CUDA device is present, so the
    # arguments also validate on a CPU-only machine (where training runs in fp32).
    fp16=torch.cuda.is_available(),
    evaluation_strategy="epoch",
    per_device_train_batch_size=CFG.batch_size,
    per_device_eval_batch_size=CFG.batch_size*2,
    num_train_epochs=CFG.epochs,
    weight_decay=CFG.weight_decay,
    report_to='none'
)
# num_labels=1 requests a single-output head; the scores are used as regression targets.
model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
model_trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=corr,
)

In [None]:
# Run the fine-tuning loop; the value returned by train() is displayed as the cell output.
model_trainer.train()

In [None]:
# Remove the Trainer's output directory. The call is guarded because checkpoint
# saving is disabled (save_strategy='no'), so the directory may never have been
# created, and an unguarded rmtree would then raise FileNotFoundError.
if os.path.isdir('outputs'):
    shutil.rmtree('outputs')