In [None]:
# `os` must be imported in this cell: it runs before the notebook's main
# import cell, so on a fresh kernel the name would otherwise be undefined.
import os

# Truthy (non-empty string) when KAGGLE_KERNEL_RUN_TYPE is set in the
# environment -- presumably only inside a Kaggle kernel; '' otherwise.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

In [None]:
if iskaggle:
    !pip install -q datasets

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import random
import os
import torch
from sklearn.model_selection import KFold, StratifiedKFold
import shutil
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup, TrainingArguments, Trainer, AutoModelForSequenceClassification
from datasets import load_metric
import datasets
from transformers import BertModel
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm
import warnings, transformers, logging, torch

In [None]:
# Set WANDB_DISABLED for this process -- presumably so the Hugging Face
# Trainer skips Weights & Biases reporting (confirm against the installed
# transformers version).
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Keep cell output readable: drop every logging record at WARNING level or
# below, and ignore all Python warnings.
logging.disable(level=logging.WARNING)
warnings.simplefilter(action='ignore')

In [None]:
class CFG:
    """Run configuration: data location, pretrained checkpoint and training hyperparameters."""

    # Directory containing train.csv / test.csv (and titles.csv outside Kaggle).
    # Outside Kaggle the location can be overridden through the
    # USPPPM_DATA_DIR environment variable; the original developer path is
    # kept as the default so existing local runs behave as before.
    input_path = (
        '../input/us-patent-phrase-to-phrase-matching'
        if iskaggle
        else os.environ.get(
            'USPPPM_DATA_DIR',
            '/home/bhavik/projects/kaggle-patent-phrase-matching/data',
        )
    )
    # Checkpoint identifier passed to both the tokenizer and the model loader.
    model_path = 'anferico/bert-for-patents'

    learning_rate = 2e-5
    weight_decay = 0.01
    num_fold = 5     # number of cross-validation folds
    epochs = 5       # training epochs per fold
    batch_size = 16  # per-device batch size, used for both training and evaluation

In [None]:
# Load the title lookup table (a separate dataset on Kaggle, a sibling file
# locally) and the training pairs, then attach the title row whose `code`
# equals each training row's `context`.
titles = pd.read_csv(
    "../input/us-patents-category-titles/titles.csv"
    if iskaggle
    else f"{CFG.input_path}/titles.csv"
)
train_df = pd.read_csv(f"{CFG.input_path}/train.csv").merge(
    titles, left_on='context', right_on='code'
)

# https://www.kaggle.com/code/abhishek/phrase-matching-folds
def create_folds(data, num_splits, num_bins=5, seed=42):
    """Assign a stratified cross-validation fold id to every row.

    The continuous ``score`` column is cut into ``num_bins`` equal-width bins
    and those bin ids (not the raw scores) are used as the stratification
    target, so each fold sees a similar score distribution.

    Args:
        data: DataFrame with a numeric ``score`` column.
        num_splits: number of folds.
        num_bins: number of equal-width score bins used for stratification.
        seed: random state for the shuffled split (default keeps the
            previously hard-coded value).

    Returns:
        A copy of ``data`` with an integer ``fold`` column holding values in
        ``0 .. num_splits - 1``. The input frame is not modified (earlier
        versions left ``fold`` and ``bins`` columns behind on the caller's
        frame).
    """
    # labels=False makes pd.cut return integer bin indices instead of intervals.
    bins = pd.cut(data["score"], bins=num_bins, labels=False).to_numpy()

    kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # split() yields *positional* row indices, so the fold ids are written into
    # a plain array; label-based `.loc` would be wrong for a non-default index.
    folds = np.full(len(data), -1, dtype=np.int64)
    for fold_id, (_, valid_idx) in enumerate(kf.split(X=data, y=bins)):
        folds[valid_idx] = fold_id

    result = data.copy()
    result["fold"] = folds
    return result

In [None]:
# Model input text is the context title followed by the anchor phrase; once
# that column exists, every row is tagged with its cross-validation fold.
train_df = create_folds(
    train_df.assign(input=train_df['title'] + ' ' + train_df['anchor']),
    CFG.num_fold,
)

In [None]:
# Tokenizer loaded from the same checkpoint identifier as the model
# (CFG.model_path). The dataset classes below use it as a module-level global.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

In [None]:
class TrainDataset(Dataset):
    """Labelled (input, target) text pairs, tokenized one example at a time.

    Reads the ``input``, ``target`` and ``score`` columns of the given frame
    and relies on the module-level ``tokenizer`` when an item is requested.
    """

    def __init__(self, df):
        # Extract the columns once as numpy arrays; text is coerced to str.
        self.label = df['score'].values
        self.targets = df['target'].values.astype(str)
        self.inputs = df['input'].values.astype(str)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        # Encode the two texts as a pair, then add the score as a float32
        # value under the 'label' key.
        example = dict(tokenizer(self.inputs[item], self.targets[item]))
        example['label'] = self.label[item].astype(np.float32)
        return example

In [None]:
def compute_metrics(eval_pred):
    """Return the Pearson correlation between predictions and labels.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` is
            reshaped to one value per example before the correlation is
            computed.

    Returns:
        ``{'pearson': r}`` where ``r`` is the off-diagonal entry of the
        2x2 correlation matrix.
    """
    raw_predictions, labels = eval_pred
    n_examples = len(raw_predictions)
    flat_predictions = raw_predictions.reshape(n_examples)
    correlation_matrix = np.corrcoef(flat_predictions, labels)
    return {'pearson': correlation_matrix[0][1]}

In [None]:
# Load the test pairs, attach the title row matching each `context` code and
# build the "title + anchor" input text.
test_df = pd.read_csv(f"{CFG.input_path}/test.csv").merge(
    titles, left_on='context', right_on='code'
)
test_df = test_df.assign(input=test_df['title'] + ' ' + test_df['anchor'])

# tokenizer = AutoTokenizer.from_pretrained('uspppm_0')

class InferDataset(Dataset):
    """Unlabelled (input, target) text pairs for prediction.

    Reads the ``input`` and ``target`` columns of the given frame and relies
    on the module-level ``tokenizer`` when an item is requested.
    """

    def __init__(self, df):
        # Extract the text columns once as numpy string arrays.
        self.targets = df['target'].values.astype(str)
        self.inputs = df['input'].values.astype(str)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        # Encode the two texts as a pair and hand back a plain dict.
        return dict(tokenizer(self.inputs[item], self.targets[item]))

# Collects one flattened test-prediction array per fold (appended inside the
# training loop); the arrays are averaged when the submission is built.
test_predictions = []

# for fold in range(CFG.num_fold):

In [None]:
# Cross-validated fine-tuning. For every fold a fresh model is trained on the
# other folds, then used to predict (a) the test set and (b) the held-out fold.
# The held-out predictions are gathered into `oof_df`, the test predictions
# into `test_predictions` (one array per fold).

# Both accumulators are (re)initialised here so that re-running this cell
# never mixes in results from a previous run.
test_predictions = []
oof_parts = []

# Loop-invariant pieces: checkpoint directory and the test dataset.
train_output_dir = "/kaggle/working/tmp/uspppm" if iskaggle else "/tmp/uspppm"
test_dataset = InferDataset(test_df)

for fold in range(CFG.num_fold):

    tr_data = train_df[train_df['fold'] != fold].reset_index(drop=True)
    va_data = train_df[train_df['fold'] == fold].reset_index(drop=True)
    tr_dataset = TrainDataset(tr_data)
    va_dataset = TrainDataset(va_data)

    args = TrainingArguments(
        output_dir=train_output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=CFG.learning_rate,
        per_device_train_batch_size=CFG.batch_size,
        per_device_eval_batch_size=CFG.batch_size,
        num_train_epochs=CFG.epochs,
        weight_decay=CFG.weight_decay,
        metric_for_best_model="pearson",
        load_best_model_at_end=True,
    )

    # num_labels=1 -> single-output head, i.e. the score is predicted as one value.
    model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
    trainer = Trainer(
        model,
        args,
        train_dataset=tr_dataset,
        eval_dataset=va_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    # Remove the per-epoch checkpoints written during training; prediction
    # below uses the model held in memory by `trainer`.
    shutil.rmtree(train_output_dir)

    # Test-set predictions for this fold. The original code read
    # `outputs.predictions` here, which is undefined on the first fold and
    # holds the previous fold's validation predictions afterwards.
    test_outputs = trainer.predict(test_dataset)
    test_prediction = test_outputs.predictions.reshape(-1)
    test_predictions.append(test_prediction)

    # Out-of-fold predictions for the held-out rows.
    outputs = trainer.predict(va_dataset)
    val_predictions = outputs.predictions.reshape(-1)
    va_data['preds'] = val_predictions
    oof_parts.append(va_data)

# Single concat instead of growing a DataFrame inside the loop.
oof_df = pd.concat(oof_parts)

In [None]:
# Overall cross-validation score: Pearson correlation between the
# out-of-fold predictions and the true scores across all folds.
predictions = oof_df['preds'].to_numpy()
label = oof_df['score'].to_numpy()
eval_pred = (predictions, label)
compute_metrics(eval_pred)

In [None]:
# Persist the out-of-fold predictions to oof_df.csv (row index not written).
oof_df.to_csv('oof_df.csv', index=False)

### Inference

In [None]:
# Ensemble the folds by averaging their test predictions element-wise.
# The mean gets its own name instead of rebinding `test_predictions`, so
# re-running this cell yields the same result (rebinding would average an
# already-averaged 1-D array down to a single scalar on the second run).
test_score_mean = np.mean(test_predictions, axis=0)

# Guard against an empty or mis-shaped prediction list before writing a file.
if np.shape(test_score_mean) != (len(test_df),):
    raise ValueError(
        f"expected {len(test_df)} averaged test predictions, "
        f"got shape {np.shape(test_score_mean)}"
    )

submission = datasets.Dataset.from_dict({
    'id': test_df['id'],
    'score': test_score_mean,
})

submission.to_csv('submission.csv', index=False)