In [2]:
import os
# Non-empty string when the KAGGLE_KERNEL_RUN_TYPE environment variable is set, '' otherwise.
iskaggle = os.getenv('KAGGLE_KERNEL_RUN_TYPE', '')

In [None]:
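# Disabled cell: offline install of `datasets` from the frozen_packages directory (no package index) when running on Kaggle.
#if iskaggle:
#    pip install datasets --find-links /kaggle/input/us-patent-phrase-to-phrase-matching/frozen_packages --no-index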

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import random
import os
import torch
from sklearn.model_selection import KFold, StratifiedKFold
import shutil
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup, TrainingArguments, Trainer, AutoModelForSequenceClassification
from datasets import load_metric
import datasets
from transformers import BertModel
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm
import warnings, transformers, logging, torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Quiet the run: suppress log records at WARNING level and below, ignore Python
# warnings, and set the WANDB_DISABLED environment flag.
logging.disable(level=logging.WARNING)
warnings.simplefilter(action='ignore')
os.environ.update(WANDB_DISABLED="true")

In [13]:
class CFG:
    """Static settings for this notebook: data/model locations and sizing constants.

    The two paths are chosen once, at class-creation time, from the truthiness of
    the module-level ``iskaggle`` flag.
    """

    # Directory holding the CSV inputs (titles.csv, test.csv).
    input_path = (
        '../input/data'
        if iskaggle
        else '/home/bhavik/projects/kaggle-patent-phrase-matching/data'
    )
    # Directory holding the per-fold checkpoints (uspppm_0, uspppm_1, ...).
    model_path = (
        '../input/patentphrasematching2'
        if iskaggle
        else '/home/bhavik/projects/kaggle-patent-phrase-matching'
    )

    # Passed to the tokenizer as max_length when padding each input.
    max_len = 70
    # Number of fold checkpoints whose predictions are averaged.
    num_fold = 5

In [5]:
# Lookup table read from the configured input directory; its `code` and `title`
# columns are used when building the test inputs.
titles = pd.read_csv(filepath_or_buffer=f"{CFG.input_path}/titles.csv")

In [6]:
def prep_input_tokens(df):
    """Build the model input text for every row of ``df``.

    The pieces are joined with no extra whitespace, in this order:
    ``anchor``, the literal ``[SEP]``, ``target``, the literal ``[cpc]``, ``title``.

    Args:
        df: Object exposing ``anchor``, ``target`` and ``title`` attributes
            (e.g. DataFrame columns).

    Returns:
        The element-wise concatenation of the pieces.
    """
    anchor_and_target = df.anchor + '[SEP]' + df.target
    return anchor_and_target + '[cpc]' + df.title

In [7]:
# Load the test pairs and attach the title that matches each row's context code.
test_df = pd.read_csv(f"{CFG.input_path}/test.csv")
# Left join: every test row is kept even when its context code is missing from
# `titles`. The default inner join would silently drop such rows, and the
# submission (built from test_df['id']) would then be missing ids.
test_df = test_df.merge(titles, how='left', left_on='context', right_on='code')
# Rows without a matching title get an empty title instead of NaN, so the
# concatenated input stays a real string.
test_df['title'] = test_df['title'].fillna('')
test_df['input'] = prep_input_tokens(test_df)

In [9]:
# Tokenizer loaded from the fold-0 checkpoint directory. The marker strings below
# are registered as additional special tokens; '[cpc]' is the one used when the
# input text is assembled.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    f"{CFG.model_path}/uspppm_0",
    additional_special_tokens=[
        '[abstract]',
        '[claim]',
        '[summary]',
        '[invention]',
        '[cpc]',
    ],
)

class InferDataset(Dataset):
    """Map-style dataset that tokenizes one pre-built input string per item.

    Relies on the module-level ``tokenizer`` and on ``CFG.max_len``.
    """

    def __init__(self, df):
        """Cache the text columns of ``df`` as string arrays.

        Args:
            df: DataFrame with an ``input`` column (the text that gets tokenized)
                and a ``target`` column.
        """
        self.inputs = df['input'].values.astype(str)
        # Kept as a public attribute for compatibility; __getitem__ does not read it.
        self.targets = df['target'].values.astype(str)

    def __len__(self):
        """Return the number of input strings."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Return the tokenizer output for row ``item`` as a plain dict.

        The text is padded to ``CFG.max_len`` tokens.
        """
        # NOTE(review): truncation is not requested, so an input that tokenizes to
        # more than CFG.max_len tokens is presumably returned at full length —
        # confirm this matches how the checkpoints were trained.
        return {
            **tokenizer(self.inputs[item], max_length=CFG.max_len, padding="max_length"),
        }

In [10]:
# Sanity check: tokenize the first test row's input text and show the resulting tokens.
encoded = tokenizer(test_df['input'].iloc[0])
tokenizer.convert_ids_to_tokens(encoded['input_ids'])

['[CLS]',
 'op',
 '##c',
 'drum',
 '[SEP]',
 'inorganic',
 'photo',
 '##con',
 '##du',
 '##ctor',
 'drum',
 '[cpc]',
 'optics',
 '[SEP]']

In [11]:
# Display the raw tokenizer encoding of the first test row.
encoded

{'input_ids': [2, 6393, 1943, 6608, 3, 27921, 5967, 8328, 8231, 16426, 6608, 9, 20691, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [16]:
# The inference dataset depends only on test_df, not on the fold, so build it
# once instead of once per fold.
test_dataset = InferDataset(test_df)

# One flat prediction array per fold checkpoint, in fold order.
test_predictions = []
for fold in range(CFG.num_fold):
    # Load this fold's checkpoint with a single output label.
    model = AutoModelForSequenceClassification.from_pretrained(
        f'{CFG.model_path}/uspppm_{fold}',
        num_labels=1,
    )
    trainer = Trainer(model, tokenizer=tokenizer)

    test_outputs = trainer.predict(test_dataset)
    # Flatten the model output to a 1-D array of scores.
    test_prediction = test_outputs.predictions.reshape(-1)
    test_predictions.append(test_prediction)

100%|██████████| 5/5 [00:03<00:00,  1.34it/s]
100%|██████████| 5/5 [00:03<00:00,  1.41it/s]
100%|██████████| 5/5 [00:03<00:00,  1.37it/s]
100%|██████████| 5/5 [00:02<00:00,  2.32it/s]
100%|██████████| 5/5 [00:16<00:00, 29.28it/s]

In [18]:
# Ensemble: average the per-fold predictions element-wise.
test_predictions_aggregated = list(np.mean(test_predictions, axis=0))
submission_df = pd.DataFrame({'id': test_df['id'], 'score': test_predictions_aggregated})
# Floor negative scores at 0. A single .loc assignment is used because chained
# indexing (df['score'][mask] = ...) may write to a temporary copy and does
# nothing under pandas copy-on-write.
submission_df.loc[submission_df['score'] < 0, 'score'] = 0.0
# NOTE(review): the file is written as 'submission2.csv' — confirm this is the
# filename the competition expects.
submission_df.to_csv('submission2.csv', index=False)