In [None]:
import os

# Directory holding the competition input files (same path as CFG.input_path).
INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
# Directory for files written by this notebook (get_logger puts its log file here by default).
OUTPUT_DIR = './'
# exist_ok=True creates the directory only when it is missing, in one call:
# there is no window between "check" and "create", and a path that exists
# but is not a directory raises instead of being silently accepted.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
class CFG:
    """Static settings for this notebook, exposed as plain class attributes."""

    # --- data loading ---
    # worker count intended for data loading
    num_workers = 4
    # batch size setting
    batch_size = 32
    # upper bound on the tokenized sequence length
    max_len = 133

    # --- file locations ---
    # directory containing the competition input data
    input_path = "../input/us-patent-phrase-to-phrase-matching/"
    # configuration file, located inside the input directory
    config_path = f"{input_path}config.pth"
    # directory containing the per-fold pre-trained models
    model_path = "../input/uspppm-debertv3large-5folds-v2/"

    # --- model head ---
    # dropout probability for the fully-connected layer
    fc_dropout = 0.2
    # number of outputs (1, i.e. a single regression value)
    target_size = 1

    # --- reproducibility / cross-validation ---
    # random seed
    seed = 42
    # number of cross-validation folds
    n_fold = 4
    # fold indices, one per fold: [0, 1, 2, 3]
    trn_fold = list(range(n_fold))

In [None]:
%%capture
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import tokenizers
import transformers,datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig,TrainingArguments, Trainer
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter

In [None]:
# Pearson correlation coefficient between the true scores and the predicted scores
def get_score(y_true, y_pred):
    """
    Computes the Pearson correlation coefficient between the true and predicted labels.

    Args:
    - y_true: array-like of shape (n_samples,) - True labels of the data.
    - y_pred: array-like of shape (n_samples,) - Predicted labels of the data.

    Returns:
    - score: float - The Pearson correlation coefficient between the true and predicted labels.
    """
    # Import the submodule explicitly: `import scipy as sp` on its own does not
    # guarantee that `sp.stats` has been loaded, in which case `sp.stats.pearsonr`
    # fails with AttributeError unless some other import pulled it in first.
    from scipy.stats import pearsonr

    # pearsonr returns (coefficient, p-value); only the coefficient is needed
    score = pearsonr(y_true, y_pred)[0]
    return score

In [None]:
def get_logger(filename=OUTPUT_DIR+'train'):
    """
    Return this module's logger, set up to write to the console and to a log file.

    The function is safe to call repeatedly (for example when the notebook cell
    is re-run): a console handler and a handler for the requested file are only
    attached when the logger does not already have them.

    Args:
    - filename: str - Path of the log file without extension; ".log" is appended.

    Returns:
    - logger: logging.Logger - The logger named after the current module.
    """
    # getLogger returns the same object for the same name on every call, so
    # handlers added by an earlier call are still attached to it.
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # FileHandler stores its target as an absolute path in `baseFilename`
    log_path = os.path.abspath(f"{filename}.log")
    # FileHandler subclasses StreamHandler, hence the exact type check here
    has_console = any(type(h) is StreamHandler for h in logger.handlers)
    has_file = any(
        isinstance(h, FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )

    if not has_console:
        # console output, message text only
        console_handler = StreamHandler()
        console_handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(console_handler)

    if not has_file:
        # file output, message text only
        file_handler = FileHandler(filename=f"{filename}.log")
        file_handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(file_handler)

    return logger

In [None]:

# Notebook-wide logger. With the default argument, get_logger logs to the
# console and to the file OUTPUT_DIR + 'train.log'.
LOGGER = get_logger()

In [None]:
def seed_everything(seed=42):
    """
    Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices), and
    configures cuDNN for deterministic behaviour.

    Args:
    - seed: int - Seed value applied to all generators.
    """
    random.seed(seed)
    # Only affects Python processes started after this point (e.g. worker
    # subprocesses); the hash seed of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU, not just the current one; silently ignored without CUDA
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run; keep it off
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

In [None]:
# oof_df = pd.read_pickle(CFG.path+'oof_df.csv')
# labels = oof_df['score'].values
# preds = oof_df['pred'].values
# score = get_score(labels, preds)
# LOGGER.info(f'CV Score: {score:<.4f}')

test_df = pd.read_csv(f"{CFG.input_path}test.csv") # Import the public test data set
titles = pd.read_csv('../input/cpc-codes/titles.csv') # Import the CPC Classification file
# Attach the CPC title to every test row by matching 'context' against 'code'.
# how='left' keeps every test row (and its order) even when a context has no
# entry in the titles file; the default inner join would silently drop such
# rows and the submission would come out short.
# validate='many_to_one' raises if a code occurs more than once in `titles`,
# which would otherwise duplicate test rows.
test_df = test_df.merge(
    titles,
    left_on='context',
    right_on='code',
    how='left',
    validate='many_to_one',
)

In [None]:
# Build the first text segment for the model: CPC title and anchor phrase joined
# by the literal string '[SEP]'. InferDataset reads this column and passes it,
# together with the 'target' column, to the tokenizer.
# NOTE(review): the exact format presumably has to match what the fold models
# were trained on — confirm before changing it.
test_df['input'] = test_df['title']+'[SEP]'+test_df['anchor'] # 'input' = title + '[SEP]' + anchor

In [None]:
# Load the tokenizer stored with the fold-0 model under CFG.model_path.
# This single tokenizer is used by InferDataset and handed to the Trainer of every fold.
tokenizer = AutoTokenizer.from_pretrained(f'{CFG.model_path}uspppm_0')
# NOTE(review): presumably a DeBERTa tokenizer, judging by the model directory name — confirm

In [None]:
class InferDataset(Dataset):
    """Inference dataset that tokenizes an (input, target) text pair on access.

    Items are tokenized with the module-level `tokenizer`, which therefore has
    to be defined before any item is fetched.
    """

    def __init__(self, df):
        """Store the 'input' and 'target' columns of `df` as string arrays."""
        def as_text(column):
            return df[column].values.astype(str)

        self.inputs = as_text('input')
        self.targets = as_text('target')

    def __len__(self):
        """Number of text pairs in the dataset."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Tokenize the pair at position `item` and return the result as a plain dict."""
        encoded = tokenizer(self.inputs[item], self.targets[item])
        # unpack the tokenizer output into an ordinary dictionary
        return {**encoded}


In [None]:
def compute_score(eval_pred):
    """
    Pearson correlation between predictions and labels.

    Args:
    - eval_pred: pair (predictions, labels); predictions must hold exactly one
      value per row so that it can be flattened to one dimension.

    Returns:
    - dict with the single key 'pearson' mapping to the correlation coefficient.
    """
    raw_preds, labels = eval_pred
    # flatten to one value per sample
    flat_preds = raw_preds.reshape(len(raw_preds))
    # 2x2 correlation matrix; the off-diagonal entry is the coefficient
    corr_matrix = np.corrcoef(flat_preds, labels)
    return {'pearson': corr_matrix[0][1]}


In [None]:
# initialize an empty list to store the predictions for each fold
# Test-set predictions of each fold's model, collected for averaging
fold_predictions = []

# The test data is identical for every fold, so the dataset is built once
te_dataset = InferDataset(test_df)

# Placeholders so the cleanup at the top of the loop also works on the first pass
model = trainer = None

# loop over each fold
for fold in range(CFG.n_fold):

    # Drop the previous fold's model before loading the next one, so that two
    # models are never held in memory at the same time.
    model = trainer = None
    gc.collect()
    torch.cuda.empty_cache()

    # load the trained model for the current fold (single output value)
    model = AutoModelForSequenceClassification.from_pretrained(f'{CFG.model_path}uspppm_{fold}', num_labels=1)

    # create a Trainer instance with the loaded model and tokenizer
    trainer = Trainer(
            model,
            tokenizer=tokenizer
        )

    # make predictions on the test data using the Trainer instance
    outputs = trainer.predict(te_dataset)

    # reshape the predictions to a 1D array
    prediction = outputs.predictions.reshape(-1)

    # keep this fold's predictions
    fold_predictions.append(prediction)

# take the mean of the predictions across all folds
predictions = np.mean(fold_predictions, axis=0)

# create a new dataset containing the submission data
submission = datasets.Dataset.from_dict({
    'id': test_df['id'],
    'score': predictions,
})


In [None]:
# Show the submission dataset as a quick check before it is written to disk
display(submission)

In [None]:
# Write the 'id' / 'score' columns to submission.csv in the current working directory
submission.to_csv('submission.csv', index=False)
