In [None]:
%%capture
import os
# Walk the Kaggle input mount and print the full path of every file found under it
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# Import the relevant packages
import gc
import logging
import shutil
import statistics
from re import search

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
# from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset,TensorDataset, DataLoader
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import BertTokenizer, BertModel, AutoTokenizer,AutoModel

# Set the WANDB_DISABLED environment variable for this process before any training starts.
# Presumably this keeps the Hugging Face Trainer from logging runs to Weights & Biases
# (to save disk space and RAM) — TODO confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
class CFG:
    """Central place for the paths and hyperparameters used throughout the notebook."""

    # --- Data and checkpoint locations ---
    # Directory holding the competition CSV files (train.csv is read from here)
    input_path = '../input/us-patent-phrase-to-phrase-matching/'
    # Directory of the pretrained checkpoint used for both the tokenizer and the model
    model_path = '../input/deberta-v3-large/deberta-v3-large/'

    # --- Optimisation settings forwarded to TrainingArguments ---
    learning_rate = 2e-5  # optimizer step size
    weight_decay = 0.01  # weight-decay coefficient
    epochs = 5  # full passes over each fold's training split
    batch_size = 16  # per-device batch size, for training and evaluation alike

    # --- Cross-validation ---
    num_fold = 5  # number of stratified folds; one model is trained per fold
    seed = 42  # seed value kept for reproducibility

In [None]:
# Import the relevant data sets
# Read the labelled phrase pairs and the CPC code -> title lookup table
train = pd.read_csv(CFG.input_path + "train.csv")
titles = pd.read_csv('../input/cpc-codes/titles.csv')

# Attach the CPC title columns to every training row whose context matches a CPC code
train = pd.merge(train, titles, left_on='context', right_on='code')

# https://www.kaggle.com/code/abhishek/phrase-matching-folds
def create_folds(data, num_splits, seed=42):
    """Return a copy of ``data`` with a stratified cross-validation ``fold`` column.

    The continuous ``score`` column is cut into 5 equal-width bins and those bins
    (not the raw scores) are used as the stratification target, so every fold
    receives roughly the same score distribution.

    Args:
        data: DataFrame containing a numeric ``score`` column. It is not modified.
        num_splits: Number of folds to create.
        seed: Random state used when shuffling rows into folds (default 42, the
            value that was previously hard-coded).

    Returns:
        A new DataFrame with the same rows, index and columns as ``data`` plus an
        integer ``fold`` column holding values in ``0 .. num_splits - 1``.
    """
    # Work on a copy so the caller's frame is not left with extra helper columns
    folded = data.copy()

    # Equal-width bin id per row; kept as a local array instead of a temporary column
    score_bins = pd.cut(folded["score"], bins=5, labels=False).to_numpy()

    splitter = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # The splitter yields row *positions*, so fill a positional array rather than
    # writing through label-based .loc (which only works on a default RangeIndex)
    fold_ids = np.full(len(folded), -1, dtype=np.int64)
    for fold_id, (_, valid_pos) in enumerate(splitter.split(X=folded, y=score_bins)):
        fold_ids[valid_pos] = fold_id

    folded["fold"] = fold_ids

    # return dataframe with folds
    return folded

In [None]:
# Distinct values of the context, anchor and target columns, each as a numpy array
context = train['context'].unique()
anchor = train['anchor'].unique()
target = train['target'].unique()

# Number of distinct context codes
print(context.shape)

# Container type and number of distinct anchors
print(type(anchor))
print("Anchor unique values")
print(anchor.shape)

# Number of distinct targets
print(target.shape)

In [None]:
# Extremes of the distinct anchor phrases, measured by character count
big_anchor = max(anchor, key=len)
short_anchor = min(anchor, key=len)

# Print each caption together with its value, one pair per call
for caption, shown in (
    ("\nLongest anchor: ", big_anchor),
    ("\nLength of longest anchor:", len(big_anchor)),
    ("\nShortest anchor: ", short_anchor),
    ("\nLength of shortest anchor:", len(short_anchor)),
):
    print(caption, shown)

In [None]:
# Extremes of the distinct target phrases, measured by character count
big_target = max(target, key=len)
short_target = min(target, key=len)

# Print each caption together with its value, one pair per call
for caption, shown in (
    ("\nLongest target: ", big_target),
    ("\nLength of longest target:", len(big_target)),
    ("\nShortest target: ", short_target),
    ("\nLength of shortest target:", len(short_target)),
):
    print(caption, shown)

In [None]:
# Report how many distinct anchor and target phrases the training data contains
for heading, uniques in (("\nAnchor unique values", anchor), ("\nTarget unique values", target)):
    print(heading)
    print(uniques.shape)

In [None]:
# Model input text: the title followed by the anchor phrase, separated by one space
train = train.assign(input=train['title'] + ' ' + train['anchor'])

# Tag every row with its stratified cross-validation fold (stratified on binned score)
train = create_folds(train, CFG.num_fold)

In [None]:
%%capture
# Load the tokenizer from the pretrained checkpoint directory configured in CFG.model_path.
# This module-level `tokenizer` is later used by TrainDataset.__getitem__ and passed to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)
# NOTE: this cell starts with %%capture, which captures stdout, so the message below is not displayed.
print("\n Tokenizer Activated!")

In [None]:
class TrainDataset(Dataset):
    """Map-style dataset that tokenizes one (input, target) text pair per item.

    Expects a DataFrame with ``input``, ``target`` and ``score`` columns and relies
    on the module-level ``tokenizer`` to encode each pair on access.
    """

    def __init__(self, df):
        # Both text columns are stored as numpy arrays of str
        self.inputs, self.targets = (
            df[column].values.astype(str) for column in ('input', 'target')
        )
        # Scores are kept in their original dtype and cast when an item is fetched
        self.label = df['score'].values

    def __len__(self):
        # One item per stored input text
        return len(self.inputs)

    def __getitem__(self, item):
        # Look up the raw texts and the score for the requested position
        first_text = self.inputs[item]
        second_text = self.targets[item]
        score = self.label[item]

        # Encode the two texts as a pair and copy the tokenizer's fields into a plain dict
        sample = {**tokenizer(first_text, second_text)}
        # Attach the regression label as float32
        sample['label'] = score.astype(np.float32)
        return sample


In [None]:
def compute_metrics(eval_pred):
    """Compute the Pearson correlation between model outputs and gold labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of numpy arrays. The predictions
            are flattened to one value per row before the correlation is taken.

    Returns:
        A dict with the single key ``'pearson'``.
    """
    raw_preds, gold = eval_pred
    # One prediction per example: collapse e.g. an (N, 1) array to shape (N,)
    flat_preds = raw_preds.reshape(len(raw_preds))
    # The off-diagonal entry of the 2x2 correlation matrix is the coefficient itself
    corr_matrix = np.corrcoef(flat_preds, gold)
    return {'pearson': corr_matrix[0][1]}

In [None]:
# Scratch directory for Trainer checkpoints; it is wiped after every fold
CHECKPOINT_DIR = "/tmp/uspppm"

# Per-fold validation frames (with predictions attached); concatenated once after the loop
# instead of growing a DataFrame with pd.concat on every iteration
oof_frames = []

# iterate over each fold
for fold in range(CFG.num_fold):

    # Release the previous fold's model and trainer (and whatever state they hold) before the
    # next model is allocated; otherwise both folds' models are alive at the same time while the
    # new Trainer is being constructed. The last fold's objects stay bound after the loop.
    model = trainer = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # split the data into training and validation sets for the current fold
    tr_data = train[train['fold'] != fold].reset_index(drop=True)
    va_data = train[train['fold'] == fold].reset_index(drop=True)

    # create training and validation datasets using the TrainDataset class defined earlier
    tr_dataset = TrainDataset(tr_data)
    va_dataset = TrainDataset(va_data)

    # set the training arguments for the Trainer class
    args = TrainingArguments(
        output_dir=CHECKPOINT_DIR,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=CFG.learning_rate,
        per_device_train_batch_size=CFG.batch_size,
        per_device_eval_batch_size=CFG.batch_size,
        num_train_epochs=CFG.epochs,
        weight_decay=CFG.weight_decay,
        metric_for_best_model="pearson",
        load_best_model_at_end=True,
        seed=CFG.seed,  # make the configured seed explicit instead of relying on the default
    )

    # initialize the model with a single output (regression on the score) and create a Trainer
    model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
    trainer = Trainer(
        model,
        args,
        train_dataset=tr_dataset,
        eval_dataset=va_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # train the model
    trainer.train()

    # remove the checkpoints written during training; tolerate the directory being absent
    shutil.rmtree(CHECKPOINT_DIR, ignore_errors=True)

    # save the model weights for the current fold
    trainer.save_model(f"uspppm_{fold}")

    # generate predictions on the validation set using the trained model
    outputs = trainer.predict(va_dataset)
    predictions = outputs.predictions.reshape(-1)

    # attach the predictions to this fold's validation rows and keep them for later
    va_data['preds'] = predictions
    oof_frames.append(va_data)

# Out-of-fold predictions for every training row (each fold keeps its own 0..n-1 index)
sample_sub = pd.concat(oof_frames) if oof_frames else pd.DataFrame()

In [None]:
# Extract predictions and labels from the sample submission dataframe
# Pull the out-of-fold predictions and their true scores out of the combined frame
predictions, label = sample_sub['preds'].values, sample_sub['score'].values

# compute_metrics takes a single (predictions, labels) pair
eval_pred = (predictions, label)

# Pearson correlation over all folds combined; shown as the cell's output
compute_metrics(eval_pred)

In [None]:
# Write the combined frame of validation rows and their 'preds' column to sample_sub.csv
# using to_csv's default options.
# NOTE(review): sample_sub is built from the per-fold validation splits of `train`, so this file
# holds out-of-fold predictions rather than test-set predictions — confirm before using it as a
# competition submission.
sample_sub.to_csv('sample_sub.csv')