In [None]:
# !pip install torch
# !pip install google-cloud-storage

In [5]:
# import needed packages
import numpy as np
import pandas as pd
from collections import Counter
import re
import time

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from torch.optim import Adam, SGD, AdamW

import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


# Process to read and load data from the GCP bucket

In [11]:
# Download the Kaggle competition files from the GCP bucket into the current
# working directory.
from google.cloud import storage

bucket_name = 'cliffm_uspto_kaggle_data'

storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

# The files sit at the first level of the bucket, so every object is listed.
# If they are moved into a subfolder (e.g. 'csv/'), list them with
# bucket.list_blobs(prefix = 'csv/') and strip that prefix from blob.name
# before using it as the local file name.
blobs = bucket.list_blobs()

for blob in blobs:
    file_name = blob.name
    if file_name.endswith('/'):
        # "folder" placeholder objects carry no file content to download
        continue
    # Only download here: the tables are loaded explicitly in the next cell, so
    # parsing every object with pd.read_csv was wasted work and would fail on
    # any non-CSV object in the bucket.
    blob.download_to_filename(file_name) # download the file to the machine

In [6]:
# Load the train, test and titles tables from the local data folder
csv_paths = (
    '/home/jupyter/uspto_analysis/train.csv',
    '/home/jupyter/uspto_analysis/test.csv',
    '/home/jupyter/uspto_analysis/titles.csv',
)
patent_train, patent_test, patent_titles = map(pd.read_csv, csv_paths)

In [7]:
# Attach the CPC title to every training pair (left join: context -> code)
# and keep only the columns used downstream
train_columns = ['id', 'anchor', 'target', 'context', 'title', 'score']
patents_combined = pd.merge(
    patent_train, patent_titles, how = 'left', left_on = 'context', right_on = 'code'
)[train_columns]

In [8]:
# Attach the CPC title to every test pair (left join: context -> code);
# the test table has no score column
test_columns = ['id', 'anchor', 'target', 'context', 'title']
testing_combined = pd.merge(
    patent_test, patent_titles, how = 'left', left_on = 'context', right_on = 'code'
)[test_columns]

## Read in the DeBERTa Model

In [9]:
# Read in the DeBERTA model
model_name = 'microsoft/deberta-base'

In [10]:
# Utilization of the DeBERTA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Text Processing Procedures

In [11]:
# Combine the target and title datafields into the model input text
# include a separator token between the target and title texts
# The titles come from a left merge, so a context without a matching code yields NaN.
# .apply(str.lower) raises TypeError on NaN; .fillna('').str.lower() treats a missing
# title as empty text instead.

patents_combined['input'] = patents_combined['target'] + tokenizer.sep_token + patents_combined['title'].fillna('').str.lower()
testing_combined['input'] = testing_combined['target'] + tokenizer.sep_token + testing_combined['title'].fillna('').str.lower()

In [12]:
patents_combined

Unnamed: 0,id,anchor,target,context,title,score,input
0,37d61fd2272659b1,abatement,abatement of pollution,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.50,abatement of pollution[SEP]furniture; domestic...
1,7b9652b17b68b7a4,abatement,act of abating,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.75,act of abating[SEP]furniture; domestic article...
2,36d72442aefd8232,abatement,active catalyst,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.25,active catalyst[SEP]furniture; domestic articl...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.50,eliminating process[SEP]furniture; domestic ar...
4,54c1e3b9184cb5b6,abatement,forest region,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0.00,forest region[SEP]furniture; domestic articles...
...,...,...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,DECORATIVE ARTS,1.00,wooden article[SEP]decorative arts
36469,42d9e032d1cd3242,wood article,wooden box,B44,DECORATIVE ARTS,0.50,wooden box[SEP]decorative arts
36470,208654ccb9e14fa3,wood article,wooden handle,B44,DECORATIVE ARTS,0.50,wooden handle[SEP]decorative arts
36471,756ec035e694722b,wood article,wooden material,B44,DECORATIVE ARTS,0.75,wooden material[SEP]decorative arts


# Setting up the training and validation datasets

In [13]:
# create training and validation datasets
    # 80% of records for training, 20% held out for validation
    # random_state is fixed so the same split is produced on every run
training_patents, evaluation_patents = train_test_split(patents_combined, test_size = 0.2, random_state = 42)

In [14]:
# Define a class to generate the training dataset
class TrainDataset(Dataset):
    """Labelled dataset of tokenized (input, anchor) text pairs.

    Parameters
    ----------
    patents_combined : pandas.DataFrame
        Must contain the columns 'input', 'anchor' and 'score'.
    tokenizer : callable, optional
        Called as ``tokenizer(input, anchor, max_length = ..., padding = 'max_length',
        truncation = True)`` and expected to return a mapping. When omitted, the
        notebook-level ``tokenizer`` is looked up each time an item is requested.
    max_length : int, optional
        Length every encoded pair is padded / truncated to (default 100).
    """

    def __init__(self, patents_combined, tokenizer = None, max_length = 100):
        self.input = patents_combined['input'].values.astype(str)
        self.anchor = patents_combined['anchor'].values.astype(str)
        self.label = patents_combined['score'].values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of text pairs in the dataset."""
        return len(self.input)

    def __getitem__(self, item):
        """Return the tokenizer output for pair ``item`` plus its score under the key 'label'."""
        # fall back to the notebook-level tokenizer when none was injected
        encode = tokenizer if self.tokenizer is None else self.tokenizer
        model_inputs = encode(self.input[item], self.anchor[item], max_length = self.max_length,
                              padding = 'max_length', truncation = True)
        # the score is stored as a float tensor
        return {**model_inputs, 'label': torch.as_tensor(self.label[item], dtype = torch.float)}

In [15]:
# Define a class to generate an unlabelled dataset (used for prediction)
class EvalDataset(Dataset):
    """Unlabelled dataset of tokenized (input, anchor) text pairs.

    Parameters
    ----------
    patents_combined : pandas.DataFrame
        Must contain the columns 'input' and 'anchor'.
    tokenizer : callable, optional
        Called as ``tokenizer(input, anchor, max_length = ..., padding = 'max_length',
        truncation = True)`` and expected to return a mapping. When omitted, the
        notebook-level ``tokenizer`` is looked up each time an item is requested.
    max_length : int, optional
        Length every encoded pair is padded / truncated to (default 100).
    """

    def __init__(self, patents_combined, tokenizer = None, max_length = 100):
        self.input = patents_combined['input'].values.astype(str)
        self.anchor = patents_combined['anchor'].values.astype(str)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of text pairs in the dataset."""
        return len(self.input)

    def __getitem__(self, item):
        """Return the tokenizer output for pair ``item`` as a plain dict (no label)."""
        # fall back to the notebook-level tokenizer when none was injected
        encode = tokenizer if self.tokenizer is None else self.tokenizer
        model_inputs = encode(self.input[item], self.anchor[item], max_length = self.max_length,
                              padding = 'max_length', truncation = True)
        return {**model_inputs}

In [16]:
# Metric callback used to evaluate the model performance:
# Pearson correlation between the predicted and the true similarity scores
def metrics(eval_predictions):
    """Return ``{'pearson': r}`` for a ``(predictions, labels)`` pair.

    ``predictions`` is reshaped to one value per row before it is correlated
    with ``labels``.
    """
    raw_scores, targets = eval_predictions
    flat_scores = raw_scores.reshape(raw_scores.shape[0])
    correlation_matrix = np.corrcoef(flat_scores, targets)
    # off-diagonal entry = correlation between predictions and labels
    return {'pearson': correlation_matrix[0, 1]}

# Set up the DeBERTa Model
### Defining the hyperparameters and measurements

In [17]:
# define the deberta model that will be used
# num_labels = 1 requests a single output value per text pair
# NOTE(review): presumably this makes the head a regressor for the similarity score — confirm against the transformers docs
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 1)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.d

In [18]:
# Training configuration
#   - checkpoints are compared on the 'pearson' value returned by the metrics function
#   - batch size of 128 for training, twice that for evaluation
#   - evaluation and checkpoint saving both happen once per epoch
model_metric = 'pearson'
batch_size = 128
arguments = TrainingArguments(
    output_dir = 'model_test',
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    learning_rate = 2e-5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size*2,
    num_train_epochs = 4,
    weight_decay = 0.01,
    load_best_model_at_end = True,
    metric_for_best_model = model_metric,
    save_total_limit = 1,
)

In [19]:
# define the training and validation datasets
# TrainDataset is used for both splits: the validation split also needs its
# 'score' labels so the pearson metric can be computed during evaluation
train_dataset = TrainDataset(training_patents)
validation_dataset = TrainDataset(evaluation_patents)

### Pass in hyperparameters for the model to be trained on the training dataset

In [20]:
# Define the trainer: model, training arguments, both datasets,
# the tokenizer and the pearson metric callback
trainer = Trainer(
    model = model,
    args = arguments,
    train_dataset = train_dataset,
    eval_dataset = validation_dataset,
    tokenizer = tokenizer,
    compute_metrics = metrics,
)

In [1]:
# run the trainer
trainer.train()

NameError: name 'trainer' is not defined

### Set up the test dataset for prediction

In [None]:
# despite the variable name, this wraps the unlabeled test pairs (testing_combined),
# not the validation split used during training
validation_data = EvalDataset(testing_combined)

In [None]:
# Score the unlabeled pairs and display the raw predictions
prediction_output = trainer.predict(validation_data)
outputs = prediction_output.predictions.astype(float)
outputs

In [None]:
# limit the predicted scores to the [0, 1] interval
outputs = np.clip(outputs,0,1)

# Export the predictions to a csv

In [None]:
import datasets

In [None]:
# Build the submission table: one row per test pair with its predicted score.
# pandas (already imported as pd) is sufficient here, so the table no longer
# depends on the separate `datasets` package; DataFrame.to_csv accepts the same
# ('submission.csv', index = False) call used to export it.
submission = pd.DataFrame({
    'id': testing_combined['id'],
    'score': outputs.flatten()
})

In [None]:
# write the predictions to submission.csv without an index column
submission.to_csv('submission.csv', index = False)

# Patent EDA

In [None]:
# --- Description of the data fields ---
# patent_train and patent_test
    # id - unique identifier for a pair of phrases
    # anchor - first phrase
    # target - second phrase
    # context - CPC classification which indicates the context in which the similarity is to be scored
    # score - similarity between the two phrases (only present in patent_train)
    
# patent_titles
    # code - hierarchical code used to categorize the patent; corresponds to the context field in patent_train and patent_test dataframe
    # title - description of the code field
    # section - first symbol in the code field; ranges from A - H and Y
    # class - 2 digit class
    # subclass - 1 letter code subclass
    # group - 1-3 digit group code value
    # main_group - 2+ digit main or subgroup after the / symbol
    # EXAMPLE: patent_titles.loc[3,'code'] = 'A01B1/00'
        # title = 'Hand tools (edge trimmers for lawns A01G3/06  {; machines for working soil A01B35/00; making hand tools B21D})'
        # section = A
        # class = 1.0
        # subclass = B
        # group = 1.0
        # main_group = 00
        
# --- Description of the data fields ---

# NOTE(review): a notebook cell only renders its last expression, so the bare
# expressions below produce no visible output in this cell — run them in
# separate cells (or wrap them in display()) to inspect the results
patent_train.head()
patent_test.head()
patent_titles.head()
# patent_cpc.head()

patent_train.shape

# data fields
patent_train.columns
patent_titles.columns
# patent_cpc.columns

# function to view the first rows (10 by default) of selected columns of a dataframe
def view_data(dataframe, *args, n_rows = 10):
    """Collect the leading rows of the requested columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    *args
        Positional column selectors; each one is used to index both
        ``dataframe.columns`` and the column axis of ``dataframe.iloc``.
    n_rows : int, optional
        Number of leading rows to return per selector (default 10).

    Returns
    -------
    list of tuple
        One ``(col_header, col_data)`` tuple per selector, in the order given.
        The list is empty when no selector is passed.
    """
    return [(dataframe.columns[arg], dataframe.iloc[0:n_rows, arg]) for arg in args]

patent_titles.shape
# NOTE(review): range(0,7) is passed as ONE positional argument, so view_data
# receives a single selector covering columns 0-6 rather than seven separate
# column positions — confirm this is intended
view_data(patent_titles,range(0,7))
# first 10 rows of the first 6 columns; as the last expression, this is the cell's displayed output
patent_titles.iloc[0:10,0:6]

# patent_cpc.shape
# view_data(patent_cpc,0,1,2,3,4,5,6,7)