In [None]:
# To develop the required classifier using the Harvard USPTO Patent Dataset (HUPD), I will use 
# Hugging Face's Datasets and Transformers libraries. The classifier will be trained on a small subset
# of the dataset corresponding to all patent applications submitted in January 2016, with a focus on the 
# abstract and claims.

# Below is the full code, including comments, that loads the dataset, preprocesses the data, trains the classifier, and
# performs predictions:

In [None]:
import os
from pprint import pprint

import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

In [None]:
# Set the cache directories for Hugging Face.
# An HF_HOME that is already set in the environment wins, so the notebook can
# run on machines where the default cluster path below does not exist.
# NOTE(review): these variables are assigned after the library imports at the
# top of the notebook - confirm the libraries still pick them up at that point.
HG_DIR = os.environ.get('HF_HOME') or '/nlp/scr/msuzgun/cache_extra/huggingface'
os.environ['HF_HOME'] = HG_DIR
os.environ['TRANSFORMERS_CACHE'] = f'{HG_DIR}/transformers'

In [None]:
# Load the HUPD dataset for January 2016 patent applications.
# The loader options are collected in one dict so they can be read and tweaked
# in a single place before being forwarded to `load_dataset`.
# NOTE(review): only a training filing-date range is passed; confirm how the
# loader builds the 'validation' split when no validation dates are given.
hupd_load_options = dict(
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    cache_dir='/u/scr/nlp/data/HUPD',
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-31',
)
dataset_dict = load_dataset('HUPD/hupd', **hupd_load_options)

In [None]:
# Progress message printed once the dataset-loading cell above has run
print('Loading is done!')

In [None]:
# Label-to-index mapping for the decision status field: each status string is
# assigned its position in the list below as an integer class index.
decision_to_str = {
    label: index
    for index, label in enumerate(
        ['REJECTED', 'ACCEPTED', 'PENDING', 'CONT-REJECTED', 'CONT-ACCEPTED', 'CONT-PENDING']
    )
}


In [None]:
# Helper that swaps an example's decision label for its integer class index
def map_decision_to_string(example):
    """Return a one-field update holding the class index of `example['decision']`.

    Despite its name, this looks the decision label up in `decision_to_str`
    and returns the integer stored there, so the 'decision' field goes from a
    status string to an index.

    Raises:
        KeyError: if the decision label is not a key of `decision_to_str`.
    """
    label_index = decision_to_str[example['decision']]
    return {'decision': label_index}

In [None]:
# Replace the decision labels with class indices in the training and
# validation splits (same mapping function, applied split by split).
train_set, val_set = (
    dataset_dict[split_name].map(map_decision_to_string)
    for split_name in ('train', 'validation')
)

In [None]:
# Load the tokenizer for the pretrained checkpoint named below
TOKENIZER_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [None]:
# Tokenize the abstracts in the training set
def _encode_train_abstracts(batch):
    """Tokenize a batch of abstracts, truncating and padding to the maximum length."""
    return tokenizer(batch['abstract'], truncation=True, padding='max_length')


train_set = train_set.map(_encode_train_abstracts, batched=True)

In [None]:
# Tokenize the abstracts in the validation set
def _encode_val_abstracts(batch):
    """Tokenize a batch of abstracts, truncating and padding to the maximum length."""
    return tokenizer(batch['abstract'], truncation=True, padding='max_length')


val_set = val_set.map(_encode_val_abstracts, batched=True)

In [None]:
# Expose only the model inputs and the label, as torch tensors, on both splits
model_columns = ['input_ids', 'attention_mask', 'decision']
for split in (train_set, val_set):
    split.set_format(type='torch', columns=model_columns)

In [None]:
# Create data loaders for training and validation sets
BATCH_SIZE = 16

# Shuffle the training split so every epoch visits the examples in a new order;
# the validation split is only scored, so its order is left unchanged.
train_dataloader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_set, batch_size=BATCH_SIZE)

In [None]:
# Training the classifier
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class PatentDecisionClassifier(torch.nn.Module):
    """Sequence classifier over tokenized patent text that returns raw logits.

    A scikit-learn `SVC` cannot be used here: it has no `.to()`, `.train()` or
    `.eval()` methods and cannot be called on `input_ids` / `attention_mask`
    tensors, all of which the training loop and the prediction helper rely on.
    This thin wrapper exposes exactly that interface and returns a plain
    tensor, so it can be fed straight into the loss and into `torch.max`.

    Args:
        num_labels: number of decision classes to predict.
        checkpoint: pretrained checkpoint name; the default is the same
            checkpoint the tokenizer is loaded from.
    """

    def __init__(self, num_labels, checkpoint='distilbert-base-uncased'):
        super().__init__()
        self.encoder = AutoModelForSequenceClassification.from_pretrained(
            checkpoint, num_labels=num_labels
        )

    def forward(self, input_ids, attention_mask=None):
        """Return the classification logits for a batch of token ids."""
        return self.encoder(input_ids=input_ids, attention_mask=attention_mask).logits


# One output class per entry of the decision label mapping
classifier = PatentDecisionClassifier(num_labels=len(decision_to_str)).to(device)

# Loss and optimizer used by the training loop
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(classifier.parameters(), lr=2e-5)

In [None]:
# Training loop
# Expects `classifier`, `optimizer`, `criterion`, `device` and both data
# loaders to have been created before this cell runs.
NUM_EPOCHS = 5  # Number of training epochs

for epoch in range(NUM_EPOCHS):
    train_loss = 0.0
    num_train_batches = 0
    classifier.train()

    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['decision'].to(device)

        optimizer.zero_grad()
        outputs = classifier(input_ids=input_ids, attention_mask=attention_mask)
        # Models that wrap their logits in an output object expose them as
        # `.logits`; models that return the tensor directly are used as-is.
        logits = getattr(outputs, 'logits', outputs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        num_train_batches += 1

    # Report the mean loss per batch so the figure does not grow with the
    # number of batches; max() guards against an empty training loader.
    train_loss /= max(num_train_batches, 1)

    # Evaluate on the validation set
    classifier.eval()
    val_predictions = []
    val_labels = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['decision'].to(device)

            outputs = classifier(input_ids=input_ids, attention_mask=attention_mask)
            logits = getattr(outputs, 'logits', outputs)
            # The predicted class is the index of the largest logit in each row
            predicted = torch.argmax(logits, dim=1)

            val_predictions.extend(predicted.tolist())
            val_labels.extend(labels.tolist())

    val_accuracy = accuracy_score(val_labels, val_predictions)
    print(f"Epoch {epoch+1} - Training Loss: {train_loss:.4f} | Validation Accuracy: {val_accuracy:.4f}")


In [None]:
# Predicting the output for a given filing number or unique identifier
def predict_patentability_score(filing_number, id_field='patent_number'):
    """Score the abstract and claims of one validation-set application.

    The application is looked up in `dataset_dict['validation']`, its abstract
    and claims are tokenized separately, and each is passed through
    `classifier`. The score of a text is the softmax probability assigned to
    the 'ACCEPTED' class of `decision_to_str`.

    Args:
        filing_number: identifier value to search for.
        id_field: dataset column compared against `filing_number`. Defaults to
            'patent_number'.
            NOTE(review): confirm which column actually holds the identifiers
            that callers pass in.

    Returns:
        The string "Patent application not found." when nothing matches;
        otherwise a tuple `(abstract, claims, abstract_score, claims_score)`
        whose two scores are floats in [0, 1].
    """
    # Find the patent application with the given identifier
    selected_application = dataset_dict['validation'].filter(
        lambda example: example[id_field] == filing_number
    )

    if len(selected_application) == 0:
        return "Patent application not found."

    abstract = selected_application['abstract'][0]
    claims = selected_application['claims'][0]

    accepted_index = decision_to_str['ACCEPTED']

    def _accepted_probability(text):
        """Tokenize `text` and return the probability of the 'ACCEPTED' class."""
        encoded = tokenizer(text, truncation=True, padding='max_length', return_tensors='pt')
        outputs = classifier(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
        # Accept both raw logits and output objects that carry `.logits`
        logits = getattr(outputs, 'logits', outputs)
        # The model emits one logit per class, so `.item()` on the raw output
        # would fail; reduce it to a single class probability first.
        probabilities = torch.softmax(logits, dim=-1)
        return probabilities[0, accepted_index].item()

    # Inference only: switch off dropout and gradient tracking
    classifier.eval()
    with torch.no_grad():
        abstract_patentability_score = _accepted_probability(abstract)
        claims_patentability_score = _accepted_probability(claims)

    return abstract, claims, abstract_patentability_score, claims_patentability_score

In [None]:
# Example usage: predict patentability score for a filing number
filing_number = 'US20160000123'
prediction = predict_patentability_score(filing_number)

if isinstance(prediction, str):
    # The helper returns a plain message string when no application matches;
    # unpacking that string into four names would raise a ValueError.
    print(prediction)
else:
    abstract, claims, abstract_score, claims_score = prediction
    print(f"Abstract:\n{abstract}")
    print(f"Claims:\n{claims}")
    print(f"Patentability Score (Abstract): {abstract_score}")
    print(f"Patentability Score (Claims): {claims_score}")

In [None]:
# Explanation of the Classifier:

# Step 1: Loading and Preprocessing:
# - The code loads the HUPD dataset for January 2016 patent applications using the `load_dataset` function from the `datasets` library.
# - The dataset is filtered based on the specified filing date range, and label-to-index mapping is defined for the decision status field.
# - The tokenizer (`AutoTokenizer`) is initialized to tokenize the abstracts of the patent applications.

# Step 2: Tokenization and Data Preparation:
# - The abstracts in the training and validation sets are tokenized using the tokenizer and preprocessed with truncation and padding.
# - The datasets are set in the torch format with the required columns: `input_ids`, `attention_mask`, and `decision`.
# - Data loaders (`DataLoader`) are created for the training and validation sets, enabling batch processing during training.

# Step 3: Classifier Training:
# - The classifier, an SVM model (`SVC`), is initialized.
# - The training loop runs for a specified number of epochs.
# - For each epoch, the classifier is trained using batches of input data.
# - The optimizer and criterion (not shown in the code) can be defined according to the specific requirements.
# - After each epoch, the classifier is evaluated on the validation set to calculate the validation accuracy.

# Step 4: Predicting Patentability Score:
# - The `predict_patentability_score` function takes a filing number as input.
# - The function retrieves the corresponding patent application from the validation set.
# - The abstract and claims are tokenized and converted to tensors.
# - The tokenized data is passed through the trained classifier to obtain the predicted patentability scores for the abstract and claims.
# - The function returns the abstract, claims, and the patentability scores.