In [1]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

## Steps

1. Prepare the dataset
2. Load a pretrained tokenizer and call it on the dataset to obtain encodings
3. Build a PyTorch Dataset from the encodings
4. Load the pretrained model
5. a. Load a Trainer and train with it
   b. or use a native PyTorch training pipeline

## Import libraries

In [4]:
# Pretty print
from pprint import pprint
# Datasets load_dataset function
from datasets import load_dataset
# Transformers AutoTokenizer and sequence-classification model class
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Instantiate the tokenizer used by the tokenization cells below.
# NOTE(review): `model_name` is later set to "AI-Growth-Lab/PatentSBERTa"; confirm that
# this 'distilbert-base-uncased' tokenizer is the one intended for that model.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
# Standard PyTorch DataLoader
from torch.utils.data import DataLoader

# Transformers pipeline helper (not used in the cells visible in this notebook section)
from transformers import pipeline

import torch
import torch.nn.functional as F


## Download Dataset (USPTO)

Use the `load_dataset` function to load all the patent applications that were filed with the USPTO in January 2016. We specify the date ranges of the training and validation sets as January 1-21, 2016 and January 22-31, 2016, respectively.

In [6]:
# Keyword arguments for the HUPD loader: the 'sample' configuration, with the
# train and validation splits selected by filing-date windows.
hupd_load_kwargs = dict(
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)
dataset_dict = load_dataset('HUPD/hupd', **hupd_load_kwargs)

print('Loading is done!')

Downloading builder script:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

Downloading and preparing dataset hupd/sample to /root/.cache/huggingface/datasets/HUPD___hupd/sample-e81e49c78dccc371/0.0.0/6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142...
Loading dataset with config: PatentsConfig(name='sample', version=0.0.0, data_dir='sample', data_files={'train': ['https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather']}, description='Patent data from January 2016, for debugging')


Downloading data:   0%|          | 0.00/6.67M [00:00<?, ?B/s]

Using metadata file: /root/.cache/huggingface/datasets/downloads/bac34b767c2799633010fa78ecd401d2eeffd62eff58abdb4db75829f8932710


Downloading data:   0%|          | 0.00/388M [00:00<?, ?B/s]

Reading metadata file: /root/.cache/huggingface/datasets/downloads/bac34b767c2799633010fa78ecd401d2eeffd62eff58abdb4db75829f8932710
Filtering train dataset by filing start date: 2016-01-01
Filtering train dataset by filing end date: 2016-01-21
Filtering val dataset by filing start date: 2016-01-22
Filtering val dataset by filing end date: 2016-01-31


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset hupd downloaded and prepared to /root/.cache/huggingface/datasets/HUPD___hupd/sample-e81e49c78dccc371/0.0.0/6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Loading is done!


In [8]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
        num_rows: 16153
    })
    validation: Dataset({
        features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
        num_rows: 9094
    })
})

In [16]:
# Pull out the training split, print its number of rows, and display its type.
# Note: despite the `_dict` suffix, the displayed type is a `datasets` Dataset.
train_dict = dataset_dict['train']
print(len(train_dict))
type(train_dict)

16153


datasets.arrow_dataset.Dataset

In [11]:
# Pull out the validation split and print its number of rows.
validation_dict = dataset_dict['validation']
print(len(validation_dict))

9094

In [34]:
train_dict[:1]

{'patent_number': ['13261748'],
 'decision': ['ACCEPTED'],
 'title': ['MINI-OPTICAL NETWORK TERMINAL (ONT)'],
 'abstract': ['The present invention relates to passive optical network (PON), and in particular, to an optical network terminal (ONT) in the PON system. In one embodiment, the optical network terminal includes a first interface coupled to a communications network, a second interface coupled to a network client and a processor including a memory coupled to the first interface and to the second interface, wherein the processor is capable of converting optical signals to electric signals, such that the network client can access the communications network.'],
 'claims': ['1. A compact optical network terminal, comprising: a first interface coupled to a communications network; a second interface coupled to a network client, wherein the second interface is a network connectivity dongle with an optical transceiver at one end; and a processor including a circuitry and a memory coupled

In [35]:
train_dict[0]['claims']

'1. A compact optical network terminal, comprising: a first interface coupled to a communications network; a second interface coupled to a network client, wherein the second interface is a network connectivity dongle with an optical transceiver at one end; and a processor including a circuitry and a memory coupled to the first interface and to the second interface, wherein the processor is capable of converting optical signals to electric signals, such that the network client can access the communications network thereby reducing the unnecessary splitting of equal upstream wavelengths to all the network clients in the network. 2. The optical network terminal of claim 1, wherein the first interface includes an optical module that receives optical signals via the optical fiber link and converts the optical signals to electrical signals. 3. The optical network terminal of claim 2, wherein the optical module is selectively configurable to support two or more of a broadband passive optical 

In [37]:
train_dict[0]['abstract']

'The present invention relates to passive optical network (PON), and in particular, to an optical network terminal (ONT) in the PON system. In one embodiment, the optical network terminal includes a first interface coupled to a communications network, a second interface coupled to a network client and a processor including a memory coupled to the first interface and to the second interface, wherein the processor is capable of converting optical signals to electric signals, such that the network client can access the communications network.'

In [17]:
# Report the (rows, columns) shape of each split.
for split_label, split_key in (('Train', 'train'), ('Validation', 'validation')):
    print(f'{split_label} dataset size: {dataset_dict[split_key].shape}')

Train dataset size: (16153, 14)
Validation dataset size: (9094, 14)


## Pre-Processing the data

Define the label-to-index mapping for the decision status field, assigning each decision status label to a class index.

In [18]:
# Label-to-index mapping for the decision status field.
# Each label's class index is its position in the tuple below.
decision_to_str = {
    label: index
    for index, label in enumerate((
        'REJECTED',
        'ACCEPTED',
        'PENDING',
        'CONT-REJECTED',
        'CONT-ACCEPTED',
        'CONT-PENDING',
    ))
}


# Helper function
def map_decision_to_string(example):
    """Return a column update replacing the decision label with its class index.

    Args:
        example: Mapping with a 'decision' key holding one of the labels
            defined in ``decision_to_str``.

    Returns:
        A one-key dict ``{'decision': <class index>}``.

    Raises:
        KeyError: If the label is not a key of ``decision_to_str``.
    """
    status_label = example['decision']
    return {'decision': decision_to_str[status_label]}

Re-label the decision status fields of the examples in the training and validation sets.

In [19]:
# Convert the decision labels of both splits to class indices.
train_split = dataset_dict['train']
train_set = train_split.map(map_decision_to_string)
val_split = dataset_dict['validation']
val_set = val_split.map(map_decision_to_string)

Map:   0%|          | 0/16153 [00:00<?, ? examples/s]

Map:   0%|          | 0/9094 [00:00<?, ? examples/s]

In [33]:
# Sanity check: display the first example of the re-labelled training set.
# NOTE(review): the saved output below already contains `input_ids`, which are only
# added by the tokenization cells further down, so this cell appears to have been
# re-run out of order. Re-execute the notebook top to bottom to refresh the output.
train_set[:1]

{'decision': tensor([1]),
 'input_ids': tensor([[  101,  1996,  2556, 11028, 14623,  2000, 13135,  9380,  2897,  1006,
          13433,  2078,  1007,  1010,  1998,  1999,  3327,  1010,  2000,  2019,
           9380,  2897,  5536,  1006,  2006,  2102,  1007,  1999,  1996, 13433,
           2078,  2291,  1012,  1999,  2028,  7861,  5092, 21341,  1010,  1996,
           9380,  2897,  5536,  2950,  1037,  2034,  8278, 11211,  2000,  1037,
           4806,  2897,  1010,  1037,  2117,  8278, 11211,  2000,  1037,  2897,
           7396,  1998,  1037, 13151,  2164,  1037,  3638, 11211,  2000,  1996,
           2034,  8278,  1998,  2000,  1996,  2117,  8278,  1010, 16726,  1996,
          13151,  2003,  5214,  1997, 16401,  9380,  7755,  2000,  3751,  7755,
           1010,  2107,  2008,  1996,  2897,  7396,  2064,  3229,  1996,  4806,
           2897,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
 

Tokenize the abstract section of the patent applications.

In [31]:
# Name of the dataset column tokenized by the next two cells: the patent abstract.
_SECTION_ = 'abstract'

In [32]:
# Training set: tokenize the chosen section in batches, truncating long texts
# and padding every sequence to a fixed maximum length.
train_set = train_set.map(
    lambda batch: tokenizer(batch[_SECTION_], padding='max_length', truncation=True),
    batched=True,
)

Map:   0%|          | 0/16153 [00:00<?, ? examples/s]

In [22]:
# Validation set: same batched tokenization as for the training set
# (truncate long texts, pad every sequence to a fixed maximum length).
val_set = val_set.map(
    lambda batch: tokenizer(batch[_SECTION_], padding='max_length', truncation=True),
    batched=True,
)

Map:   0%|          | 0/9094 [00:00<?, ? examples/s]

In [38]:
train_set[:1]

{'decision': tensor([1]),
 'input_ids': tensor([[  101,  1996,  2556, 11028, 14623,  2000, 13135,  9380,  2897,  1006,
          13433,  2078,  1007,  1010,  1998,  1999,  3327,  1010,  2000,  2019,
           9380,  2897,  5536,  1006,  2006,  2102,  1007,  1999,  1996, 13433,
           2078,  2291,  1012,  1999,  2028,  7861,  5092, 21341,  1010,  1996,
           9380,  2897,  5536,  2950,  1037,  2034,  8278, 11211,  2000,  1037,
           4806,  2897,  1010,  1037,  2117,  8278, 11211,  2000,  1037,  2897,
           7396,  1998,  1037, 13151,  2164,  1037,  3638, 11211,  2000,  1996,
           2034,  8278,  1998,  2000,  1996,  2117,  8278,  1010, 16726,  1996,
          13151,  2003,  5214,  1997, 16401,  9380,  7755,  2000,  3751,  7755,
           1010,  2107,  2008,  1996,  2897,  7396,  2064,  3229,  1996,  4806,
           2897,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
 

Tokenize the claims section of the patent applications.

In [41]:
# Focus on the claims section and tokenize the text using the tokenizer.
_SECTION1_ = 'claims'

In [42]:
# Training set: tokenize the claims text in batches, truncating long texts
# and padding every sequence to a fixed maximum length.
# NOTE(review): the `input_ids` displayed after this cell are the claims tokens,
# i.e. they replace the ones produced from the abstract earlier. Confirm intended.
train_set = train_set.map(
    lambda batch: tokenizer(batch[_SECTION1_], padding='max_length', truncation=True),
    batched=True,
)

Map:   0%|          | 0/16153 [00:00<?, ? examples/s]

In [43]:
# Validation set: same batched claims tokenization as for the training set
# (truncate long texts, pad every sequence to a fixed maximum length).
val_set = val_set.map(
    lambda batch: tokenizer(batch[_SECTION1_], padding='max_length', truncation=True),
    batched=True,
)

Map:   0%|          | 0/9094 [00:00<?, ? examples/s]

In [44]:
train_set[:1]

{'decision': tensor([1]),
 'input_ids': tensor([[  101,  1015,  1012,  1037,  9233,  9380,  2897,  5536,  1010,  9605,
           1024,  1037,  2034,  8278, 11211,  2000,  1037,  4806,  2897,  1025,
           1037,  2117,  8278, 11211,  2000,  1037,  2897,  7396,  1010, 16726,
           1996,  2117,  8278,  2003,  1037,  2897, 20831, 11947,  2571,  2007,
           2019,  9380,  9099,  3401, 16402,  2012,  2028,  2203,  1025,  1998,
           1037, 13151,  2164,  1037,  4984,  2854,  1998,  1037,  3638, 11211,
           2000,  1996,  2034,  8278,  1998,  2000,  1996,  2117,  8278,  1010,
          16726,  1996, 13151,  2003,  5214,  1997, 16401,  9380,  7755,  2000,
           3751,  7755,  1010,  2107,  2008,  1996,  2897,  7396,  2064,  3229,
           1996,  4806,  2897,  8558,  8161,  1996, 14203, 14541,  1997,  5020,
          13909, 29263,  2000,  2035,  1996,  2897,  7846,  1999,  1996,  2897,
           1012,  1016,  1012,  1996,  9380,  2897,  5536,  1997,  4366,  1015,
 

In [48]:
# Return only the model inputs and the label column, formatted as torch tensors.
torch_columns = ['input_ids', 'attention_mask', 'decision']
for split in (train_set, val_set):
    split.set_format(type='torch', columns=list(torch_columns))

## Dataloader to create the training set and validation set loaders

In [49]:
# Wrap both splits in DataLoaders that share one batch size.
# NOTE(review): the training loader is created without `shuffle=True`; confirm
# that a fixed example order is acceptable before training with it.
BATCH_SIZE = 16
train_dataloader = DataLoader(train_set, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_set, batch_size=BATCH_SIZE)

In [50]:
# Fetch the first batch from the training loader.
batch = next(iter(train_dataloader))
# Show the token ids first, then the labels.
for field in ('input_ids', 'decision'):
    pprint(batch[field])

tensor([[  101,  1015,  1012,  ...,     0,     0,     0],
        [  101,  1015,  1012,  ...,  1996,  3653,   102],
        [  101,  1015,  1012,  ..., 16726,  1996,   102],
        ...,
        [  101,  1015,  1012,  ...,  1012,  1996,   102],
        [  101,  1015,  1012,  ...,  2034, 28688,   102],
        [  101,  1015,  1012,  ...,  2000,  4366,   102]])
tensor([1, 1, 2, 1, 0, 1, 2, 0, 1, 1, 1, 2, 2, 2, 0, 1])


In [51]:
# Report the tensor shapes of the token ids and of the labels in the batch.
input_shape, output_shape = batch['input_ids'].shape, batch['decision'].shape
print(f'Input shape: {input_shape}', f'Output shape: {output_shape}', sep='\n')

Input shape: torch.Size([16, 512])
Output shape: torch.Size([16])


In [52]:
# Helper that turns a sequence of token ids back into readable text.
def convert_ids_to_string(tokenizer, input):
    """Convert token ids to their tokens and join them with single spaces.

    Args:
        tokenizer: Object providing ``convert_ids_to_tokens``.
        input: Token ids to convert; passed to the tokenizer unchanged.

    Returns:
        A single string with the tokens separated by one space.
    """
    tokens = tokenizer.convert_ids_to_tokens(input)
    return ' '.join(tokens)

Print an example from the batch.

In [53]:
# Print the example
pprint(convert_ids_to_string(tokenizer,batch['input_ids'][1]))

('[CLS] 1 . a method comprising : using a first reader to take a first reading '
 'of an inherent disorder feature of a tag ; using at least a second reader to '
 'take at least a second reading of the inherent disorder feature of the tag ; '
 'matching the first reading with at least the second reading ; determining '
 'one or more acceptance criteria , wherein at least one of the acceptance '
 'criteria is based on whether the first reading and the second reading match '
 'within a pre ##de ##ter ##mined threshold ; accepting the tag if the '
 'acceptance criteria are met ; and recording a finger ##print for the tag if '
 'the tag was accepted . 2 . the method of claim 1 , wherein determining one '
 'or more acceptance criteria further comprises : determining an acceptance '
 'criterion based on an individual reading . 3 . the method of claim 2 , '
 'wherein determining an acceptance criterion based on an individual reading '
 'comprises determining an acceptance criterion based on a

## Load Pretrained Model

In [54]:
# Hugging Face Hub id of the pretrained checkpoint selected for this section.
# NOTE(review): the tokenizer earlier in this notebook was created from
# 'distilbert-base-uncased'; confirm it is meant to be used with this checkpoint.
model_name = "AI-Growth-Lab/PatentSBERTa"

References:

1. https://colab.research.google.com/drive/1_ZsI7WFTsEO0iu_0g3BLTkIkOUqPzCET?usp=sharing#scrollTo=B5wxZNhXdUK6

2. https://huggingface.co/AI-Growth-Lab/PatentSBERTa

3. https://huggingface.co/anferico/bert-for-patents

4. https://huggingface.co/transformers/v3.2.0/custom_datasets.html