This notebook was used to perform inference on unlabelled patent claims and was run on Google Colab. The best-performing NLP model (a fine-tuned XLNet sequence classifier) was used.

In [None]:
# Attach Google Drive to the Colab runtime so the data and model files under
# /content/drive can be read and written by the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:

import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import TFXLNetModel, XLNetTokenizer, XLNetForSequenceClassification, AdamW
from google.colab import userdata
userdata.get('HF_TOKEN')
from keras.preprocessing.sequence import pad_sequences
import pickle

In [None]:
# Identify and specify the compute device.
# `device` is read as a global by the inference code further down.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
n_gpu = torch.cuda.device_count()

# Only query the GPU name when CUDA is present: get_device_name(0) raises on a
# CPU-only runtime, which would defeat the CPU fallback chosen above.
device_name = torch.cuda.get_device_name(0) if cuda_available else "cpu"
device_name

Combine unlabelled data into a single file

In [None]:
# Read the three unlabelled spreadsheets from Drive.
df_1 = pd.read_excel(r'/content/drive/MyDrive/data/unlabeled_data/all_unlabelled_data_1.xlsx')
df_2 = pd.read_excel(r'/content/drive/MyDrive/data/unlabeled_data/all_unlabelled_data_2.xlsx')
df_3 = pd.read_excel(r'/content/drive/MyDrive/data/unlabeled_data/all_unlabelled_data_3.xlsx')

# Stack them into one frame; ignore_index=True gives the result a fresh 0..n-1 index.
combined_df = pd.concat([df_1, df_2, df_3], ignore_index=True)

# Persist the combined frame at the exact Drive path that the loading cell reads.
# Writing to the session's working directory (as before) left the file where the
# loader never looks, and it was lost when the Colab session ended.
combined_claims_path = r'/content/drive/My Drive/data/unlabeled_data/all_unlabeled_claims.pkl'
with open(combined_claims_path, 'wb') as file:
    pickle.dump(combined_df, file)

Load the claims data and prepare it for inference (tokenization and conversion to a `TensorDataset`).

In [None]:
import pickle

# Read back the combined, unlabelled claims that were pickled to Drive.
claims_path = r'/content/drive/My Drive/data/unlabeled_data/all_unlabeled_claims.pkl'
with open(claims_path, 'rb') as claims_file:
    data = pickle.load(claims_file)

In [None]:
# Function to prepare df input data to be inference-ready

# Tokenizers already loaded in this session, keyed by pretrained model name.
_TOKENIZER_CACHE = {}


def _get_xlnet_tokenizer(name='xlnet-base-cased'):
  """Return the XLNet tokenizer for `name`, loading it only on first use."""
  if name not in _TOKENIZER_CACHE:
    _TOKENIZER_CACHE[name] = XLNetTokenizer.from_pretrained(name, do_lower_case = True)
  return _TOKENIZER_CACHE[name]


def df_to_tensor(data, max_len=256):
  """Turn a frame of patent claims into an inference-ready TensorDataset.

  Args:
    data: DataFrame (or any mapping) whose 'Text' entry is an iterable of
      claim strings.
    max_len: Length every token-id sequence is padded or truncated to.
      Defaults to 256, the value previously hard-coded here.

  Returns:
    A TensorDataset of two aligned tensors: the padded integer token ids and
    a float attention mask that is 1.0 where the token id is greater than 0
    and 0.0 elsewhere.

  Raises:
    TypeError: if an entry of data['Text'] is not a string (the suffix
      concatenation fails, e.g. on a missing value).
  """
  # Suffix appended to every claim before tokenizing.
  # NOTE(review): XLNet's own special tokens are spelled <sep>/<cls>; these
  # literals are kept unchanged on the assumption that they match how the
  # model was fine-tuned - confirm against the training notebook.
  texts = [sentence + " [SEP] [CLS]" for sentence in data['Text']]

  # Loading the tokenizer is expensive and this function is called once per
  # chunk of claims, so the instance is cached instead of re-created each call.
  tokenizer = _get_xlnet_tokenizer()

  # Text -> sentencepiece tokens -> indices in the XLNet vocabulary
  token_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) for text in texts]

  # Pad (with pad_sequences' default value 0) or truncate at the end to max_len
  input_ids = pad_sequences(token_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")

  # Convert data into torch tensors, the required datatype for the model.
  # The mask marks every position whose id is > 0, i.e. everything except the
  # zero padding added above.
  inputs = torch.tensor(input_ids)
  masks = torch.tensor(input_ids > 0, dtype=torch.float)

  return TensorDataset(inputs, masks)

Inference

In [None]:
def inference(input_data,model,bs):
  """Run `model` over `input_data` and collect the predicted class per row.

  Args:
    input_data: Dataset yielding (input_ids, attention_mask) pairs.
    model: Callable accepting `input_ids`, `token_type_ids` and
      `attention_mask`, and returning an object with a `.logits` tensor.
    bs: Batch size handed to the DataLoader.

  Returns:
    A list with one 1-D numpy array per batch, each holding the argmax class
    index of every row in that batch (in dataset order).

  Tensors are moved to the module-level `device` before the forward pass.
  """
  loader = DataLoader(input_data, batch_size=bs)
  batch_predictions = []

  # Gradients are never needed here; disabling them saves memory and time.
  with torch.no_grad():
    for ids, mask in loader:
      ids = ids.to(device)
      mask = mask.to(device)

      result = model(ids, token_type_ids=None, attention_mask=mask)

      # Bring the logits back to host memory and keep the highest-scoring class
      scores = result.logits.detach().cpu().numpy()
      batch_predictions.append(np.argmax(scores, axis=1).flatten())

  return batch_predictions


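The inference cell below is designed to be resumable across multiple sessions: the predictions are saved to Drive after every chunk of claims, and a later run continues from the row where the previous one stopped.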

In [None]:

# Checkpoint holding the running list of predicted labels. The same path is
# used for reading and writing so a later session resumes from what this one saved.
labels_path = r'/content/drive/MyDrive/data/all_labelled_claims.pkl'

# Resume from a previous session if a checkpoint exists, otherwise start fresh
# (no manual code edit needed for the first run).
try:
  with open(labels_path, 'rb') as file:
      labels = pickle.load(file)
  print(f"Resuming from checkpoint with {len(labels)} predictions")
except FileNotFoundError:
  labels = []
  print("No checkpoint found, starting from the first row")


# Imports inference model
model_path = '/content/drive/MyDrive/Colab Notebooks/Trained Models/XLNet/2e-05_0.01_32'
model = XLNetForSequenceClassification.from_pretrained(model_path,num_labels = 2)
# Use the same device the inference function moves its batches to, so the
# CPU fallback keeps working when no GPU is attached.
model.to(device)
model.eval()


# Find row where last iteration stopped: one label is stored per processed row
start_loc = len(labels)

batch_size = 256

# Continue loop of inference
try:
  while start_loc < len(data):
    print(start_loc)
    # Positional slice, matching the positional resume offset above
    data_subset = data.iloc[start_loc:start_loc + batch_size]

    input_data = df_to_tensor(data_subset)

    pred = inference(input_data,model,batch_size)

    # inference returns one array per DataLoader batch; keep all of them
    # rather than only the first.
    labels.extend(np.concatenate(pred).tolist())

    start_loc = len(labels)

    # Checkpoint after every chunk so an interrupted session loses at most one chunk
    with open(labels_path, 'wb') as file:
        pickle.dump(labels, file)

except Exception as e:
  # Best effort: report the failure and keep what has been checkpointed so far
  print(f"An error occurred: {e}")
  print(f"Stopped at row {start_loc}; re-run this cell to resume from the last saved checkpoint.")

