## Patent classification with an LSTM

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.layers import Bidirectional, Dense, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

# Libraries to import and process the data set
from Utilities import directories
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from datetime import datetime

# Stuff that makes the notebook look nicer
import warnings
warnings.filterwarnings('ignore')

In [2]:
### --------- Load and transform data ---------- ###
with open(directories.data + "/example_extract.txt", "rb") as file:   # Unpickling
    data = pickle.load(file)

# Transfer data to pandas ds and set model output (classes) as categorical
data_df = pd.DataFrame(data, index=['Description', 'CPC_label']).transpose()
data_df['CPC_label'] = pd.Categorical(data_df['CPC_label'])
# Transform output to numeric
data_df['CPC_code'] = data_df['CPC_label'].cat.codes
# Shuffle and split into train and test
buffer_size = 10000
data_labels =to_categorical(data_df['CPC_code'])
data_tensor = tf.data.Dataset.from_tensor_slices((data_df["Description"], data_labels))
data_tensor = data_tensor.shuffle(buffer_size)
train = data_tensor.take(int(0.9*len(data_tensor)))
test = data_tensor.skip(int(0.9*len(data_tensor)))
# Batch the data set
batch_size = 1
train = train.batch(batch_size)
test = test.batch(batch_size)
# Print number of batches
print(len(train), len(test))

43 5


In [3]:
# Take a look at the first train batch
for example, label in train.take(1):
    print('texts: ', example.numpy()[0])
    print('labels: ', label.numpy()[0])

texts:  b'<p id="p-0002" num="0001">[0001] The present invention relates to compositions and methods for enhancing the topical application of a benefit agent. The compositions are powder-to-liquid particles comprising a liquid core that is substantially free of water and comprises a polar liquid that has a percent surface polarity of at least 24%, at least one basic benefit or active agent and at least one acidic solubility enhancing agent surrounded by a shell comprising hydrophobic particles. The particles are stable in dry form and yet quickly transform into a liquid or cream-like form when subjected to shear. They can be advantageously formulated with other ingredients, particularly those unstable in the presence of water, into personal care compositions.</p><h4>BACKGROUND OF THE INVENTION</h4><p id="p-0003" num="0002">[0002] It is known that in the presence of a hydrophobic powder, such as a hydrophobic silicon dioxide powder (silicone-coated silica powder), water can be dispersed

In [4]:
### --------- Tokenization and Encoding ---------- ###
VOCAB_SIZE=10000 # Number of unique tokens that will be generated.
# If there are more unique values in the input than the vocab size, the most frequent terms are used.
# Everything else is tokenized to UNK.
# text is lower cased and stripped from punctuation
encoder = TextVectorization(max_tokens=VOCAB_SIZE, name="token_encoder")
# fits the layer to the dataset
# When this layer is adapted, it will analyze the dataset,
# determine the frequency of individual string values, and create a 'vocabulary' from them.
encoder.adapt(train.map(lambda text, label: text))
# show excerpt
vocab = np.array(encoder.get_vocabulary())
print(vocab[:20])
print(len(vocab)) # vocab size changes because train has different constituent after each shuffle

['' '[UNK]' 'the' 'of' 'a' 'in' 'and' 'to' 'or' 'is' 'as' 'for' 'be'
 'with' 'by' 'td' 'at' 'an' 'about' 'from']
10000


In [5]:
# Check quality of tokenization
for example, label in train.take(1):
    encoded_example = encoder(example)[:3].numpy()
    print(encoded_example)
    for n in range(3):
        print("Original: ", example[n].numpy())
        print("Round-trip: ", " ".join(vocab[encoded_example[n]]))
        print()

[[7951 3394 1651 ... 1124 3962    1]]
Original:  b'<h4>RELATED APPLICATIONS</h4><p id="p-0002" num="0001">[0001] This application is a continuation of U.S. patent application Ser. No. 14/492,184, filed Sep. 22, 2014, entitled \xe2\x80\x9cSystems and Methods for Treatment of Acne Vulgaris and Other Conditions with a Topical Nitric Oxide Delivery System,\xe2\x80\x9d by Nicholas V. Perricone, which is a continuation of U.S. patent application Ser. No. 13/801,005, filed Mar. 13, 2013, entitled \xe2\x80\x9cSystems and Methods for Treatment of Acne Vulgaris and Other Conditions with a Topical Nitric Oxide Delivery System,\xe2\x80\x9d by Nicholas V. Perricone, which is a continuation-in-part of U.S. patent application Ser. No. 13/623,008, filed Sep. 19, 2012, entitled \xe2\x80\x9cSystems and Methods for Treatment of Acne Vulgaris and Other Conditions with a Topical Nitric Oxide Delivery System,\xe2\x80\x9d by Nicholas V. Perricone, each of which is incorporated herein by reference in its enti

InvalidArgumentError: slice index 1 of dimension 0 out of bounds. [Op:StridedSlice] name: strided_slice/

In [None]:
### --------- Setup model ---------- ###
model = tf.keras.Sequential([
    encoder,
    Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True,
        name="embedding"),
    Bidirectional(tf.keras.layers.LSTM(64)),
    Dense(64, activation='relu', name="pooled_output"),
    Dense(units=len(data_df['CPC_code'].unique()), name="cpc"), # logit layer with unit size equal to number of unique clsses
],
name="Baseline_Bidirectional_LSTM")

In [None]:
### ------- Set hyperparameters and input and output ------- ###
# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics
loss = CategoricalCrossentropy(from_logits = True)
metric = CategoricalAccuracy('accuracy')

# compile the model
model.compile(loss=loss,
              optimizer=optimizer,
              metrics=metric)

In [None]:
### ------- Train ------- ###
history = model.fit(
    train,
    epochs=10,
    validation_data=test)

Adapted from:
https://www.tensorflow.org/tutorials/text/text_classification_rnn