In [1]:
import pandas as pd

labels = pd.read_csv("ex1_labels.csv", index_col=0)
features = pd.read_csv("../../Utilities/statistics.csv", index_col=0)
patent_data = pd.concat([features, labels], axis=1)
patent_data

Unnamed: 0,path,claim,description,abstract,citation,labeled,level1labels
AP1605A,E:\MLData\thesis\Datasets\LexisNexis\AP1605A.xml,False,False,True,True,False,
AP1665A,E:\MLData\thesis\Datasets\LexisNexis\AP1665A.xml,False,False,True,True,True,Skin care
AP1682A,E:\MLData\thesis\Datasets\LexisNexis\AP1682A.xml,False,False,True,True,True,
AP1904A,E:\MLData\thesis\Datasets\LexisNexis\AP1904A.xml,False,False,True,True,True,Hair care
AP1937A,E:\MLData\thesis\Datasets\LexisNexis\AP1937A.xml,False,False,True,True,True,Packaging
...,...,...,...,...,...,...,...
YU75202A,E:\MLData\thesis\Datasets\LexisNexis\YU75202A.xml,False,False,True,False,True,Packaging
YU82803A,E:\MLData\thesis\Datasets\LexisNexis\YU82803A.xml,False,False,True,False,True,Packaging
YU86802A,E:\MLData\thesis\Datasets\LexisNexis\YU86802A.xml,False,False,True,False,True,Packaging
YU86902A,E:\MLData\thesis\Datasets\LexisNexis\YU86902A.xml,False,False,True,False,True,Packaging


In [2]:
patent_data = patent_data[patent_data["level1labels"].notna()] # drop unlabeled patents
patent_data = patent_data[patent_data["abstract"] == 1] # drop patents that don't contain an abstract
print(f"Number of examples: {labels.size}")
print(patent_data["level1labels"].value_counts())

Number of examples: 220709
Skin care                       35016
Packaging                       25565
Health care                     24507
Hair care                       24115
Cleansing                       11833
Sun                              9177
Perfume                          6703
Deo                              4734
Non woven                        3363
Decorative cosmetic              2017
Lip care                         1900
Manufacturing technology         1890
Shaving                          1108
Sustainability                    665
Personalization                   162
Artificial Intelligence (AI)        6
Name: level1labels, dtype: int64


In [3]:
# drop AI because of the small number of instances
patent_data = patent_data[patent_data["level1labels"] != "Artificial Intelligence (AI)"]

In [4]:
# convert labels to categorical and create integer codes
patent_data["level1labels"] = pd.Categorical(patent_data["level1labels"])
patent_data["level1codes"] = patent_data["level1labels"].cat.codes
patent_data

Unnamed: 0,path,claim,description,abstract,citation,labeled,level1labels,level1codes
AP1665A,E:\MLData\thesis\Datasets\LexisNexis\AP1665A.xml,False,False,True,True,True,Skin care,12
AP1904A,E:\MLData\thesis\Datasets\LexisNexis\AP1904A.xml,False,False,True,True,True,Hair care,3
AP1937A,E:\MLData\thesis\Datasets\LexisNexis\AP1937A.xml,False,False,True,True,True,Packaging,8
AP2011006030A0,E:\MLData\thesis\Datasets\LexisNexis\AP2011006...,False,False,True,False,True,Sun,13
AP2015008920A0,E:\MLData\thesis\Datasets\LexisNexis\AP2015008...,False,False,True,False,True,Health care,4
...,...,...,...,...,...,...,...,...
YU75202A,E:\MLData\thesis\Datasets\LexisNexis\YU75202A.xml,False,False,True,False,True,Packaging,8
YU82803A,E:\MLData\thesis\Datasets\LexisNexis\YU82803A.xml,False,False,True,False,True,Packaging,8
YU86802A,E:\MLData\thesis\Datasets\LexisNexis\YU86802A.xml,False,False,True,False,True,Packaging,8
YU86902A,E:\MLData\thesis\Datasets\LexisNexis\YU86902A.xml,False,False,True,False,True,Packaging,8


In [5]:
import pathlib
from to_tfrecord import TFRecordsConverter
from Utilities.directories import data

# Convert to tfRecord
if __name__ == '__main__':
    output_dir = pathlib.Path.joinpath(data, "1.Abstract-SingleClass")
    converter = TFRecordsConverter(patent_data, output_dir, 0.1, 0.1)
    converter.convert()

100%|██████████| 3/3 [1:19:31<00:00, 1590.66s/it]


Splitting train set into TFRecord shards...
Splitting test set into TFRecord shards...
Splitting validate set into TFRecord shards...
Number of training examples: 122203
Number of testing examples: 15276
Number of validation examples: 15276
TFRecord files saved to E:\MLData\thesis\Datasets\1.Abstract-SingleClass


In [44]:
# Load Huggingface transformer
from transformers import BertConfig, BertTokenizerFast

# Then what you need from tensorflow.keras
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from Utilities.directories import lexis_abstract
from tqdm import tqdm

train_files = tf.data.Dataset.list_files(file_pattern=str(lexis_abstract) + "\*train*.tfrec")
test_files = tf.data.Dataset.list_files(file_pattern=str(lexis_abstract) + "\*test*.tfrec")
validation_files = tf.data.Dataset.list_files(file_pattern=str(lexis_abstract) + "\*validate*.tfrec")

features = {
    'abstract': tf.io.FixedLenFeature([], tf.string),
    'label': tf.io.FixedLenFeature([], tf.int64)
}

test_dataset = tf.data.TFRecordDataset(filenames=test_files, compression_type="ZLIB")


def _parse_function(example_proto, features):
    return tf.io.parse_single_example(example_proto, features)

def select_data_from_record(record):
    x = record['abstract']
    y = record['label']
    return (x, y)

parsed_dataset = test_dataset.map(lambda record: _parse_function(record, features))
parsed_dataset = parsed_dataset.map(select_data_from_record)

for p in parsed_dataset:
    print(p)
    break


# https://colab.research.google.com/drive/1yWaLpCWImXZE2fPV0ZYDdWWI8f52__9A#scrollTo=BmMlC1i0COBW

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
(<tf.Tensor: shape=(), dtype=string, numpy=b'A device for applying a cosmetic or for adding finishing touches to makeup, the device comprising a heater member comprising a resistive track that is printed or that is made by etching a coating that is deposited on a substrate, the device comprising a temperature sensor that is disposed level with the heater member, the temperature sensor being in contact with the substrate, in particular in the form of a circuit that is printed or etched on the substrate.'>, <tf.Tensor: shape=(), dtype=int64, numpy=8>)
