In [1]:
import pandas as pd
import tqdm
import json

In [4]:
# Base directory (relative to this notebook) that every input and output path below is built from.
root_dir = "../../data/longformer/patentsview/"

In [5]:
# Read the first 1000 rows of the 2019 patent description file, keeping only
# the patent_id, text and length columns.
patents = pd.read_csv(
    root_dir + 'detail_desc_text_2019.tsv',
    sep="\t",
    usecols=['patent_id', 'text', 'length'],
    dtype={'patent_id': object, 'text': object, 'length': int},
    nrows=1000,
)

In [6]:
# ipcr.tsv carries the classification details of each patent. Only the
# patent_id and section columns are read here; per the original note, section
# is expected to take one of 8 letter values (not checked by this code).
ipcr = pd.read_csv(
    root_dir + 'ipcr.tsv',
    sep="\t",
    dtype={'patent_id': object, 'section': object},
    usecols=['patent_id', 'section'],
)

In [7]:
# A patent_id can appear on several ipcr rows. Keep only the first row per
# patent_id so that every patent ends up with exactly one section.
unique_ipcr = ipcr.drop_duplicates(subset='patent_id', keep='first')

In [8]:
# Attach the section of each patent to its description. This is a left join,
# so every row of `patents` is kept even when no ipcr row matches.
data = pd.merge(patents, unique_ipcr, on='patent_id', how='left')

In [None]:
# Build an integer id for every section letter, because the transformer needs
# an integer label. The mapping is also written to labels.json so it can later
# be used for id2text / text2id conversion.
#
# Patents without a matching ipcr row get NaN from the left merge. NaN cannot
# be sorted together with strings (TypeError), so it is left out of the mapping.
sections = data['section'].dropna().unique()
sections.sort()
labels = {section: idx for idx, section in enumerate(sections)}

# Write the mapping straight to disk; `labels` itself stays a plain dict.
with open(root_dir+"labels.json", "w") as f:
    json.dump(labels, f)

In [None]:
# Swap each section letter for its integer id from `labels`; only the
# 'section' column is touched and `data` is modified in place.
data.replace(
    to_replace={'section': labels},
    inplace=True,
)

In [9]:
# Rename the section column to "label" and save the data as two partitions.
# Only 'section' is renamed, and by name: the previous positional assignment
# wrote the 'length' column out as 'legth' and relied on the column order,
# which read_csv(usecols=...) takes from the file rather than from the list.
data = data.rename(columns={'section': 'label'})

# Positional split: the first n_train rows are the training example set,
# everything after that is the test example set.
n_train = 800
data.iloc[:n_train].to_csv(root_dir+'example/letter_train_example.csv', sep=",", index=False)
data.iloc[n_train:].to_csv(root_dir+'example/letter_test_example.csv', sep=",", index=False)