This notebook constructs the data set that is annotated later on. To do so, it randomly samples a number of patents and then splits the selected patents into a train set and a test set. The train set is used to train a Detectron2 model, and the test set is used for testing.

In [2]:
import sys
sys.path.insert(1, '../')
import pandas as pd
import os
import re
import json
from shutil import copytree
import random
from shutil import copyfile
from os import mkdir
from pdf2image import convert_from_path, convert_from_bytes
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError
)
# Root folders of the seeded random patent sample, one per artefact type.
# Folder walked further down to collect the '.txt' files
# (NOTE(review): the folder name suggests Tesseract OCR output - confirm).
PATH_TO_TXTS = '/Volumes/Non-Backup_Files/GB-patents/random_sample_seeded_txt (tesseract)'
# Folder walked below to collect the patent PDF files.
PATH_TO_PDFS = '/Volumes/Non-Backup_Files/GB-patents/random_sample_seeded'
# Same location the PDF-to-image cell writes its per-patent PNG folders to.
PATH_TO_IMGS = '/Volumes/Non-Backup_Files/GB-patents/random_sample_seeded_imgs'
# NOTE(review): not referenced by any visible cell - presumably the folder
# holding the sampled images used for the model; confirm before relying on it.
PATH_TO_IMG_FOR_MODEL = '/Volumes/Non-Backup_Files/GB-patents/random_sample_seeded_for_model'

In [152]:
# Recursively collect the path of every file below PATH_TO_PDFS whose name
# ends in 'pdf'.
pdfs = [
    '{}/{}'.format(folder, file_name)
    for folder, _subfolders, file_names in os.walk(PATH_TO_PDFS)
    for file_name in file_names
    if file_name.endswith('pdf')
]

In [153]:
# Convert every PDF into one PNG per page, stored as
# PATH_TO_IMGS/<patent number>/<page index>.png
for pdf_file in pdfs:
    # the file name up to its first '.' is used as the patent number
    patent_nb = pdf_file.split('/')[-1].split('.')[0]
    # convert first, so no folder is created for a PDF that fails to convert
    imgs = convert_from_path(pdf_file)
    patent_img_dir = os.path.join(PATH_TO_IMGS, patent_nb)
    # exist_ok=True lets the cell be re-run without failing on folders that
    # were already created by a previous (possibly interrupted) run
    os.makedirs(patent_img_dir, exist_ok=True)
    for page_idx, page_img in enumerate(imgs):
        page_img.save(os.path.join(patent_img_dir, '{}.png'.format(page_idx)), format='png')

In [154]:
# Draw the random sample of patents (70% of all PDFs) that is used for the model.
# The seed is fixed so the draw is repeatable for the same `pdfs` list.
random.seed(42)
patents = []
for pdf_path in pdfs:
    pdf_name = pdf_path.split('/')[-1]
    # keep only the part before the first '.', i.e. the patent number
    patents.append(pdf_name.split('.')[0])
sample_size = int(0.7 * len(patents))
patent_sample = random.sample(patents, sample_size)

In [93]:
# #putting them in correct directory (to run ONCE)
# for patent_nb in patent_sample:
#     copytree('/Volumes/Non-Backup_Files/GB-patents/random_sample_seeded_imgs/{}'.format(patent_nb), '/Volumes/Non-Backup_Files/GB-patents/random_sample_seeded_for_model/all/{}'.format(patent_nb))

In [165]:
# Names of the patent folders used for the model: every entry of the 'all'
# folder whose name contains 'GB' (other entries are ignored).
all_patents_dir = '/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/all'
seeded_model_data = [entry for entry in os.listdir(all_patents_dir) if 'GB' in entry]


In [166]:
def get_num_imgs(d):
    """
    Count the files stored directly inside a patent directory.

    Every regular file counts, whatever its extension; sub-directories
    are neither counted nor descended into.

    Args:
    d - path of the patent directory

    Returns:
    the number of files found directly in d
    """
    return sum(
        1
        for entry in os.listdir(d)
        if os.path.isfile(os.path.join(d, entry))
    )

In [167]:
# Map each patent folder name to the number of files it contains.
num_imgs = {}
for patent_dir in seeded_model_data:
    num_imgs[patent_dir] = get_num_imgs('/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/all/{}'.format(patent_dir))


In [168]:
# Find the index that splits the shuffled patents into a test and a train part.
import random
random.seed(42)
# NOTE(review): seeded_model_data comes from os.listdir, whose order is
# arbitrary, so the fixed seed alone may not reproduce the same split on
# another machine - confirm.
random.shuffle(seeded_model_data)
# random.shuffle(num_imgs)
test_size = int(0.3 * sum(num_imgs.values()))
test_set = {}
# The test set should hold about 30% of all images. Whole patents are taken
# from the front of the shuffled list until that many images are reached, so
# every image of a given patent lands in exactly one of the two sets.
acc = 0           # images assigned to the test set so far
test_set_sep = 0  # number of leading patents that form the test set
for patent in seeded_model_data:
    if acc >= test_size:
        break
    acc += num_imgs[patent]
    test_set_sep += 1

In [169]:
test_set_sep

556

In [170]:
# Cut the shuffled patent list at the split index: the leading patents form
# the test set, the remaining ones the train set.
test_set, train_set = seeded_model_data[:test_set_sep], seeded_model_data[test_set_sep:]

In [171]:
# Total number of images over all patents behind the split index (train set).
sum(num_imgs[patent] for patent in seeded_model_data[test_set_sep:])

6432

In [172]:
# Total number of images over all patents before the split index (test set).
sum(num_imgs[patent] for patent in seeded_model_data[:test_set_sep])

2765

In [173]:
# Patent counts: (test set, train set, both sets combined).
n_test, n_train = len(test_set), len(train_set)
(n_test, n_train, n_test + n_train)

(556, 1124, 1680)

In [174]:
# Copy each test patent's folder from 'all' to 'new_test', first in the plain
# image tree and then in the annotated tree.
for patent_nb in test_set:
    for src_tpl, dst_tpl in (
        ('/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/all/{}', '/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/new_test/{}'),
        ('/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/all/{}', '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_test/{}'),
    ):
        copytree(src_tpl.format(patent_nb), dst_tpl.format(patent_nb))

In [175]:
# Copy each train patent's folder from 'all' to 'new_train', first in the plain
# image tree and then in the annotated tree.
for patent_nb in train_set:
    for src_tpl, dst_tpl in (
        ('/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/all/{}', '/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/new_train/{}'),
        ('/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/all/{}', '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/new_train/{}'),
    ):
        copytree(src_tpl.format(patent_nb), dst_tpl.format(patent_nb))

# DEPRECATED

In [14]:
# DEPRECATED (OLD WAY OF GETTING TEST AND TRAIN SPLIT, IT IS DONE IN NOTEBOOK "Dataset Creation for Detectron2.ipynb")

# Load the old train/test sets. These were split per image, so the images of
# one patent could end up partly in the test set and partly in the train set.
with open('/Volumes/Non-Backup_Files/GB-patents/json/local/coco/test_data_reduced.json', 'r') as test_fh:
    test = json.load(test_fh)
with open('/Volumes/Non-Backup_Files/GB-patents/json/local/coco/train_data_reduced.json', 'r') as train_fh:
    train = json.load(train_fh)

In [47]:
# Examine the test and train sets as constructed in Dataset Creation for Detectron2.ipynb

# Patent names: the second-to-last component of each image's file path.
test_patents = {img['file_name'].split('/')[-2] for img in test['images']}
train_patents = {img['file_name'].split('/')[-2] for img in train['images']}
# Image keys: the last two path components joined by '/', with '.png' removed.
test_patent_imgs = {'/'.join(img['file_name'].split('/')[-2:]).replace('.png','') for img in test['images']}
train_patents_imgs = {'/'.join(img['file_name'].split('/')[-2:]).replace('.png','') for img in train['images']}

In [49]:
# Sanity check: the number of train images equals int(0.7 * total number of
# images) of the old split.
len(train_patents_imgs) == int(0.7 * (len(train_patents_imgs) + len(test_patent_imgs)))

True

In [54]:
# Spot check: is this particular patent part of the old test set?
'GB188511990A' in test_patents

True

In [50]:
from shutil import copyfile
from os import mkdir

In [51]:
# CREATING PATENT FOLDERS
# One folder per patent, first in the annotated tree and then in the plain
# image tree, for the test patents followed by the train patents.
for patent_name in test_patents:
    for folder_tpl in (
        '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/test/{}',
        '/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/test/{}',
    ):
        mkdir(folder_tpl.format(patent_name))

for patent_name in train_patents:
    for folder_tpl in (
        '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/train/{}',
        '/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/train/{}',
    ):
        mkdir(folder_tpl.format(patent_name))

In [66]:
# COPYING ITEMS FOR TEST PATENTS
# Each element of test_patent_imgs is a '<patent>/<image>' key without the
# '.png' extension, so it can be appended to the source and target roots.
for patent_name in test_patent_imgs:
    try:
        copyfile('/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/all/{}.xml'.format(patent_name), '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/test/{}.xml'.format(patent_name))
    except FileNotFoundError:
        # best effort: presumably not every image has an .xml annotation file.
        # Only a missing file is skipped; any other error is raised.
        pass
    # the annotated image itself is mandatory: a failure here stops the cell
    copyfile('/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/all/{}.png'.format(patent_name), '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/test/{}.png'.format(patent_name))
    try:
        copyfile('/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/all/{}.png'.format(patent_name), '/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/test/{}.png'.format(patent_name))
    except FileNotFoundError:
        # best effort: skip images that are missing from the plain image tree
        pass

In [68]:
# COPYING ITEMS FOR TRAIN PATENTS
# Each element of train_patents_imgs is a '<patent>/<image>' key without the
# '.png' extension, so it can be appended to the source and target roots.
for patent_name in train_patents_imgs:
    try:
        copyfile('/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/all/{}.xml'.format(patent_name), '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/train/{}.xml'.format(patent_name))
    except FileNotFoundError:
        # best effort: presumably not every image has an .xml annotation file.
        # Only a missing file is skipped; any other error is raised.
        pass
    # the annotated image itself is mandatory: a failure here stops the cell
    copyfile('/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/all/{}.png'.format(patent_name), '/Volumes/Non-Backup_Files/GB-patents/annotated_seeded_data_for_model/train/{}.png'.format(patent_name))
    try:
        copyfile('/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/all/{}.png'.format(patent_name), '/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/train/{}.png'.format(patent_name))
    except FileNotFoundError:
        # best effort: skip images that are missing from the plain image tree
        pass

# SCRAP

In [None]:
# X_train, X_test, y_train, y_test 
#     = train_test_split(X, y, test_size=0.3, random_state=42)

# X_train, X_val, y_train, y_val 
#     = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [64]:
# Wrap the list of PDF paths in a pandas Series (despite the `_df` suffix this
# is a Series, not a DataFrame).
pdfs_df = pd.Series(pdfs)

In [7]:
# Recursively collect the path of every file below PATH_TO_TXTS whose name
# ends in 'txt'.
txts = [
    '{}/{}'.format(folder, file_name)
    for folder, _subfolders, file_names in os.walk(PATH_TO_TXTS)
    for file_name in file_names
    if file_name.endswith('txt')
]

In [8]:
txts[0]

'/Volumes/Non-Backup_Files/GB-patents/random_sample_seeded_txt (tesseract)/GB0725820A.txt'