In [7]:
import os
import re
import json
import shutil
import textract
import datetime
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
import subprocess

In [8]:
""" 
Some of the code in this script is based on the following sources:
https://colab.research.google.com/drive/1RU0TysCxnoT1sjy3l__DGVdsPWLAG1wT#scrollTo=Yq09i0jnczdi
created by Alessandro Corvi
"""
def driversetup():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36")
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("lang=en")
    options.add_argument("start-maximized")
    options.add_argument("disable-infobars")
    options.add_argument('ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors=yes')
    options.add_argument("--disable-extensions")
    options.add_argument("--incognito")
    options.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(options=options)
    return driver
def setup_directories(base_dir='pdfs'):
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    return base_dir

def getpage(url, pdf_dir, pdf_name):
    pdf_path = os.path.join(pdf_dir, pdf_name)

    if os.path.exists(pdf_path):
        print(f'\tFile skipped, already present. Filename: {pdf_name}')
        return pdf_path

    driver = driversetup()
    driver.get(url)

    try:
        filename = max([f for f in os.listdir('.') if f.endswith('.pdf')], key=os.path.getctime)
        shutil.move(filename, pdf_path)
    except Exception as e:
        print(f'File failed, refer to the following URL: {url}\nError: {str(e)}')
        driver.quit()
        return None

    driver.quit()
    return pdf_path

def main():
    output_dir = 'RootDirectoryOfPatentData'
    pdfs_dir = setup_directories(output_dir)

    claims = pd.read_excel('data/tree.xlsx')
    res = []
    for k in range(len(claims['EFSA Opinion Reference'])):
        i = claims['EFSA Opinion Reference'][k]
        claim = claims['Claim'][k]
        if pd.isna(i):
            continue
        for j in re.split(',', str(i)):
            args = re.split(';|:', j)
            if len(args) == 3:
                res.append({
                    'url': 'https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.' + args[0].strip() + '.' + args[2].strip(),
                    'claim': claim,
                    'index': k,
                    'args': args
                })

    for entry in tqdm(res):
        url = entry['url']
        claim = entry['claim']
        idx = entry['index']
        pdf_code = entry['args'][0].strip()+"_"+entry['args'][2].strip()
        
        # Directory for this specific PDF document
        pdf_dir = os.path.join(pdfs_dir, pdf_code)
        check = os.path.join(pdfs_dir,'Claim_Doc')
        if not os.path.exists(check):
            if not os.path.exists(pdf_dir):
                os.makedirs(pdf_dir)
        else:
            print(f"Folder for {pdf_code} already exists. Skipping PDF processing.")

        pdf_name = f'{pdf_code}.pdf'
        pdf_path = os.path.join(pdf_dir, pdf_name)
        
        if not os.path.exists(pdf_path):
            pdf_path = getpage(url, pdf_dir, pdf_name)

In [9]:
main()

  1%|          | 3/291 [00:15<22:44,  4.74s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2011.2049
Error: max() arg is an empty sequence


 12%|█▏        | 35/291 [01:01<06:52,  1.61s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1764
Error: max() arg is an empty sequence


 15%|█▌        | 44/291 [01:16<07:51,  1.91s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2012.2713
Error: max() arg is an empty sequence


 18%|█▊        | 52/291 [01:33<07:19,  1.84s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1747
Error: max() arg is an empty sequence


 18%|█▊        | 53/291 [01:35<07:42,  1.94s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1211
Error: max() arg is an empty sequence


 19%|█▊        | 54/291 [01:37<07:29,  1.90s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1211
Error: max() arg is an empty sequence


 19%|█▉        | 55/291 [01:40<08:33,  2.17s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2011.2079
Error: max() arg is an empty sequence


 22%|██▏       | 65/291 [01:52<04:31,  1.20s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1263
Error: max() arg is an empty sequence


 25%|██▌       | 74/291 [02:09<05:01,  1.39s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1486
Error: max() arg is an empty sequence


 27%|██▋       | 79/291 [02:21<06:28,  1.83s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1797
Error: max() arg is an empty sequence


 27%|██▋       | 80/291 [02:23<06:41,  1.90s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1213
Error: max() arg is an empty sequence


 28%|██▊       | 81/291 [02:25<06:56,  1.98s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1760
Error: max() arg is an empty sequence


 29%|██▉       | 84/291 [02:31<06:35,  1.91s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1760
Error: max() arg is an empty sequence


 31%|███       | 89/291 [02:39<05:33,  1.65s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1258
Error: max() arg is an empty sequence


 32%|███▏      | 92/291 [02:45<05:33,  1.68s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1464
Error: max() arg is an empty sequence


 33%|███▎      | 95/291 [02:50<05:37,  1.72s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1214
Error: max() arg is an empty sequence


 34%|███▎      | 98/291 [02:54<04:41,  1.46s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1214
Error: max() arg is an empty sequence


 38%|███▊      | 111/291 [03:05<02:11,  1.37it/s]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1236
Error: max() arg is an empty sequence


 39%|███▉      | 114/291 [03:12<03:29,  1.19s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1763
Error: max() arg is an empty sequence


 44%|████▎     | 127/291 [03:25<02:42,  1.01it/s]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1808
Error: max() arg is an empty sequence


 46%|████▌     | 134/291 [03:42<05:16,  2.02s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1745
Error: max() arg is an empty sequence


 46%|████▋     | 135/291 [03:44<05:19,  2.05s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2011.2304
Error: max() arg is an empty sequence


 47%|████▋     | 136/291 [03:48<06:19,  2.45s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1224
Error: max() arg is an empty sequence


 47%|████▋     | 138/291 [03:54<06:34,  2.58s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1224
Error: max() arg is an empty sequence


 51%|█████     | 147/291 [04:03<02:57,  1.24s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1218
Error: max() arg is an empty sequence


 52%|█████▏    | 152/291 [04:11<03:08,  1.36s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1747
Error: max() arg is an empty sequence


 54%|█████▍    | 158/291 [04:19<02:44,  1.24s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1813
Error: max() arg is an empty sequence


 56%|█████▌    | 163/291 [04:25<02:35,  1.22s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1811
Error: max() arg is an empty sequence


 57%|█████▋    | 165/291 [04:28<02:27,  1.17s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1811
Error: max() arg is an empty sequence


 59%|█████▉    | 172/291 [04:46<04:41,  2.37s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2011.2043
Error: max() arg is an empty sequence


 60%|██████    | 175/291 [04:53<04:08,  2.14s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1814
Error: max() arg is an empty sequence


 64%|██████▎   | 185/291 [05:08<02:46,  1.57s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1220
Error: max() arg is an empty sequence


 64%|██████▍   | 186/291 [05:09<02:50,  1.62s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1727
Error: max() arg is an empty sequence


 65%|██████▍   | 188/291 [05:15<03:23,  1.97s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1220
Error: max() arg is an empty sequence


 65%|██████▌   | 190/291 [05:17<02:41,  1.59s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1220
Error: max() arg is an empty sequence


 68%|██████▊   | 197/291 [05:30<02:45,  1.76s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2011.2072
Error: max() arg is an empty sequence


 71%|███████   | 207/291 [05:46<02:10,  1.56s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1221
Error: max() arg is an empty sequence


 71%|███████▏  | 208/291 [05:48<02:13,  1.60s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1221
Error: max() arg is an empty sequence


 72%|███████▏  | 209/291 [05:50<02:15,  1.65s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1754
Error: max() arg is an empty sequence


 74%|███████▍  | 215/291 [05:59<01:42,  1.35s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2011.2021
Error: max() arg is an empty sequence


 75%|███████▍  | 218/291 [06:06<02:08,  1.77s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.4114
Error: max() arg is an empty sequence


 75%|███████▌  | 219/291 [06:10<02:34,  2.15s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.4114
Error: max() arg is an empty sequence


 76%|███████▌  | 220/291 [06:11<02:24,  2.04s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.4114
Error: max() arg is an empty sequence


 77%|███████▋  | 223/291 [06:15<01:50,  1.63s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.4114
Error: max() arg is an empty sequence


 78%|███████▊  | 228/291 [06:23<01:42,  1.63s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1225
Error: max() arg is an empty sequence


 87%|████████▋ | 254/291 [06:36<00:16,  2.20it/s]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1227
Error: max() arg is an empty sequence


 89%|████████▉ | 260/291 [06:41<00:17,  1.76it/s]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1468
Error: max() arg is an empty sequence


 90%|█████████ | 262/291 [06:43<00:17,  1.63it/s]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1468
Error: max() arg is an empty sequence


 92%|█████████▏| 267/291 [06:52<00:25,  1.08s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2011.2074
Error: max() arg is an empty sequence


 93%|█████████▎| 272/291 [07:00<00:24,  1.30s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1229
Error: max() arg is an empty sequence


100%|██████████| 291/291 [07:07<00:00,  1.47s/it]
