In [1]:
import os
import re
import json
import shutil
import textract
import datetime
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
import subprocess

In [2]:
def driversetup():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36")
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("lang=en")
    options.add_argument("start-maximized")
    options.add_argument("disable-infobars")
    options.add_argument('ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors=yes')
    options.add_argument("--disable-extensions")
    options.add_argument("--incognito")
    options.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(options=options)
    return driver
def setup_directories(base_dir='pdfs'):
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    return base_dir

def getpage(url, pdf_dir, pdf_name):
    pdf_path = os.path.join(pdf_dir, pdf_name)

    if os.path.exists(pdf_path):
        print(f'\tFile skipped, already present. Filename: {pdf_name}')
        return pdf_path

    driver = driversetup()
    driver.get(url)

    try:
        filename = max([f for f in os.listdir('.') if f.endswith('.pdf')], key=os.path.getctime)
        shutil.move(filename, pdf_path)
    except Exception as e:
        print(f'File failed, refer to the following URL: {url}\nError: {str(e)}')
        driver.quit()
        return None

    driver.quit()
    return pdf_path

def main():
    output_dir = '/EFSA_DOCUMENTATION'
    pdfs_dir = setup_directories(output_dir)

    claims = pd.read_excel('data/tree.xlsx')
    res = []
    for k in range(len(claims['EFSA Opinion Reference'])):
        i = claims['EFSA Opinion Reference'][k]
        claim = claims['Claim'][k]
        if pd.isna(i):
            continue
        for j in re.split(',', str(i)):
            args = re.split(';|:', j)
            if len(args) == 3:
                res.append({
                    'url': 'https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.' + args[0].strip() + '.' + args[2].strip(),
                    'claim': claim,
                    'index': k,
                    'args': args
                })

    for entry in tqdm(res):
        url = entry['url']
        claim = entry['claim']
        idx = entry['index']
        pdf_code = entry['args'][0].strip()+"_"+entry['args'][2].strip()
        
        # Directory for this specific PDF document
        pdf_dir = os.path.join(pdfs_dir, pdf_code)
        check = os.path.join(pdfs_dir,'Claim_Doc')
        if not os.path.exists(check):
            if not os.path.exists(pdf_dir):
                os.makedirs(pdf_dir)
        else:
            print(f"Folder for {pdf_code} already exists. Skipping PDF processing.")

        pdf_name = f'{pdf_code}.pdf'
        pdf_path = os.path.join(pdf_dir, pdf_name)
        
        if not os.path.exists(pdf_path):
            pdf_path = getpage(url, pdf_dir, pdf_name)

In [3]:
main()

  6%|▌         | 18/291 [00:35<04:21,  1.04it/s]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1210
Error: max() arg is an empty sequence


  9%|▊         | 25/291 [00:45<05:07,  1.16s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1272
Error: max() arg is an empty sequence


 18%|█▊        | 52/291 [01:39<08:44,  2.19s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1747
Error: max() arg is an empty sequence


 24%|██▍       | 70/291 [02:07<07:13,  1.96s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2011.2078
Error: max() arg is an empty sequence


 31%|███       | 90/291 [02:38<04:20,  1.30s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1798
Error: max() arg is an empty sequence


 35%|███▍      | 101/291 [02:56<03:57,  1.25s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1215
Error: max() arg is an empty sequence


 40%|███▉      | 115/291 [03:20<05:48,  1.98s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.1807
Error: max() arg is an empty sequence


 47%|████▋     | 136/291 [04:27<16:58,  6.57s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2009.1224
Error: max() arg is an empty sequence


 74%|███████▍  | 215/291 [06:34<01:55,  1.51s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2011.2021
Error: max() arg is an empty sequence


 75%|███████▍  | 218/291 [06:42<02:20,  1.93s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.4114
Error: max() arg is an empty sequence


 75%|███████▌  | 219/291 [06:46<02:40,  2.23s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.4114
Error: max() arg is an empty sequence


 76%|███████▌  | 220/291 [06:52<03:20,  2.82s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.4114
Error: max() arg is an empty sequence


 77%|███████▋  | 223/291 [06:56<02:29,  2.20s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2010.4114
Error: max() arg is an empty sequence


 92%|█████████▏| 268/291 [07:47<00:38,  1.65s/it]

File failed, refer to the following URL: https://efsa.onlinelibrary.wiley.com/doi/pdf/10.2903/j.efsa.2011.2075
Error: max() arg is an empty sequence


100%|██████████| 291/291 [08:01<00:00,  1.66s/it]
