### Lightweight Notebook to Quickly Retrieve All FOIA PDFs from the NRO Site
This notebook retrieves all of the FOIA PDFs in the CAL Library Listing on NRO's website. These are documents related to CORONA, ARGON, and LANYARD.

In [None]:
import bs4
from datetime import datetime
from pathlib import Path 
import requests

In [None]:
# Destination directory for the downloaded files: three levels above the
# current working directory, under processing/nro_declassified/pdfs.
output_folder = Path.cwd().parent.parent.parent / 'processing' / 'nro_declassified' / 'pdfs'
base_url = 'https://www.nro.gov'

# Fetch the CAL Library Listing page. Without a timeout, requests can wait
# forever on a stalled connection, so bound the wait explicitly.
r = requests.get(
    f'{base_url}/foia-home/foia-declassified-major-nro-programs-and-projects/CAL-Library-Listing/',
    timeout=60,
)
# Fail loudly here rather than letting a later cell parse an HTTP error page
# as if it were the listing.
r.raise_for_status()

In [None]:
# Parse the fetched listing page with the lxml parser, then gather every
# <tr> element in the document. Rows from all tables are collected here.
soup = bs4.BeautifulSoup(markup=r.text, features='lxml')
rows = soup.find_all(name='tr')

In [None]:
# Download every document linked from the 7-column listing table into
# `output_folder`, naming each file '<date><title>_<id>.<ext>'. The ID of any
# document that could not be fetched is printed.
from datetime import timezone  # cell-local import: only the fallback date below needs it

# Deletion table for title characters: those not allowed in a Windows file
# name, plus ',', '.' and '-'. The backslash is escaped explicitly; a bare
# '\|' is an invalid escape sequence that newer Python versions warn about.
forbidden_chars = str.maketrans('', '', '<>:"/\\|?*,.-')

# Make sure the destination exists so the first write does not fail.
output_folder.mkdir(parents=True, exist_ok=True)

for row in rows:
    cells = row.find_all('td')
    if len(cells) != 7:  # there are other tables on the page but ours is 7 cells wide
        continue

    anchor = cells[1].find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        continue  # no document linked from this row, so nothing to download

    doc_id = cells[0].text
    ext = href.split('.')[-1]

    name = cells[2].text.translate(forbidden_chars)
    name = name[:60]  # low value because there is also a Windows path-length concern

    date = cells[3].text.replace('(Estimated)', '').strip()  # text randomly included with date
    try:
        date = datetime.strptime(date, '%m/%d/%Y').date().isoformat()
    except ValueError:
        # Unparseable date: fall back to today's UTC date as a junk placeholder.
        date = datetime.now(timezone.utc).date().isoformat()

    file_name = f'{date}{name}_{doc_id}.{ext}'
    try:
        # Use a dedicated name so the listing-page response `r` is not clobbered.
        response = requests.get(f'{base_url}{href}', timeout=60)
    except requests.RequestException:
        # A network failure on one document should not abort the whole run;
        # report it the same way as an HTTP error and move on.
        print(doc_id)
        continue

    if response.ok:
        with open(output_folder / file_name, 'wb') as f:
            f.write(response.content)
    else:
        print(doc_id)

In [None]:
# Report which document IDs in the expected range have no downloaded file.
# NOTE(review): `pdfs` is not defined in this cell; presumably it is an
# iterable of the downloaded file paths. Confirm against the cell that builds it.

# The ID is the text after the last '_' in each path, with '.pdf' removed.
have = [int(str(file).replace('.pdf', '').split('_')[-1]) for file in pdfs]
want = range(1, 2359, 1)
# Look up against a set so each membership test is constant time instead of a
# scan over the whole `have` list; `have` itself stays a list.
have_ids = set(have)
missing = [str(i) for i in want if i not in have_ids]
print(f'The following IDs do not have associated PDFs downloaded in this file system: {(",").join(missing)}')

I manually reviewed the missing records, and they are broken links on NRO's site as well. I emailed NRO to inform them of the broken links and site functionality.

In [None]:
from pathlib import Path