In [1]:
# Based on https://github.com/aydwi/usda-dl

import aiofiles
import aiohttp
import asyncio
import pandas as pd

import nest_asyncio # Needed to run on ipykernel: https://medium.com/@vyshali.enukonda/how-to-get-around-runtimeerror-this-event-loop-is-already-running-3f26f67e762e
nest_asyncio.apply()


from bs4 import BeautifulSoup

BASE_URL = "https://usdawatercolors.nal.usda.gov"
INIT_URL = (
    BASE_URL + """/pom/search.xhtml?start={}&searchText=&searchField=&sortField="""
)

MAX_IMAGES = 20 # TODO: Change to 7600 when ran for real.

In [2]:
def chunks(dl_links, n):
    for i in range(0, len(dl_links), n):
        yield dl_links[i : i + n]


def get_fruit_segment(page):
    soup = BeautifulSoup(page, "html.parser")
    for link in soup.find_all("a"):
        if link.get("href").startswith("/pom/catalog.xhtml?id="):
            if not link.get("href") in fruits:
                fruits.append(link.get("href"))


async def collection_helper(session, url):
    async with session.get(url) as response:
        return await response.text()


async def collect_fruits(page_segment):
    async with aiohttp.ClientSession() as session:
        page = await collection_helper(session, INIT_URL.format(page_segment))
        try:
            get_fruit_segment(page)
            print(f"Collected fruit links from page segment starting at: start={page_segment}")
        except Exception as e:
            print(e)


async def download_fruit_image(image_url):
    file_name = image_url.split("=")[1]
    async with aiohttp.ClientSession() as session:
        async with session.get(image_url) as response:
            if response.status == 200:
                f = await aiofiles.open("{}.jpg".format(file_name), mode="wb+")
                await f.write(await response.read())
                print(f"Downloaded image {file_name}.jpg")
                await f.close()
                


In [4]:
fruits = []
dl_links = []

loop = asyncio.get_event_loop()
page_segments = [i for i in range(0, MAX_IMAGES, 20)]

print("\n\n---Starting link collection---\n")

loop.run_until_complete(
    asyncio.gather(*[collect_fruits(args) for args in page_segments])
)

print("\n\n---Finished link collection---\n")
print(f"Found {len(fruits)} total fruits")

# Write the list 'fruits' to a file at this point if you'd like.
for fruit in fruits:
    dl_link = BASE_URL + fruit.split("&")[0]
    dl_link = dl_link.replace("catalog", "download")
    dl_links.append(dl_link)

# Download in asynchronous bursts of 100 images, for sanity. Tweak CHUNK_SIZE if you'd like, best performance at 7584.
CHUNK_SIZE = 100

dl_links_chunked = list(chunks(dl_links, CHUNK_SIZE))

print("\n\n---Starting image download, be patient---\n")

columns = ['pomid', 'artist', 'scientific name', 'common name', 'geographic origin', 
           'physical description', 'specimen', 'year', 'notes on original', 'date created'] # Rights column is the same accross entire dataset.
df = pd.DataFrame(columns=columns)


"""
TODO: Modify script to create dataframe to save image metadata with correct columns. End result will spit out a CSV.

For each ID:
    Grab Image
    Grab Metadata, add into dataframe

"""



for burst in dl_links_chunked:
    # Add a little delay here to be gentler on the server.
    loop.run_until_complete(
        asyncio.gather(*[download_fruit_image(args) for args in burst]) # TODO: Switch to pass ID to hit text as well.
    )



---Starting link collection---

Collected fruit links from page segment starting at: start=0


---Finished link collection---

Found 20 total fruits


---Starting image download, be patient---



In [5]:
df

Unnamed: 0,pomid,artist,scientific name,common name,geographic origin,physical description,specimen,year,notes on original,date created


In [None]:
df.to_csv('usda_pomological.csv')