In [48]:
# Based on https://github.com/aydwi/usda-dl

import asyncio
import os

import aiofiles
import aiohttp
import nest_asyncio # Needed to run on ipykernel: https://medium.com/@vyshali.enukonda/how-to-get-around-runtimeerror-this-event-loop-is-already-running-3f26f67e762e
import pandas as pd
import requests
from bs4 import BeautifulSoup

nest_asyncio.apply()

# Root of the USDA Pomological Watercolors site; every relative link is resolved against it.
BASE_URL = "https://usdawatercolors.nal.usda.gov"

# Search-results URL template. The single `{}` placeholder receives the `start` offset.
INIT_URL = f"{BASE_URL}/pom/search.xhtml?start={{}}&searchText=&searchField=&sortField="

# Upper bound on the search-result offset to crawl (offsets are requested in steps of 20).
MAX_IMAGES = 7600

In [49]:
def chunks(dl_links, n):
    """Lazily yield consecutive slices of `dl_links`, each holding at most `n` items.

    The final slice is shorter when `len(dl_links)` is not a multiple of `n`.
    """
    starts = range(0, len(dl_links), n)
    yield from (dl_links[start : start + n] for start in starts)


def get_fruit_segment(page):
    """Extract catalog links from one search-results page into the global `fruits` list.

    Args:
        page: HTML text of a search-results page.

    Side effects:
        Appends every not-yet-seen href starting with "/pom/catalog.xhtml?id="
        to the module-level `fruits` list, preserving first-seen order.
    """
    soup = BeautifulSoup(page, "html.parser")
    for link in soup.find_all("a"):
        href = link.get("href")
        # Anchors without an href attribute yield None; skip them instead of
        # raising AttributeError and losing the remaining links on the page.
        if not href:
            continue
        if href.startswith("/pom/catalog.xhtml?id=") and href not in fruits:
            fruits.append(href)


async def collection_helper(session, url):
    """GET `url` through `session` and return the response body decoded as text."""
    async with session.get(url) as resp:
        body = await resp.text()
    return body


async def collect_fruits(page_segment):
    """Fetch the search-results page at offset `page_segment` and harvest its catalog links.

    `page_segment` is substituted into INIT_URL as the `start` value. Errors raised
    while parsing the page are printed rather than propagated.
    """
    search_url = INIT_URL.format(page_segment)
    async with aiohttp.ClientSession() as http:
        html = await collection_helper(http, search_url)
        try:
            get_fruit_segment(html)
            print(f"Collected fruit links from page segment starting at: start={page_segment}")
        except Exception as err:
            print(err)


async def download_fruit_image(image_url):
    """Download one image and save it as "<id>.jpg" in the current working directory.

    Args:
        image_url: Download URL; the text after the first "=" is used as the file name.

    Non-200 responses are reported and skipped; no file is created for them.
    """
    file_name = image_url.split("=")[1]
    async with aiohttp.ClientSession() as session:
        async with session.get(image_url) as response:
            if response.status != 200:
                print(f"Skipped image {file_name}.jpg (HTTP {response.status})")
                return
            # Read the whole body before touching the disk so a failed transfer
            # does not leave an empty or partial file behind.
            payload = await response.read()
    # `async with` guarantees the handle is closed even if the write fails.
    async with aiofiles.open("{}.jpg".format(file_name), mode="wb+") as f:
        await f.write(payload)
    print(f"Downloaded image {file_name}.jpg")
                
def download_fruit_meta(data_url, timeout=30):
    """Scrape the metadata of one catalog entry.

    Args:
        data_url: Any URL whose text after the last "=" is the entry ID.
        timeout: Seconds to wait for the server before `requests` raises a
            timeout error, so one stalled request cannot hang the whole scrape.

    Returns:
        dict mapping each lower-cased <dt> label (colons removed) to the text of
        the <dd> element at the same position (newlines removed), plus a 'pomid'
        key holding the ID. If the server answers with an error status, only
        'pomid' is returned and a warning is printed.
    """
    _id = data_url.split('=')[-1]
    url = f'{BASE_URL}/pom/catalog.xhtml?id={_id}'
    r = requests.get(url, timeout=timeout)
    if not r.ok:
        # Do not parse an error page as if it were a catalog entry.
        print(f"Could not fetch metadata for {_id} (HTTP {r.status_code})")
        return {'pomid': _id}
    soup = BeautifulSoup(r.text, 'html.parser')
    labels = [tag.text.lower().replace(':', '') for tag in soup.find_all('dt')]
    attributes = [tag.text.replace('\n', '') for tag in soup.find_all('dd')]
    # Labels and values are paired purely by position in the document.
    meta_data = dict(zip(labels, attributes))
    meta_data['pomid'] = _id
    return meta_data
    
    
#     print([tag.text for tag in soup])
                
async def download_fruit(url):
    """Fetch the metadata for the catalog entry referenced by `url`.

    Args:
        url: Any URL whose text after the last "=" is the entry ID.

    Returns:
        The metadata dict produced by `download_fruit_meta`.

    NOTE: `download_fruit_meta` performs a synchronous HTTP request, so awaiting
    this coroutine blocks the event loop for the duration of that request.
    """
    # Take the ID itself; split() alone returns a list, which previously ended up
    # (brackets and all) inside the URL.
    _id = url.split('=')[-1]
    meta = f'{BASE_URL}/pom/catalog.xhtml?id={_id}'
    return download_fruit_meta(meta)

In [50]:
# Shared accumulators: `fruits` is filled by get_fruit_segment during the crawl,
# the two link lists are derived from it afterwards.
fruits = []
dl_links = []
meta_links = []

loop = asyncio.get_event_loop()
# Search results are paged in steps of 20.
page_segments = list(range(0, MAX_IMAGES, 20))

print("\n\n---Starting link collection---\n")

# Request every results page concurrently and wait for all of them to finish.
loop.run_until_complete(asyncio.gather(*map(collect_fruits, page_segments)))

print("\n\n---Finished link collection---\n")
print(f"Found {len(fruits)} total fruits")

# Write the list 'fruits' to a file at this point if you'd like.
# Everything after the first "&" is dropped from each href; the download link
# is the catalog link with "catalog" swapped for "download".
meta_links.extend(BASE_URL + href.split("&")[0] for href in fruits)
dl_links.extend(catalog_url.replace("catalog", "download") for catalog_url in meta_links)

# Links are grouped into bursts of CHUNK_SIZE. The metadata requests below run
# synchronously, one at a time, so CHUNK_SIZE does not currently affect throughput.
CHUNK_SIZE = 100

dl_links_chunked = list(chunks(dl_links, CHUNK_SIZE))

print("\n\n---Starting image download, be patient---\n")

# Columns that should always come first in the result, even when a record lacks them.
columns = ['pomid', 'artist', 'scientific name', 'common name', 'geographic origin', 
           'physical description', 'specimen', 'year', 'notes on original', 'date created'] # Rights column is the same across entire dataset.

# TODO: also download the image for each ID; at the moment only the metadata is collected.

# Collect plain dicts and build the frame once: DataFrame.append was removed in
# pandas 2.0 and copied the entire frame on every call.
records = []
for burst in dl_links_chunked:
    # NOTE: no pause is applied between bursts; add one here to be gentler on the server.
    for args in burst:
        records.append(download_fruit_meta(args))

df = pd.DataFrame(records)
# Keep the expected columns first, followed by any additional labels in first-seen order.
extra_columns = [name for name in df.columns if name not in columns]
df = df.reindex(columns=columns + extra_columns)



---Starting link collection---

Collected fruit links from page segment starting at: start=540
Collected fruit links from page segment starting at: start=740
Collected fruit links from page segment starting at: start=380
Collected fruit links from page segment starting at: start=3560
Collected fruit links from page segment starting at: start=1720
Collected fruit links from page segment starting at: start=20
Collected fruit links from page segment starting at: start=240
Collected fruit links from page segment starting at: start=1440
Collected fruit links from page segment starting at: start=620
Collected fruit links from page segment starting at: start=1640
Collected fruit links from page segment starting at: start=4580
Collected fruit links from page segment starting at: start=1960
Collected fruit links from page segment starting at: start=2460
Collected fruit links from page segment starting at: start=4440
Collected fruit links from page segment starting at: start=3360
Collected fru

In [51]:
# Preview the collected metadata; the rendered output shows only the first and last rows.
df

Unnamed: 0,pomid,artist,scientific name,common name,geographic origin,physical description,specimen,year,notes on original,date created,rights,variety,nal note
0,POM00006406,"Passmore, Deborah Griscom, 1840-1911",Citrus sinensis,oranges,"Duarte, Los Angeles County, California, United...",1 art original : col. ; 17 x 25 cm.,19473,,,,Use of the images in the U.S. Department of Ag...,Navelencia,
1,POM00006407,"Passmore, Deborah Griscom, 1840-1911",Citrus sinensis,oranges,"Riverside, Riverside County, California, Unite...",1 art original : col. ; 17 x 25 cm.,40440,1908,,1908,Use of the images in the U.S. Department of Ag...,Navelencia,
2,POM00006463,"Newton, Amanda Almira, ca. 1860-1943",Citrus sinensis,oranges,"Honcut, Butte County, California, United States",1 art original : col. ; 17 x 25 cm.,70748,1914,,1914-03-13,Use of the images in the U.S. Department of Ag...,New,
3,POM00006465,"Newton, Amanda Almira, ca. 1860-1943",Citrus sinensis,oranges,"Honcut, Butte County, California, United States",1 art original : col. ; 17 x 25 cm.,70748a,1914,Peter Bisset,1914-03-16,Use of the images in the U.S. Department of Ag...,New,
4,POM00006446,"Schutt, Ellen Isham, 1873-1955",Citrus sinensis,oranges,,1 art original : col. ; 17 x 26 cm.,37438,1906,,1906-11-19,Use of the images in the U.S. Department of Ag...,No. 779,Watercolor includes mock up for the Yearbook o...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7579,POM00001050,"Newton, Amanda Almira, ca. 1860-1943",Malus domestica,apples,"Storm Lake, Buena Vista County, Iowa, United S...",1 art original : col. ; 16 x 25 cm.,40064,1908,,1908-04-01,Use of the images in the U.S. Department of Ag...,Anisim,
7580,POM00001051,"Passmore, Deborah Griscom, 1840-1911",Malus domestica,apples,"Rosslyn, Arlington County, Virginia, United St...",1 art original : col. ; 17 x 25 cm.,109640,1928,,1928-03-08,Use of the images in the U.S. Department of Ag...,Annette,
7581,POM00001052,"Heiges, Bertha",Malus domestica,apples,"Wilna, Harford County, Maryland, United States",1 art original : col. ; 17 x 25 cm.,33232,1905,,1905-01-25,Use of the images in the U.S. Department of Ag...,Annie Frank,
7582,POM00001053,"Arnold, Mary Daisy, ca. 1873-1955",Malus domestica,apples,"Rosslyn, Arlington County, Virginia, United St...",1 art original : col. ; 17 x 26 cm.,105989,1925,"Section F, Row 1-2, Tree 3",1925-01-21,Use of the images in the U.S. Department of Ag...,Annurco,


In [54]:
# The rights statement is the same for every record, so it is dropped before export.
# errors='ignore' keeps this cell re-runnable; `del df['rights']` raises KeyError the second time.
df = df.drop(columns=['rights'], errors='ignore')

In [56]:
# Create the output directory first; to_csv raises OSError when it does not exist.
os.makedirs('data', exist_ok=True)
df.to_csv('data/usda_pomological.csv', index=False)