# Collection Building Assignment: Extract Collection Data (Step 1)
## Extract Collection Data
### 1. Get collection list

In [14]:
import requests
import json
import csv

url = 'https://www.loc.gov/free-to-use/dragons/'

In [15]:
# download json representation of the set
parameters = {'fo': 'json'}

def download_json(url):
    r = requests.get(url, params=parameters)
    print(r.status_code, r.url)

    with open('json-data.json', 'w', encoding='utf-8') as f:
        string_as_json = json.loads(r.text)
        f.write(json.dumps(string_as_json, indent=2))
        print('wrote',f.name)

download_json(url)

200 https://www.loc.gov/free-to-use/dragons/?fo=json
wrote json-data.json


In [16]:
# examine json representation of the set
with open('json-data.json', 'r') as f:
    data = json.load(f)
    items = data['content']['set']['items']

    print('there are', len(items), 'items in the set')

    # save in list for csv
    fields = ['alt', 'image', 'link', 'title']
    with open('ftu-dragons-collection-list.csv', 'w', newline='', encoding='utf-8') as csv_f:
        writer = csv.DictWriter(csv_f, fieldnames=fields)
        writer.writeheader()
        writer.writerows(items)
        print('wrote', csv_f.name)

there are 44 items in the set
wrote ftu-dragons-collection-list.csv


### 2. Harvest the metadata for each item in the collection

In [17]:
def get_metadata(items):
    
    base_url = 'https://www.loc.gov'
    counter = 1
    # request and download full item metadata for each item in the set
    for item in items:

        # get rest of metadata
        resource = item['link']

        # get resource ID
        resourceID = resource.split('/')[2]
        resourceID = resourceID.replace('.', '-')
    
        try:
            r = requests.get(base_url + resource, params={'fo': 'json'})
            print(counter, r.status_code, r.url)
            counter += 1

            string_as_json = json.loads(r.text)

            
            # download and save image (grab highest quality)
            image_url = string_as_json['item']['image_url'][len(string_as_json['item']['image_url']) - 1]
            image = requests.get(image_url).content
            image_file = open('item-files/' + resourceID + '.jpg', 'wb')
            image_file.write(image)
            image_file.close()

            # write rest of metadata to json file
            file = open('item-metadata/' + resourceID + '.json', 'w')
            file.write(r.text)
            file.close()
        except:
            print('could not get item', counter)

get_metadata(items)

1 200 https://www.loc.gov/resource/rbc0001.2011rosen0007/?sp=269&fo=json
2 404 https://www.loc.gov/resource/highsm.52549/?fo=json
could not get item 3
3 200 https://www.loc.gov/resource/music.musihas-200215965/?fo=json
4 404 https://www.loc.gov/resource/jpd.01469/?fo=json
could not get item 5
5 404 https://www.loc.gov/resource/ppmsca.42001/?fo=json
could not get item 6
6 404 https://www.loc.gov/resource/highsm.37303/?fo=json
could not get item 7
7 404 https://www.loc.gov/resource/cph.3b16717/?fo=json
could not get item 8
8 404 https://www.loc.gov/resource/ppmsca.30768/?fo=json
could not get item 9
9 404 https://www.loc.gov/resource/ggbain.16338/?fo=json
could not get item 10
10 404 https://www.loc.gov/resource/mrg.02997/?fo=json
could not get item 11
11 404 https://www.loc.gov/resource/ppmsca.06618/?fo=json
could not get item 12
12 404 https://www.loc.gov/resource/jpd.01939/?fo=json
could not get item 13
13 404 https://www.loc.gov/resource/highsm.47472/?fo=json
could not get item 14
14