# Collection Building Assignment: Extract Collection Data (Step 1)
## Extract Collection Data
### 1. Get collection list

In [1]:
import requests
import json
import csv

url = 'https://www.loc.gov/free-to-use/japanese-prints/'

In [2]:
# download json representation of the set
parameters = {'fo': 'json'}

def download_json(url):
    r = requests.get(url, params=parameters)
    print(r.status_code, r.url)

    with open('json-data.json', 'w', encoding='utf-8') as f:
        string_as_json = json.loads(r.text)
        f.write(json.dumps(string_as_json, indent=2))
        print('wrote',f.name)

download_json(url)

200 https://www.loc.gov/free-to-use/japanese-prints/?fo=json
wrote json-data.json


In [3]:
# examine json representation of the set
with open('json-data.json', 'r') as f:
    data = json.load(f)
    items = data['content']['set']['items']

    print('there are', len(items), 'items in the set')

    # save in list for csv
    fields = ['image', 'link', 'title']
    with open('ftu-japanese-prints-collection-list.csv', 'w', newline='', encoding='utf-8') as csv_f:
        writer = csv.DictWriter(csv_f, fieldnames=fields)
        writer.writeheader()
        writer.writerows(items)
        print('wrote', csv_f.name)

there are 29 items in the set
wrote ftu-japanese-prints-collection-list.csv


### 2. Harvest the metadata for each item in the collection

In [4]:
def get_metadata(items):
    
    base_url = 'https://www.loc.gov'
    counter = 0
    # request and download full item metadata for each item in the set
    for item in items:

        # get rest of metadata
        resource = item['link']

        # get resource ID
        resourceID = resource.split('/')[2]
        resourceID = resourceID.replace('.', '-')
    
        try:
            r = requests.get(base_url + resource, params={'fo': 'json'})
            print(counter, r.status_code, r.url)
            counter += 1

            string_as_json = json.loads(r.text)

            
            # download and save image (grab highest quality)
            image_url = string_as_json['item']['image_url'][len(string_as_json['item']['image_url']) - 1]
            image = requests.get(image_url).content
            image_file = open('loc-item-files/' + resourceID + '.jpg', 'wb')
            image_file.write(image)
            image_file.close()

            # write rest of metadata to json file
            file = open('loc-item-metadata/' + resourceID + '.json', 'w')
            file.write(r.text)
            file.close()
        except:
            print('could not get item', counter)

get_metadata(items)

could not get item 1
1 200 https://www.loc.gov/resource/jpd.00154/?fo=json
2 200 https://www.loc.gov/resource/jpd.02457/?fo=json
3 200 https://www.loc.gov/resource/cph.3g10434/?fo=json
4 200 https://www.loc.gov/resource/jpd.02608/?fo=json
5 200 https://www.loc.gov/resource/cph.3g10372/?fo=json
6 200 https://www.loc.gov/resource/cph.3g10521/?fo=json
7 200 https://www.loc.gov/resource/cph.3g08479/?fo=json
8 200 https://www.loc.gov/resource/cph.3g10539/?fo=json
9 200 https://www.loc.gov/resource/jpd.00046/?fo=json
10 200 https://www.loc.gov/resource/jpd.01307/?fo=json
11 200 https://www.loc.gov/resource/jpd.00507/?fo=json
12 200 https://www.loc.gov/resource/jpd.01317/?fo=json
13 200 https://www.loc.gov/resource/jpd.01581/?fo=json
14 200 https://www.loc.gov/resource/jpd.00181/?fo=json
15 200 https://www.loc.gov/resource/jpd.00139/?fo=json
16 200 https://www.loc.gov/resource/jpd.01271/?fo=json
17 200 https://www.loc.gov/resource/jpd.02467/?fo=json
18 200 https://www.loc.gov/resource/jpd.018