# Collection Building Assignment: Extract Collection Data (Step 1)
## Extract Collection Data
### 1. Get collection list

In [60]:
import csv
import json
import os

import requests

# address of the set to harvest; passed to download_json() below
url = 'https://www.loc.gov/free-to-use/libraries/'

In [26]:
# download json representation of the set
# query-string parameters sent with the request (the URL becomes `...?fo=json`);
# download_json() below reads this module-level name
parameters = {'fo': 'json'}

def download_json(url, params=None, filename='json-data.json', timeout=30):
    """Fetch ``url`` and save the JSON response, pretty-printed, to ``filename``.

    Parameters
    ----------
    url : str
        Address to request.
    params : dict, optional
        Query-string parameters. When omitted, the module-level ``parameters``
        dict is used, which matches the original behaviour.
    filename : str, optional
        Path of the file to write (UTF-8, 2-space indent).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the response body is not valid JSON.
    """
    query = parameters if params is None else params
    r = requests.get(url, params=query, timeout=timeout)
    print(r.status_code, r.url)
    r.raise_for_status()

    # Parse *before* opening the output file: opening with 'w' truncates it,
    # so a bad response must not be allowed to wipe a previous good download.
    payload = r.json()

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2)
        print('wrote', f.name)

# fetch the set listing for `url` and save it as json-data.json
download_json(url)

200 https://www.loc.gov/free-to-use/libraries/?fo=json
wrote json-data.json


In [27]:
# examine json representation of the set
# read with the same encoding the file was written with, and release the
# handle as soon as the document is parsed
with open('json-data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# `items` stays a module-level name: the harvesting step further down uses it
items = data['content']['set']['items']
print('there are', len(items), 'items in the set')

# save in list for csv
# DictWriter raises ValueError if an item carries a key that is not in `fields`
fields = ['image', 'link', 'title']
with open('ftu-libraries-set-list.csv', 'w', newline='', encoding='utf-8') as csv_f:
    writer = csv.DictWriter(csv_f, fieldnames=fields)
    writer.writeheader()
    writer.writerows(items)
    print('wrote', csv_f.name)

there are 62 items in the set
wrote ftu-libraries-set-list.csv


### 2. Harvest the metadata for each item in the collection

In [None]:
def get_metadata(items, timeout=30):
    """Download the full metadata record and one image for every item in the set.

    For each entry of ``items`` the function requests
    ``https://www.loc.gov`` + ``item['link']`` with ``fo=json`` and writes:

    * ``item-files/<control number>.jpg`` -- the image behind the last URL in
      the record's ``image_url`` list;
    * ``item-metadata/<control number>.json`` -- the response body as received.

    ``<control number>`` is the record's ``library_of_congress_control_number``
    when present, otherwise its ``control_number``.

    The harvest is best-effort: a failure on one item is reported together with
    its cause and the loop moves on to the next item. Both output directories
    are created if they do not exist yet.

    Parameters
    ----------
    items : iterable of dict
        Entries of the set listing; each needs a ``'link'`` key holding a path
        relative to ``https://www.loc.gov``.
    timeout : float, optional
        Seconds to wait for each HTTP request before treating it as failed.

    Returns
    -------
    None
    """
    base_url = 'https://www.loc.gov'
    image_dir = 'item-files/'
    metadata_dir = 'item-metadata/'

    # Without these a missing directory would make every single item fail.
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(metadata_dir, exist_ok=True)

    # request and download full item metadata for each item in the set
    # enumerate() keeps the printed number tied to the item's position, even
    # when an item fails part-way through.
    for counter, item in enumerate(items, start=1):
        try:
            # get rest of metadata
            resource = item['link']
            r = requests.get(base_url + resource, params={'fo': 'json'}, timeout=timeout)
            print(counter, r.status_code, r.url)
            r.raise_for_status()

            record = r.json()['item']

            # get lccn number
            try:
                item_lccn = record['library_of_congress_control_number']
            except KeyError:
                item_lccn = record['control_number']

            # download and save image; the last entry of `image_url` is taken,
            # which the original author treats as the highest-quality one
            image_url = record['image_url'][-1]
            image_response = requests.get(image_url, timeout=timeout)
            # do not store an HTTP error page under a .jpg name
            image_response.raise_for_status()
            with open(image_dir + item_lccn + '.jpg', 'wb') as image_file:
                image_file.write(image_response.content)

            # write rest of metadata to json file
            with open(metadata_dir + item_lccn + '.json', 'w', encoding='utf-8') as metadata_file:
                metadata_file.write(r.text)
        except Exception as err:
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt
            # through, and the cause is shown instead of being discarded.
            print('could not get item', counter, '-', repr(err))

# harvest every entry in `items` (the set listing loaded from json-data.json)
get_metadata(items)

1 200 https://www.loc.gov/resource/cph.3f05183/?fo=json
https://tile.loc.gov/storage-services/service/pnp/cph/3f00000/3f05000/3f05100/3f05183v.jpg#h=1024&w=705
2 200 https://www.loc.gov/resource/highsm.20336/?fo=json
https://tile.loc.gov/image-services/iiif/service:pnp:highsm:20300:20336/full/pct:25/0/default.jpg#h=829&w=1540
3 200 https://www.loc.gov/resource/fsa.8d24709/?fo=json
https://tile.loc.gov/storage-services/service/pnp/fsa/8d24000/8d24700/8d24709v.jpg#h=1024&w=976
4 200 https://www.loc.gov/resource/highsm.36052/?fo=json
https://tile.loc.gov/image-services/iiif/service:pnp:highsm:36000:36052/full/pct:25/0/default.jpg#h=1448&w=2172
5 200 https://www.loc.gov/resource/highsm.51772/?fo=json
https://tile.loc.gov/storage-services/service/pnp/highsm/51700/51772v.jpg#h=570&w=1024
6 200 https://www.loc.gov/resource/cph.3b43255/?fo=json
https://tile.loc.gov/storage-services/service/pnp/cph/3b40000/3b43000/3b43200/3b43255r.jpg#h=422&w=640
7 200 https://www.loc.gov/resource/highsm.20483/