### Code to download images from the Sanborn Maps collection
#### This code is adapted from the official Library of Congress GitHub notebooks found [here](https://github.com/LibraryOfCongress/data-exploration/blob/master/maps/maps-downloading-querying.ipynb) and [here](https://github.com/LibraryOfCongress/data-exploration/blob/master/Accessing%20images%20for%20analysis.ipynb)


In [1]:
import requests
import time
import pandas as pd
pd.set_option('max_colwidth', 600)

import os
import pprint
import re
import json

from IPython.core.display import HTML, display

In [17]:
def getURLID(url: str, url_list: list = None) -> list:

    '''
    Collect the item URLs returned by a loc.gov search/collection query.

    The query at `url` is requested as JSON (100 results per page) and every
    following page advertised in `pagination.next` is requested in turn.
    Pagination details are printed for each page that is fetched.

    Parameters
    ----------
    url : str
        Search or collection URL, e.g. one carrying a `fa=location:<place>`
        facet. The location is only used for the progress message.
    url_list : list, optional
        List to append the item URLs to. A new list is created when omitted,
        so separate calls never share results.

    Returns
    -------
    list
        `url_list` extended with every result id that points at a
        `www.loc.gov/item` page and is not a collection or web page.

    Raises
    ------
    requests.HTTPError
        If a page request returns an HTTP error status.
    '''

    # A mutable default ([]) would be shared between calls and keep growing.
    if url_list is None:
        url_list = []

    # Image retrieval parameters
    params = {"fo": "json", "c": 100, "at": "results,pagination"}

    excluded_formats = ('collection', 'web page')
    item_prefixes = ("http://www.loc.gov/item", "https://www.loc.gov/item")

    next_url = url

    # Follow the pagination links iteratively instead of recursing per page.
    while next_url is not None:

        call = requests.get(next_url, params=params)
        call.raise_for_status()

        # Convert to json
        json_data = call.json()

        # Take the location facet only up to the next parameter/facet
        # separator, so trailing query parameters are not printed with it.
        loc_match = re.search(r'location:([^&|#]+)', next_url)

        if loc_match:
            location = loc_match.group(1).replace('+', ' ')
        else:
            location = '(not specified)'

        pagination = json_data['pagination']

        print('Current Location: ' , location.title())

        print()

        print('Current page:')
        print(pagination['current'])

        print('\nPath to request the next page:')
        print(pagination['next'])

        print('\nTotal number of results:')
        print(pagination['of'])

        print('\nTotal number of results per page:')
        print(pagination['perpage'])

        print('\nTotal number of pages:')
        print(pagination['total'])

        for result in json_data['results']:

            # `original_format` may arrive as a single string or as a list
            # of strings; normalise so the exclusion test works for both.
            formats = result.get('original_format') or []
            if isinstance(formats, str):
                formats = [formats]

            # Filter for specific content type
            if any(fmt in excluded_formats for fmt in formats):
                continue

            item = result.get('id')

            if isinstance(item, str) and item.startswith(item_prefixes):
                url_list.append(item)

        next_url = pagination['next']

    return url_list


    

In [21]:
# Sanborn maps of Rhode Island dated 1900-1999.
image_url = 'https://www.loc.gov/collections/sanborn-maps/?dates=1900/1999&fa=location:rhode+island'

# Start from an explicit empty list so re-running the cell does not accumulate results.
url_list = getURLID(image_url, [])

print()
print('Lengh of images: ', len(url_list))

# Preview the first five item URLs, one per line.
preview = url_list[:5]
print('Display URLs ', *preview, sep='\n')


Current Location:  Rhode Island

Current page:
1

Path to request the next page:
None

Total number of results:
93

Total number of results per page:
100

Total number of pages:
1

Lengh of images:  93
Display URLs 
http://www.loc.gov/item/sanborn03692_006/
http://www.loc.gov/item/sanborn08075_001/
http://www.loc.gov/item/sanborn08077_004/
http://www.loc.gov/item/sanborn08077_005/
http://www.loc.gov/item/sanborn08077_006/


In [4]:
def getImageURL(url_ids: list, file_ext: str, item_url: list = None) -> list:

    '''
    Look up the downloadable image URL of every file belonging to the items.

    Each item URL is requested as JSON. For every file listed under the
    item's resources, the last listed variant whose mimetype matches
    `file_ext` is selected.

    Parameters
    ----------
    url_ids : list
        Item URLs, as returned by `getURLID`.
    file_ext : str
        Wanted image type, with or without a leading dot ('jpg', '.jpg',
        'jpeg', 'tif', ...). 'jpg' maps to mimetype 'image/jpeg' and 'tif'
        to 'image/tiff'; any other value is used as the image subtype as-is.
    item_url : list, optional
        List to append the results to. A new list is created when omitted,
        so separate calls never share results.

    Returns
    -------
    list
        `item_url` extended with one dict per file found, holding the keys
        'image_url' and 'item_id'.

    Notes
    -----
    A 429 response stops the loop early. Any other non-200 response is
    retried once after 15 seconds, and the item is skipped if the retry
    fails. There is a 2 second pause after each processed item.
    '''

    # A mutable default ([]) would be shared between calls and keep growing.
    if item_url is None:
        item_url = []

    # Normalise the extension so that '.jpg', 'jpg' and 'JPG' all work.
    ext = file_ext.lower().lstrip('.')
    mimetype = {'jpg': 'jpeg', 'tif': 'tiff'}.get(ext, ext) or 'jpeg'
    wanted_mimetype = 'image/' + mimetype

    params = {"fo": "json"}

    for item in url_ids:

        call = requests.get(item, params=params)

        # Check for URL status
        if call.status_code == 200:
            json_data = call.json()

        elif call.status_code == 429:
            print('Too many requests to API. Stopping early.')
            break

        else:
            # Give the API a pause, then try this item once more.
            try:
                time.sleep(15)
                call = requests.get(item, params=params)
                json_data = call.json()
            except (requests.RequestException, ValueError):
                # ValueError covers a response body that is not valid JSON.
                print('Skipping: ' + item)
                continue

        resources = json_data['resources']

        for r_idx, resource in enumerate(resources):

            resource_url = json_data['item']['resources'][r_idx]['url']

            # Each entry of `files` is iterated as the list of variants of
            # one image; variants are expected to be dicts carrying
            # 'mimetype' and 'url'.
            for img_idx, variants in enumerate(resource['files']):

                matching_urls = [
                    variant.get('url')
                    for variant in variants
                    if isinstance(variant, dict)
                    and variant.get('mimetype') == wanted_mimetype
                    and variant.get('url')
                ]

                if matching_urls:
                    # Keep the last listed variant of the wanted type.
                    item_url.append({
                        'image_url': matching_urls[-1],
                        'item_id': item,
                    })
                else:
                    print('Note: No ' + mimetype +
                          ' files found in ' +
                          resource_url + '?sp=' + str(img_idx + 1))

        # Pause between items to avoid hammering the API.
        time.sleep(2)

    print('\nFound ' + str(len(url_ids)) + ' items')
    print('Found ' + str(len(item_url)) + ' files to download')
    return item_url
        
        
        
        
    

In [23]:
# Resolve image URLs for the first ten items only, to keep the number of API calls small.
first_items = url_list[:10]
item_url = getImageURL(first_items, '.jpg', [])


Found 10 items
Found 217 files to download


In [14]:
def _image_file_name(img_link: str) -> str:
    '''Derive the local file name for an image URL.'''

    if 'image-services/iiif' in img_link:

        # IIIF links carry a 'service:...' path segment; the text after its
        # last ':' is used as the file name stem.
        service_parts = [part for part in img_link.split('/')
                         if part.startswith('service:')]

        if service_parts:
            stem = service_parts[0].split(':')[-1]
            file_ext = img_link.split('.')[-1]
            return stem + '.' + file_ext

    # Plain links (and IIIF links without a service segment) use the last
    # path segment as the file name.
    return img_link.split('/')[-1]


def downloadImages(img_url: list , input_path: str) -> None:

    '''
    Download every image in `img_url` into the folder `input_path`.

    Parameters
    ----------
    img_url : list
        Dicts with the keys 'image_url' and 'item_id', as returned by
        `getImageURL`.
    input_path : str
        Folder the images are saved in. It is created if it does not exist.

    Returns
    -------
    None

    Notes
    -----
    A failed request (connection error or HTTP error status) is printed and
    the image is skipped, so the remaining downloads still run.
    '''

    # Change list to dataframe
    img_url_df = pd.DataFrame(img_url)

    print(img_url_df.columns)

    # Check once, up front, that the save folder exists; if not create it
    if not os.path.isdir(input_path):
        print('Directory not found!, creating input directory..')
        os.makedirs(input_path, exist_ok=True)

    # An empty input produces a frame without an 'image_url' column.
    if 'image_url' in img_url_df.columns:
        img_links = img_url_df['image_url'].tolist()
    else:
        img_links = []

    for img_link in img_links:

        save_path = os.path.join(input_path, _image_file_name(img_link))
        print('Saving as: ' + save_path)

        try:
            # Request the image and write it to path. The context manager
            # releases the connection even if writing fails.
            with requests.get(img_link, stream=True) as image_response:

                # Do not save an HTTP error page as if it were an image.
                image_response.raise_for_status()

                with open(save_path, 'wb') as fd:
                    for chunk in image_response.iter_content(chunk_size=100000):
                        fd.write(chunk)

        # requests' errors do not derive from the builtin ConnectionError,
        # so catch the library's own base exception.
        except requests.exceptions.RequestException as e:
            print(e)
    
    
    

In [None]:
# Save into an 'input' folder under the current working directory.
# os.path.join picks the right separator for the platform; the previous
# literal '\input' was Windows-only and contained an invalid '\i' escape.
input_path = os.path.join(os.getcwd(), 'input')

downloadImages(item_url , input_path)