In [1]:
import requests as rq
from PIL import Image
from io import BytesIO
import pandas as pd

In [2]:
# Load the Cushman metadata export into a DataFrame
CUSHMAN_CSV = "cushman_encoded.csv"
cushman_df = pd.read_csv(CUSHMAN_CSV)
# pandas may emit a warning here because it has trouble inferring a single
# type for some of the columns. That is not a failure; column types are
# examined in the next section.
# Alternative way of silencing the warning: pass dtype="object" to read_csv
# so that every column is read as a string.
# List the available column names
print(cushman_df.columns)

Index(['PURL', 'IU Archives Number', 'Cushman number', 'Roll ID',
       'Roll number', 'Year', 'Frame number', 'Start Date', 'End Date',
       'Archive Date', 'Month Unknown', 'Day Unknown', 'Camera settings',
       'Asterisk', 'Description from Notebook', 'Description from Slide Mount',
       'Image Note', 'No Slide Exists', 'Slide Condition', 'ID',
       'Street Address', 'Street Address 2', 'Street', 'Street 2', 'Street 3',
       'Street 4', 'Street 5', 'Neighborhood', 'City', 'City 2', 'City 3',
       'City 4', 'County', 'County 2', 'County 3', 'County 4',
       'State/Province', 'State/Province 2', 'State/Province 3',
       'State/Province 4', 'Country', 'Country 2', 'Country 3', 'Country 4',
       'Personal Names 1', 'Personal Names 2', 'Personal Names 3',
       'Personal Names 4', 'Personal Names 5', 'Personal Names 6',
       'Personal Names 7', 'Personal Names 8', 'Corporate Names 1',
       'Corporate Names 2', 'Corporate Names 3', 'Corporate Names 4',
       'Corp

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# The identifier we need lives in 'IU Archives Number'; show its first five values
print(cushman_df['IU Archives Number'].head(5))

0    1411.0
1    1412.0
2    1413.0
3    1414.0
4    1415.0
Name: IU Archives Number, dtype: float64


In [4]:
#It shouldn't be a float
#Let's try to convert it to int
try:
    cushman_df['IU Archives Number'] = cushman_df['IU Archives Number'].astype('int')
except ValueError as err:
    # Expected when the column contains missing values: NaN has no integer
    # representation, so pandas refuses the cast. We catch and display the
    # error (instead of letting it propagate) so that "Restart & Run All"
    # can continue past this demonstration cell.
    print(f"ValueError: {err}")

ValueError: Cannot convert NA to integer

In [5]:
# Some rows have no archives number - keep only the rows that do.
# .copy() gives us an independent frame we can modify safely later on.
subset = cushman_df.dropna(subset=['IU Archives Number']).copy()

In [6]:
# Now that the NA rows are gone we can convert the column to int.
# The dtype is printed before and after the conversion to confirm the change.
archive_numbers = subset['IU Archives Number']
print(archive_numbers.dtype)
subset['IU Archives Number'] = archive_numbers.astype('int')
print(subset['IU Archives Number'].dtype)

float64
int64


In [7]:
#Now we're going to get the images
base = 'http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P0'
# NOTE(review): the hard-coded 'P0' prefix is simply concatenated with the
# number; confirm how numbers with more than four digits should be formatted.
#We loop through the first 10 rows *by position* (.iloc). Rows with a missing
#number were dropped from `subset`, so its index labels can have gaps and a
#label lookup such as series[i] could raise a KeyError.
for number in subset['IU Archives Number'].iloc[:10]:
    # Translate the record ID number into a URL
    url = base + str(number) + '.jpg'
    print(url)
    # we use the requests library to get the image in bytes
    # (the timeout keeps the notebook from hanging on a stalled connection)
    response = rq.get(url, timeout=30)
    # fail loudly on HTTP errors (404, 500, ...) instead of passing an error
    # page to the image library
    response.raise_for_status()
    # we use the image library to translate the bytes to an image
    with Image.open(BytesIO(response.content)) as image:
        # and we save the image under its record number
        image.save(str(number) + '.jpg')
    print(number)

http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01411.jpg
1411
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01412.jpg
1412
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01413.jpg
1413
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01414.jpg
1414
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01415.jpg
1415
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01416.jpg
1416
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01417.jpg
1417
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01418.jpg
1418
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01419.jpg
1419
http://purl.dlib.indiana.edu/iudl/archives/cushman/full/P01420.jpg
1420
