### This Python 3 notebook extracts images of a Gallica document (using the IIIF protocol), and then applies object detection to the images
1. Extract the document technical image metadata from its IIIF manifest,
2. Load the IIIF images,
3. Apply a YOLO object-detection model.


In [2]:
import sys
import cv2
import os, fnmatch
from collections import namedtuple
import csv
import time

# --- Document selection -------------------------------------------------
# Gallica ARK identifier of the document to process. Keep exactly one of the
# assignments below active.
docID = '12148/bpt6k46000341' # daily newspaper
#docID = '12148/btv1b6931954n' # photo
#docID = '12148/btv1b10336854c' # album
#docID = '12148/btv1b10544068q' # print (engraving)
#docID = '12148/bpt6k65414058' # Vogue magazine

# --- IIIF export --------------------------------------------------------
# Size of the exported images, as a percentage
doc_export_factor = 10
# Only the first `doc_max` images of the document are processed
doc_max = 2

# --- Output folders -----------------------------------------------------
output = "OUT_csv"      # CSV data
output_img = "OUT_img"  # images

# --- Detection settings -------------------------------------------------
# Detections scoring below this confidence are discarded
min_confidence = 0.20
# Threshold used when applying non-maxima suppression
threshold = 0.30

# Report the interpreter version (label and version on separate lines)
print("Python version", sys.version, sep="\n")

Python version
3.8.7 (v3.8.7:6503f05dd5, Dec 21 2020, 12:45:15) 
[Clang 6.0 (clang-600.0.57)]


In [3]:
def _ensure_dir(name, label):
    """Return the real path of directory `name`, creating it if it is missing.

    name  -- directory path as configured (a relative path is resolved
             against the current working directory)
    label -- short description used in the "Creating ... directory" message
    """
    path = os.path.realpath(name)
    if not os.path.isdir(path):
        print(f"\n  Creating {label} directory {name}...")
        # makedirs (unlike mkdir) also creates missing parent directories, and
        # exist_ok avoids a failure if the directory appears after the check
        os.makedirs(path, exist_ok=True)
    return path

########## CSV output #############
output_dir = _ensure_dir(output, ".csv")
print (f"\n... CSV files will be saved to {output}")

########## Images output #############
output_img_dir = _ensure_dir(output_img, "img")
print (f"\n... images files will be saved to {output_img}\n")


  Creating .csv directory OUT_csv...

... CSV files will be saved to OUT_csv

  Creating img directory OUT_img...

... images files will be saved to OUT_img



### 1. We first build the IIIF manifest URL and call the API

In [5]:
import requests

METADATA_BASEURL = 'https://gallica.bnf.fr/iiif/ark:/'
# Seconds to wait for the server before giving up on the request
REQUEST_TIMEOUT = 30

# Manifest URL = base URL + document ARK identifier + '/manifest.json'
req_url = f"{METADATA_BASEURL}{docID}/manifest.json"
print ("... getting the IIIF manifest",req_url)
# Ask for the IIIF manifest. requests applies no timeout by default, so an
# explicit one keeps the notebook from hanging if the server never answers.
r = requests.get(req_url, timeout=REQUEST_TIMEOUT)
# Stop here with an HTTPError on a 4xx/5xx answer instead of parsing an error page
r.raise_for_status()
# The decoded JSON manifest is a dictionary; show its top-level keys
json_4img = r.json()
print (json_4img.keys())


... getting the IIIF manifest https://gallica.bnf.fr/iiif/ark:/12148/bpt6k46000341/manifest.json
dict_keys(['@id', 'label', 'attribution', 'license', 'logo', 'related', 'seeAlso', 'description', 'metadata', 'sequences', 'thumbnail', '@type', '@context'])


### 2. Now we load the image files using the IIIF Image API

In [6]:
from iiif_api import IIIF #  get the image files with the IIIF Image API (PyGallica package again)

# Get the 'sequences' entry of the manifest. It's a list.
sequences = json_4img.get('sequences')
# Fail with a clear message rather than an opaque TypeError/IndexError below
# when the manifest has no (or an empty) 'sequences' entry.
if not sequences:
    raise ValueError("IIIF manifest has no 'sequences' entry; cannot list the document images")
# Take the first element of the list. It's a dict.
# NOTE: despite its name, `canvases` holds the first *sequence*; the canvas
# data itself sits under its 'canvases' key (see the keys printed below).
canvases = sequences[0]
print (canvases.keys())

dict_keys(['canvases', 'label', '@type', '@id'])
