In [None]:
import requests
import os
from tqdm import tqdm
import time
import random
import json
from pathlib import Path
import sys

In [6]:
# Make the project's `src` directory importable: it is looked up as the
# `src` folder inside the grandparent of the current working directory.
src = str(Path.cwd().resolve().parents[1]/ 'src')
sys.path.append(src)
# Project path constants (must come after the sys.path tweak above).
from config.paths import ROOT, DATA, EXPERIMENTS


In [7]:
# Default destination for downloaded documents: <DATA>/raw_collected/docs
data_folder = os.path.join(DATA, 'raw_collected', 'docs')

# Fonctions générales

In [None]:

def get_manifest(manifest_url, timeout=60):
    """Download a manifest and return it as parsed JSON.

    Args:
        manifest_url: URL of the manifest to fetch.
        timeout: Seconds to wait for the server before giving up
            (forwarded to ``requests.get``).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with any
            status code other than 200.
    """
    result = requests.get(manifest_url, timeout=timeout)
    if result.status_code != 200:
        # Raise rather than call exit(): exiting would end the whole
        # session and hide which URL failed and why.
        raise requests.exceptions.HTTPError(
            f"Erreur: HTTP {result.status_code} for {manifest_url}",
            response=result,
        )

    return result.json()

def get_list_images(manifest):  # BNF and INHA
    """Collect the image URLs listed in a manifest's first sequence.

    The manifest is read as ``sequences[0] -> canvases -> images ->
    resource -> '@id'``. Images whose resource has no ``'@id'`` are skipped.

    Args:
        manifest: Parsed manifest (a dict).

    Returns:
        List of image URLs in manifest order; an empty list when the
        manifest has no (or an empty) ``'sequences'`` entry.
    """
    sequences = manifest.get('sequences') or []
    if not sequences:
        # Nothing to index into: report "no images" instead of raising IndexError
        return []

    image_urls = []
    for canvas in sequences[0].get('canvases', []):
        for image in canvas.get('images', []):
            resource = image.get('resource', {})
            if resource.get('@id'):
                image_urls.append(resource['@id'])
    return image_urls



def save_images(image_urls, doc_id, data_folder, timer):
    """Download every image of a document into ``data_folder/doc_id``.

    Files are named ``{doc_id}_{page}.jpg`` where ``page`` is the 1-based
    position of the URL in ``image_urls``, zero-padded to the number of
    digits of ``len(image_urls)``. A failed download (network error or HTTP
    error status) is printed and skipped; it does not stop the loop.

    Args:
        image_urls: Ordered list of image URLs.
        doc_id: Document identifier, used as folder name and file prefix.
        data_folder: Directory in which the document folder is created.
        timer: Lower bound, in seconds, of the pause between two requests;
            the pause is drawn uniformly from ``[timer, timer + 5]``.
    """
    total = len(image_urls)
    num_digits = len(str(total))  # 3 for 100 images, 4 for 1000, etc.
    print(doc_id, data_folder)
    target_dir = os.path.join(data_folder, doc_id)
    os.makedirs(target_dir, exist_ok=True)

    for index, url in enumerate(tqdm(image_urls)):
        page = index + 1
        try:
            response = requests.get(url)
            # Reject 4xx/5xx answers so an error page is never saved as a .jpg
            response.raise_for_status()
            filename = f"{doc_id}_{page:0{num_digits}}.jpg"
            output_file = os.path.join(target_dir, filename)
            with open(output_file, "wb") as img_file:
                img_file.write(response.content)
            print(f"Page {page}/{total}")
        except Exception as e:
            print(f"Error downloading image {page}: {e}")

        # Pause after every attempt, successful or not, so a server that
        # starts refusing requests is not hit faster than a healthy one.
        time.sleep(random.uniform(timer, timer+5))


def main(manifest_url, doc_id, data_folder, timer) :
    """Fetch a manifest, list its images and download them all.

    Chains ``get_manifest`` -> ``get_list_images`` -> ``save_images``.

    Args:
        manifest_url: URL of the manifest to download.
        doc_id: Document identifier passed on to ``save_images``.
        data_folder: Destination root passed on to ``save_images``.
        timer: Pause setting passed on to ``save_images``.
    """
    save_images(
        get_list_images(get_manifest(manifest_url)),
        doc_id,
        data_folder,
        timer,
    )


In [None]:
def save_img(url_img, output_dir, object_id):
    """Download one image into ``output_dir/img`` and return its path.

    The file is named ``{folder}_{object_id}.jpg`` where ``folder`` is
    ``os.path.basename(output_dir)`` (an empty string if ``output_dir``
    ends with a path separator).

    Args:
        url_img: URL of the image to download.
        output_dir: Directory that receives the ``img`` sub-folder.
        object_id: Identifier appended to the file name.

    Returns:
        Path of the file that was written.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status; nothing is written in that case.
    """
    response = requests.get(url_img)
    # Fail loudly instead of saving an error page under a .jpg name
    response.raise_for_status()

    folder_name = os.path.basename(output_dir)
    img_dir_path = os.path.join(output_dir, 'img')
    os.makedirs(img_dir_path, exist_ok=True)

    output_file = os.path.join(img_dir_path, f"{folder_name}_{object_id}.jpg")
    with open(output_file, 'wb') as f:
        f.write(response.content)
    return output_file

In [None]:
# Used when a download was interrupted
def save_images_resume(image_urls, doc_id, data_folder, timer, resume_id):
    """Download the remaining images of a document after an interruption.

    Same naming scheme as a full download: ``{doc_id}_{page}.jpg`` with
    ``page`` the 1-based position in ``image_urls``, zero-padded to the
    number of digits of ``len(image_urls)``. Only entries whose zero-based
    index is strictly greater than ``resume_id`` are fetched, i.e. the first
    file written is page ``resume_id + 2``. A failed download (network error
    or HTTP error status) is printed and skipped.

    NOTE(review): if callers pass the last page number that was printed
    rather than a zero-based index, page ``resume_id + 1`` is never
    downloaded -- confirm the intended meaning of ``resume_id``.

    Args:
        image_urls: Full ordered list of image URLs of the document.
        doc_id: Document identifier, used as folder name and file prefix.
        data_folder: Directory in which the document folder is created.
        timer: Pause in seconds after each download attempt.
        resume_id: Zero-based index up to which entries are skipped (inclusive).
    """
    total = len(image_urls)
    num_digits = len(str(total))  # 3 for 100 images, 4 for 1000, etc.
    print(doc_id, data_folder)
    target_dir = os.path.join(data_folder, doc_id)
    os.makedirs(target_dir, exist_ok=True)

    for index, url in enumerate(tqdm(image_urls)):
        if index <= resume_id:
            # At or before the resume point: no request, no pause
            continue

        page = index + 1
        try:
            response = requests.get(url)
            # Reject 4xx/5xx answers so an error page is never saved as a .jpg
            response.raise_for_status()
            filename = f"{doc_id}_{page:0{num_digits}}.jpg"
            output_file = os.path.join(target_dir, filename)
            with open(output_file, "wb") as img_file:
                img_file.write(response.content)
            print(f"Page {page}/{total}")
        except Exception as e:
            print(f"Error downloading image {page}: {e}")

        # Pause after every attempt, successful or not, to keep the request rate steady
        time.sleep(timer)


def main_resume(manifest_url, doc_id, data_folder, timer, resume_id) :
    """Fetch a manifest and resume downloading its images.

    Chains ``get_manifest`` -> ``get_list_images`` -> ``save_images_resume``.

    Args:
        manifest_url: URL of the manifest to download.
        doc_id: Document identifier passed on to ``save_images_resume``.
        data_folder: Destination root passed on to ``save_images_resume``.
        timer: Pause setting passed on to ``save_images_resume``.
        resume_id: Resume index passed on to ``save_images_resume``.
    """
    save_images_resume(
        get_list_images(get_manifest(manifest_url)),
        doc_id,
        data_folder,
        timer,
        resume_id=resume_id,
    )

In [None]:
# #BNF
# manifest_url = 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k96921155/manifest.json'
# timer = 15
# doc_id = 'Jacquemart_1866'
# main(manifest_url, doc_id, data_folder, timer)

In [None]:
# #BNF
# manifest_url = 'https://gallica.bnf.fr/iiif//ark:/12148/bpt6k6539348w/manifest.json'
# doc_id = 'Ziegler_1850'
# timer = 15
# main(manifest_url, doc_id, data_folder, timer)

### INHA

In [None]:
# manifest_url = 'https://bibliotheque-numerique.inha.fr/iiif/4994/manifest'	
# doc_id = 'Blanc_1882'

# timer = 10
# main(manifest_url, doc_id, data_folder=data_folder, timer=10)

# Internet Archive

In [None]:
def get_list_images_IA(manifest):
    """List one image URL per top-level item of a manifest.

    Each entry of ``manifest['items']`` is read as
    ``items[0] -> items[0] -> body -> id``. The result always has exactly one
    entry per top-level item, so positions stay aligned with the manifest;
    the entry is ``None`` when any level of that path is missing or empty.

    Args:
        manifest: Parsed manifest (a dict).

    Returns:
        List of image URLs (or ``None`` placeholders); an empty list when
        the manifest has no ``'items'``.
    """
    list_img_url = []
    for canvas in manifest.get('items') or []:
        # Substitute an empty mapping for a missing level so the lookup
        # falls through to None instead of raising IndexError.
        pages = canvas.get('items') or [{}]
        annotations = pages[0].get('items') or [{}]
        list_img_url.append(annotations[0].get('body', {}).get('id'))

    return list_img_url

In [None]:
# Resume the 'Soldi_1881' download, skipping list indices 0..225.
# NOTE(review): `liste` is not assigned in any cell above this one, so this
# cell fails on a fresh kernel; it relies on a value left over from a cell
# further down -- confirm which manifest it should be built from.
timer=10
save_images_resume(liste, 'Soldi_1881', data_folder, timer, resume_id = 225)

In [None]:
# Internet Archive: full download of 'Drury-Fortnum_1873' from its manifest.
# `timer` is not set in this cell; it uses whatever value an earlier cell left in the kernel.
manifest_url = 'https://iiif.archive.org/iiif/3/descriptivecatal00fort/manifest.json'
doc_id = 'Drury-Fortnum_1873'
manifest=get_manifest(manifest_url)
liste = get_list_images_IA(manifest)
save_images(liste, doc_id, data_folder, timer)

In [None]:
# #IA
# manifest_url = 'https://iiif.archive.org/iiif/3/gri_33125015600774/manifest.json'
# doc_id = 'DupontAuberville_1870'
# manifest=get_manifest(manifest_url)
# liste = get_list_images_IA(manifest)


# timer=10
# data_folder = '/Users/UserE/data/HN_Memoire/corpus_data/docs'
# save_images_resume(liste, doc_id, data_folder, timer, resume_id = 445)




In [None]:
#IA
# manifest_url = 'https://iiif.archive.org/iiif/3/albumofnineteent00unse/manifest.json'
# doc_id = 'Dresser_1862'
# manifest=get_manifest(manifest_url)
# liste = get_list_images_IA(manifest)
# save_images(liste, doc_id, data_folder, timer)

In [None]:
# Internet Archive: resume 'Redgrave_1876', skipping list indices 0..28
# (the last positional argument is resume_id).
# `timer` is not set in this cell; it uses whatever value an earlier cell left in the kernel.
manifest_url = 'https://iiif.archive.org/iiif/3/S0001966/manifest.json'
doc_id = 'Redgrave_1876'
manifest=get_manifest(manifest_url)
liste = get_list_images_IA(manifest)
save_images_resume(liste, doc_id, data_folder, timer, 28)

In [None]:
# Internet Archive: resume 'Murphy-Horne_1815', skipping list indices 0..152.
# `timer` is not set in this cell; it uses whatever value an earlier cell left in the kernel.
manifest_url = 'https://iiif.archive.org/iiif/3/gri_33125008545499/manifest.json'
doc_id = 'Murphy-Horne_1815'
manifest=get_manifest(manifest_url)
liste = get_list_images_IA(manifest)
save_images_resume(liste, doc_id, data_folder, timer, resume_id = 152)


# MAD

In [None]:
def get_list_images_mad(url, doc_id, nb_page, num_digits) :
    """Build the image URLs of a document page by page.

    Page numbers run from 1 to ``nb_page`` and are zero-padded to
    ``num_digits`` characters. Each URL has the form
    ``{url}/{DOC_ID_WITHOUT_LAST_CHAR_UPPERCASED}/LEC/{doc_id}{page}_l.jpg``.

    Args:
        url: Base URL, without trailing slash.
        doc_id: Identifier used in the file name; its upper-cased form minus
            the last character is used as the folder name.
        nb_page: Number of pages to generate.
        num_digits: Width of the zero-padded page number.

    Returns:
        List of ``nb_page`` URLs in page order.
    """
    # Alternative URL layout kept for reference:
    #   f"{url}{doc_id}/{doc_id}_{page_id}_D.jpg"
    padded_pages = (format(page, f"0{num_digits}") for page in range(1, nb_page + 1))
    return [
        f"{url}/{doc_id[:-1].upper()}/LEC/{doc_id}{page_id}_l.jpg"
        for page_id in padded_pages
    ]

In [None]:
def save_images_mad(image_urls, doc_id, data_folder, timer):
    """Download every image of a document, sending a browser User-Agent.

    Files are written to ``data_folder/doc_id`` as ``{doc_id}_{page}.jpg``
    where ``page`` is the 1-based position in ``image_urls``, zero-padded to
    the number of digits of ``len(image_urls)``. A failed download (network
    error or HTTP error status) is printed and skipped.

    Args:
        image_urls: Ordered list of image URLs.
        doc_id: Document identifier, used as folder name and file prefix.
        data_folder: Directory in which the document folder is created.
        timer: Pause in seconds after each download attempt.
    """
    headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
        }

    total = len(image_urls)
    num_digits = len(str(total))  # 3 for 100 images, 4 for 1000, etc.
    print(doc_id, data_folder)
    target_dir = os.path.join(data_folder, doc_id)
    os.makedirs(target_dir, exist_ok=True)
    for index, url in enumerate(tqdm(image_urls)):
        page = index + 1
        try:
            response = requests.get(url, headers=headers)
            # Reject 4xx/5xx answers so an error page is never saved as a .jpg
            response.raise_for_status()
            filename = f"{doc_id}_{page:0{num_digits}}.jpg"
            output_file = os.path.join(target_dir, filename)
            with open(output_file, "wb") as img_file:
                img_file.write(response.content)
            print(f"Page {page}/{total}")
        except Exception as e:
            print(f"Error downloading image {page}: {e}")

        # Pause after every attempt, successful or not, to keep the request rate steady
        time.sleep(timer)


In [None]:
# MAD: build the 71 page URLs (4-digit page numbers) and download them as 'Maciet_232-1'.
list_images = get_list_images_mad('https://img01.decalog.net/decalog_images/numerisation_maciet', 'm5053ma_232x01x', 71, 4)
doc_id = 'Maciet_232-1'
timer = 15
# NOTE(review): absolute, machine-specific path; it replaces any `data_folder`
# value set earlier in the kernel for all cells run afterwards.
data_folder = '/Users/UserE/data/HN_Memoire/corpus_data/docs'
save_images_mad(list_images, doc_id, data_folder, timer)

In [None]:
# Download `list_images` into the 'Racinet_1885' folder.
# NOTE(review): `list_images` is not rebuilt in this cell; the only visible
# assignment generates the 'm5053ma_232x01x' (Maciet) URLs, so running the
# cells in order saves those images under the Racinet name -- confirm the
# URL list intended for this document.
doc_id = 'Racinet_1885'
timer = 15
# NOTE(review): absolute, machine-specific path.
data_folder = '/Users/UserE/data/HN_Memoire/corpus_data/docs'
save_images_mad(list_images, doc_id, data_folder, timer)

# Qatar library

In [None]:
# Load a manifest that was saved to disk beforehand.
# NOTE(review): absolute, machine-specific path.
with open('/Users/UserE/Desktop/temp/manifest_qatar.js', encoding='UTF-8') as f :
    manifest = json.load(f)
manifest 
# First and last page URLs on iiif.qdl.qa, kept for reference (not used below).
debut = 'https://iiif.qdl.qa/iiif/images/qnlhc/14760/4940_Page_001.jp2/full/,1200/0/default.jpg'
fin = 'https://iiif.qdl.qa/iiif/images/qnlhc/14760/4940_Page_150.jp2/full/,1200/0/default.jpg'
# Example of a region request on the same server (a bare URL is not valid
# Python, so it is kept as a comment to let the cell run):
# https://iiif.qdl.qa/iiif/images/qnlhc/14760/4940_Page_115.jp2/0,0,1000,1000/1000,/0/default.jpg

# Same item on loc.gov: fetch its manifest and list its images.
# The result of get_list_images() is neither stored nor displayed here.
manifest_loc = 'https://www.loc.gov/item/2021666969/manifest.json'
manifest_loc = get_manifest('https://www.loc.gov/item/2021666969/manifest.json')
get_list_images(manifest_loc)
# Build the 150 page URLs directly on tile.loc.gov (3-digit page numbers, `pct:50` size).
liste_liens = []
for i in range(150) : 
    lien = f'https://tile.loc.gov/image-services/iiif/service:gdc:gdcwdl:wd:l_:14:76:0_:00:2:wdl_14760_002:4940_Page_{i+1:03}/full/pct:50/0/default.jpg'
    liste_liens.append(lien)

timer = 15
# NOTE(review): absolute, machine-specific path.
data_folder = '/Users/UserE/data/HN_Memoire/corpus_data/docs'
save_images(liste_liens, doc_id='PrissedAvennes_1877B', data_folder=data_folder, timer=timer)