# In [None]:
import json
import re
import requests
import zipfile
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import time
from pathlib import Path

# Single lock shared by every caller of safe_print().
print_lock = Lock()


def safe_print(*args, **kwargs):
    """Forward all arguments to ``print`` while holding ``print_lock``.

    Calls made through this function are serialised, so messages from
    different worker threads are not interleaved with each other.
    """
    print_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        # Always give the lock back, even if print() itself raised.
        print_lock.release()

def download_txt_and_get_filename(url, timeout=30):
    """
    Download a TXT file and determine the filename it was served under.

    The filename is taken from the ``Content-Disposition`` response header
    when one is present. Otherwise the last path segment of ``url`` (query
    string and fragment removed) is used, but only if it ends in ``.txt``.

    Parameters:
    -----------
    url : str
        Address of the TXT file.
    timeout : int or float
        Timeout handed to ``requests.get`` (default: 30).

    Returns:
    --------
    (content, filename)
        ``content`` is the response body decoded as UTF-8; ``filename`` is
        None when no filename could be determined.
        (None, None) is returned when the request fails or the body is not
        valid UTF-8; the error is reported through ``safe_print``.
    """
    try:
        # Using the response as a context manager releases the streamed
        # connection even when raise_for_status() raises before the body
        # has been read.
        with requests.get(url, timeout=timeout, stream=True) as response:
            response.raise_for_status()
            content_disposition = response.headers.get('Content-Disposition')
            content = response.content.decode('utf-8')
    except requests.exceptions.RequestException as e:
        safe_print(f"⚠️ Error downloading from {url}: {e}")
        return None, None
    except UnicodeDecodeError as e:
        safe_print(f"⚠️ Error decoding content from {url}: {e}")
        return None, None

    filename = None
    if content_disposition:
        # Header looks like: attachment; filename="21424.txt"
        # A quoted value runs to the closing quote; an unquoted value stops
        # at the next ';' so trailing parameters are not swallowed.
        match = re.search(
            r'filename="([^"]+)"|filename="?([^";]+)',
            content_disposition,
            re.IGNORECASE,
        )
        if match:
            filename = (match.group(1) or match.group(2)).strip() or None

    if not filename:
        # Fall back to the last URL path segment, ignoring any query string
        # or fragment that would otherwise hide a ".txt" suffix.
        candidate = url.split('#', 1)[0].split('?', 1)[0].split('/')[-1]
        if candidate.endswith('.txt'):
            filename = candidate

    return content, filename

def extract_number_from_filename(filename):
    """Return the numeric file ID found in ``filename`` as a string.

    The digits directly in front of a trailing ``.txt`` are preferred; if
    there are none, the first run of digits anywhere in the name is used.
    Returns None for an empty/None filename or one without any digits.
    """
    if not filename:
        return None

    # Patterns are tried in order of preference; the first hit wins.
    for id_pattern in (r'(\d+)\.txt$', r'(\d+)'):
        found = re.search(id_pattern, filename)
        if found is not None:
            return found.group(1)

    return None

def download_json_file(url, timeout=30):
    """Fetch ``url`` and return the response body as text.

    Returns None when the request fails (connection problem, timeout or an
    HTTP error status); the failure is reported through ``safe_print``.
    """
    try:
        reply = requests.get(url, timeout=timeout)
        reply.raise_for_status()
        body = reply.text
    except requests.exceptions.RequestException as exc:
        safe_print(f"⚠️ Error downloading JSON from {url}: {exc}")
        return None
    return body

def parse_transcription_txt(txt_content):
    """Split a transcription TXT into a ``{page_id: text}`` dictionary.

    Each section starts with a three-line header of the form::

        #+ ---
        #+ NL-HaNA_1.04.02_<digits>_<digits>.xml
        #+ ---

    The page id is the header's file name without ``.xml``. The text of a
    section is everything up to the next header (or the end of the input),
    with surrounding whitespace stripped. If an id occurs more than once,
    the last section with that id wins.
    """
    header_pattern = r'#\+\s*---\s*\n#\+\s*(NL-HaNA_1\.04\.02_\d+_\d+)\.xml\s*\n#\+\s*---'
    headers = list(re.finditer(header_pattern, txt_content))

    # A section ends where the following header begins; the final section
    # runs to the end of the input.
    section_ends = [following.start() for following in headers[1:]]
    section_ends.append(len(txt_content))

    return {
        header.group(1): txt_content[header.end():stop].strip()
        for header, stop in zip(headers, section_ends)
    }

def merge_transcription_into_manifest(manifest_data, transcription_dict):
    """Attach transcription text to every canvas of a manifest, in place.

    For each entry of ``manifest_data['items']`` the first English label
    (``canvas['label']['en'][0]``, stripped) is looked up in
    ``transcription_dict``. On a hit the text is stored under
    ``canvas['text']``; otherwise ``canvas['text']`` is set to "".

    Returns the number of canvases that received a transcription. When
    ``manifest_data`` has no ``items`` list, nothing is changed and 0 is
    returned.
    """
    if 'items' not in manifest_data or not isinstance(manifest_data['items'], list):
        return 0

    merged = 0
    for canvas in manifest_data['items']:
        label = None
        if 'label' in canvas:
            label_map = canvas['label']
            if 'en' in label_map and label_map['en']:
                label = label_map['en'][0].strip()

        # No usable label, or no transcription for it: leave the text empty.
        if not label or label not in transcription_dict:
            canvas['text'] = ""
            continue

        canvas['text'] = transcription_dict[label]
        merged += 1

    return merged

def process_single_file(txt_url, json_url_template, output_dir):
    """
    Process a single file:
    1. Download TXT file and get its filename
    2. Extract number from filename
    3. Download corresponding JSON using that number
    4. Merge and save

    Parameters:
    -----------
    txt_url : str
        URL of the transcription TXT file.
    json_url_template : str
        ``str.format`` template with one positional placeholder that
        receives the extracted file ID.
    output_dir : str
        Existing directory; the merged manifest is written there as
        ``merged_<file_id>.json``.

    Returns:
    --------
    dict
        Always contains ``txt_url``, ``file_id``, ``success``, ``manifest``
        and ``error``. ``filename`` is added once a file ID was extracted;
        ``transcription_count`` and ``merged_count`` are added on success.
        Exceptions raised while processing are caught and recorded in
        ``error`` instead of propagating.
    """
    result = {
        'txt_url': txt_url,
        'file_id': None,
        'success': False,
        'manifest': None,
        'error': None
    }

    try:
        # Download TXT file and get filename
        txt_content, txt_filename = download_txt_and_get_filename(txt_url)

        if not txt_content or not txt_filename:
            result['error'] = "Failed to download TXT or get filename"
            safe_print(f"⚠️ Failed: {txt_url} - could not download or get filename")
            return result

        # Extract file ID from filename
        file_id = extract_number_from_filename(txt_filename)

        if not file_id:
            result['error'] = f"Could not extract number from filename: {txt_filename}"
            safe_print(f"⚠️ Failed: {txt_url} - filename '{txt_filename}' has no number")
            return result

        result['file_id'] = file_id
        result['filename'] = txt_filename

        # Construct JSON URL using the extracted file ID
        json_url = json_url_template.format(file_id)

        # Download JSON file (the helper reports its own download errors)
        json_content = download_json_file(json_url)
        if not json_content:
            result['error'] = f"Failed to download JSON from {json_url}"
            return result

        # Parse JSON
        manifest_data = json.loads(json_content)

        # Parse transcription
        transcription_dict = parse_transcription_txt(txt_content)

        # Merge transcription into manifest
        found_matches = merge_transcription_into_manifest(manifest_data, transcription_dict)

        # Save individual manifest
        output_filename = f"{output_dir}/merged_{file_id}.json"
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(manifest_data, f, indent=2, ensure_ascii=False)

        result['success'] = True
        result['manifest'] = manifest_data
        result['transcription_count'] = len(transcription_dict)
        result['merged_count'] = found_matches

        safe_print(f"✓ [{file_id}] '{txt_filename}' → {len(transcription_dict)} sections, {found_matches} canvases")

    except json.JSONDecodeError as e:
        result['error'] = f"JSON parse error: {e}"
        # 'file_id' is always present (possibly None), so a .get() default
        # would never apply; fall back to '?' explicitly.
        safe_print(f"⚠️ [{result['file_id'] or '?'}] JSON parse error: {e}")
    except Exception as e:
        # Worker boundary: record the failure so one bad file cannot abort
        # the whole batch.
        result['error'] = f"Processing error: {e}"
        safe_print(f"⚠️ [{result['file_id'] or '?'}] Error: {e}")

    return result

def create_combined_manifest(all_manifests, file_ids):
    """Build one manifest dictionary holding the canvases of all manifests.

    The ``items`` of every manifest in ``all_manifests`` are appended, in
    order, to the ``items`` of a new manifest whose metadata records the
    number of documents and the sorted, comma-separated ``file_ids``.
    A one-line summary is reported through ``safe_print``.

    Returns the combined manifest, or None when ``all_manifests`` is empty.
    """
    if not all_manifests:
        return None

    document_total = len(all_manifests)

    def metadata_entry(label_text, value_text):
        # One English-only label/value pair for the "metadata" list.
        return {
            "label": {"en": [label_text]},
            "value": {"en": [value_text]}
        }

    combined = {
        "@context": "http://iiif.io/api/presentation/3/context.json",
        "id": "https://data.globalise.huygens.knaw.nl/manifests/inventories/combined_all.json",
        "type": "Manifest",
        "label": {
            "en": [f"Combined Manifest - {document_total} Documents"]
        },
        "metadata": [
            metadata_entry("Collection", "NL-HaNA VOC Archives"),
            metadata_entry("Total Documents", str(document_total)),
            metadata_entry("Document IDs", ", ".join(sorted(file_ids))),
        ],
        "rights": "https://creativecommons.org/publicdomain/mark/1.0/",
        "items": []
    }

    canvas_total = 0
    for document in all_manifests:
        if 'items' not in document:
            continue
        combined['items'].extend(document['items'])
        canvas_total += len(document['items'])

    safe_print(f"\n✓ Combined manifest: {canvas_total} canvases from {document_total} documents")

    return combined

def main(tab_file_path, max_workers=15):
    """
    Main processing function.

    Reads TXT download URLs (one per non-blank line) from ``tab_file_path``,
    processes them concurrently with ``process_single_file``, writes the
    merged manifests plus a combined manifest into ``merged_manifests/``,
    zips that directory into ``merged_manifests.zip`` and prints a summary.

    Returns None. If the file is missing or contains no URLs, a message is
    printed and nothing is downloaded or written.

    Parameters:
    -----------
    tab_file_path : str
        Path to the .tab file containing TXT download URLs
    max_workers : int
        Number of concurrent workers (default: 15)
    """

    print("="*70)
    print("IIIF Manifest Batch Processor (Local Jupyter)")
    print("="*70)

    # Check if tab file exists
    if not os.path.exists(tab_file_path):
        print(f"❌ Error: File not found: {tab_file_path}")
        return

    print(f"\n✓ Reading from: {tab_file_path}")

    # Read URLs from file, skipping blank lines
    with open(tab_file_path, 'r', encoding='utf-8') as f:
        txt_urls = [line.strip() for line in f if line.strip()]

    print(f"✓ Found {len(txt_urls)} TXT download URLs")

    # Nothing to do; stopping here also avoids dividing by zero in the
    # per-file average of the summary below.
    if not txt_urls:
        print(f"❌ Error: No URLs found in: {tab_file_path}")
        return

    # JSON URL template
    json_url_template = "https://data.globalise.huygens.knaw.nl/manifests/inventories/{}.json"

    # Create output directory
    output_dir = "merged_manifests"
    os.makedirs(output_dir, exist_ok=True)
    print(f"✓ Output directory: {output_dir}/")

    # Process files concurrently
    print(f"\n{'='*70}")
    print(f"Processing {len(txt_urls)} files concurrently...")
    print("Step 1: Download TXT files and extract filenames")
    print("Step 2: Use filename numbers to download JSON manifests")
    print("Step 3: Merge transcriptions and save")
    print(f"{'='*70}\n")

    start_time = time.time()
    all_results = []

    # Concurrent processing
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(
                process_single_file,
                txt_url,
                json_url_template,
                output_dir
            ): txt_url
            for txt_url in txt_urls
        }

        # Count every finished future, including ones that raised, so the
        # progress figure matches the number of URLs handled.
        for completed, future in enumerate(as_completed(future_to_url), start=1):
            txt_url = future_to_url[future]
            try:
                all_results.append(future.result())
            except Exception as e:
                safe_print(f"⚠️ Unexpected error for {txt_url}: {e}")
                all_results.append({
                    'txt_url': txt_url,
                    'file_id': None,
                    'success': False,
                    'manifest': None,
                    'error': str(e)
                })

            if completed % 5 == 0:
                safe_print(f"Progress: {completed}/{len(txt_urls)} completed...")

    elapsed_time = time.time() - start_time

    # Separate results
    successful_results = [r for r in all_results if r['success']]
    failed_results = [r for r in all_results if not r['success']]

    all_manifests = [r['manifest'] for r in successful_results]
    successful_ids = [r['file_id'] for r in successful_results]

    # Create combined manifest
    print(f"\n{'='*70}")
    print("Creating combined manifest...")
    print(f"{'='*70}")

    combined_manifest = create_combined_manifest(all_manifests, successful_ids)

    if combined_manifest:
        combined_filename = f"{output_dir}/combined_all_manifests.json"
        with open(combined_filename, 'w', encoding='utf-8') as f:
            json.dump(combined_manifest, f, indent=2, ensure_ascii=False)
        print(f"✓ Saved: {combined_filename}")

    # Create ZIP
    print(f"\n{'='*70}")
    print("Creating ZIP archive...")
    print(f"{'='*70}")

    zip_filename = "merged_manifests.zip"
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files_list in os.walk(output_dir):
            for file in files_list:
                file_path = os.path.join(root, file)
                # Store paths relative to the output directory
                arcname = os.path.relpath(file_path, output_dir)
                zipf.write(file_path, arcname)

    print(f"✓ Created: {zip_filename}")

    # Summary
    print(f"\n{'='*70}")
    print("PROCESSING COMPLETE")
    print(f"{'='*70}")
    print(f"⏱️  Total time: {elapsed_time:.2f} seconds ({elapsed_time/60:.2f} minutes)")
    print(f"⚡ Avg speed: {elapsed_time/len(txt_urls):.2f} seconds per file")
    print(f"✓ Successfully processed: {len(successful_results)} files")
    print(f"✗ Failed: {len(failed_results)} files")

    if successful_results:
        print(f"\nSuccessfully processed IDs: {', '.join(sorted(successful_ids)[:10])}", end="")
        if len(successful_ids) > 10:
            print(f"... (+{len(successful_ids)-10} more)")
        else:
            print()

    if failed_results:
        print("\nFailed files:")
        for result in failed_results[:10]:
            # 'file_id' is present but None for most failures, so a .get()
            # default alone would print "None" instead of "?".
            file_id = result.get('file_id') or '?'
            error = result.get('error') or 'Unknown error'
            print(f"  [{file_id}] {error}")
        if len(failed_results) > 10:
            print(f"  ... and {len(failed_results) - 10} more")

    print(f"\n{'='*70}")
    print("Output files saved in current directory:")
    print(f"  • {output_dir}/ - Individual manifests ({len(all_manifests)} files)")
    if combined_manifest:
        # Only advertise the combined manifest when it was actually written
        print(f"  • {output_dir}/combined_all_manifests.json - Combined manifest")
    print(f"  • {zip_filename} - ZIP archive of all files")
    print(f"{'='*70}")
    print("\n✓ Done!")


# Example usage:
# Change the path below to point at your own .tab file. The guard keeps the
# batch job from starting when this file is imported instead of run; it
# still runs when executed as a script or as a notebook cell.
if __name__ == "__main__":
    main('globalise_transcriptions_v2_txt.tab')

# Or with custom number of workers:
# main('urls.tab', max_workers=20)