# Fetch all the Harvard records for journals or serials

## Setup: imports, logging, and global settings

In [1]:
import xml.etree.ElementTree as ET
import os
import re
from xml.dom import minidom
import requests
import time
import logging
from itertools import product
from datetime import datetime

# Default root-logger configuration: INFO and above, timestamped messages
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
)

# Module-wide settings
BASE_URL = "https://api.lib.harvard.edu/v2/items"  # endpoint every request is sent to
OUTPUT_DIR = "mods_records"  # directory that receives one XML file per record
OVERWRITE_FILES = False  # when False, a record whose file already exists is skipped
LAST_REQUEST_TIME = 0  # time.time() stamp of the latest request (0 = none yet)

# Set up logging

In [2]:
# Create logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

# Formatters: bare message for the console, timestamp + level for the file
brief_formatter = logging.Formatter('%(message)s')
verbose_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# One log file per run, named after the start time
log_filename = f"logs/harvard_fetch_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

# Detach any handlers already on the root logger. They are also closed so
# that re-running this cell does not leave the previous run's log file open.
logger = logging.getLogger()
for handler in logger.handlers[:]:
    logger.removeHandler(handler)
    handler.close()

# File handler - receives everything (DEBUG and up) in the verbose format.
# Explicit UTF-8 so non-ASCII text in log messages is written the same way
# on every platform instead of depending on the locale's default encoding.
file_handler = logging.FileHandler(log_filename, encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(verbose_formatter)

# Console handler - receives INFO and up in the brief format
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(brief_formatter)

# Root logger lets every level through; each handler applies its own threshold
logger.setLevel(logging.DEBUG)
logger.addHandler(file_handler)
logger.addHandler(console_handler)

# Other Functions

In [3]:
def rate_limit_wait():
    """Sleep as needed so consecutive requests are at least 1 second apart.

    Compares the current time with the module-level ``LAST_REQUEST_TIME``
    stamp, sleeps for whatever is missing from one full second, and then
    refreshes the stamp.
    """
    global LAST_REQUEST_TIME
    elapsed = time.time() - LAST_REQUEST_TIME
    if elapsed < 1:
        pause = 1 - elapsed
        logging.debug(f"Rate limiting: waiting {pause:.2f} seconds")
        time.sleep(pause)
    LAST_REQUEST_TIME = time.time()


def fetch_mods_data(params, *, timeout=30):
    """Fetch MODS XML data from the API with rate limiting.

    Args:
        params: Query parameters sent with the GET request to ``BASE_URL``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole run
            indefinitely; a timeout is reported like any other failed request.

    Returns:
        The response body as text, or ``None`` when the request failed
        (connection error, timeout, or an HTTP error status).
    """
    rate_limit_wait()
    try:
        logging.debug(f"Making API request to {BASE_URL}")
        logging.debug(f"Request parameters: {params}")

        response = requests.get(BASE_URL, params=params, timeout=timeout)
        # Turn 4xx/5xx statuses into exceptions so they take the error path
        response.raise_for_status()

        logging.debug(f"Response status code: {response.status_code}")
        return response.text
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too
        logging.error(f"API request failed: {str(e)}")
        logging.debug(f"Full error details: {e}", exc_info=True)
        return None


def get_result_count(params):
    """Get the total number of results for a query.

    Runs the query through ``fetch_mods_data`` and reads the first
    ``numFound`` element (in any namespace) from the response.

    Args:
        params: Query parameters passed through to ``fetch_mods_data``.

    Returns:
        The result count as an int, or 0 when the request failed, the
        response is not well-formed XML, ``numFound`` is missing, or its
        content is not an integer.
    """
    xml_data = fetch_mods_data(params)
    if not xml_data:
        return 0

    try:
        root = ET.fromstring(xml_data)
    except ET.ParseError as e:
        logging.error(f"Failed to parse XML response: {e}")
        logging.debug("XML Parse Error Details:", exc_info=True)
        return 0

    num_found = root.find(".//{*}numFound")
    if num_found is None:
        logging.warning("No numFound element in response")
        return 0

    try:
        count = int(num_found.text)
    except (TypeError, ValueError):
        # An empty element (text is None) or non-numeric text must not
        # abort the caller; treat it like any other unusable response.
        logging.error(f"numFound is not an integer: {num_found.text!r}")
        return 0

    logging.debug(f"Query returned {count} total results")
    return count


def fetch_with_prefix(prefix, record_type='serial'):
    """Fetch every record whose record identifier starts with ``prefix``.

    Args:
        prefix: Leading characters of the record identifier; a ``*``
            wildcard is appended for the query.
        record_type: ``'serial'`` filters on ``issuance=serial``; any other
            value filters on ``genre=periodicals``.

    Returns:
        None. Matching records are handed to ``fetch_all_pages``.
    """
    for detail in (
        f"Starting fetch_with_prefix operation:",
        f"Prefix: {prefix}",
        f"Record type: {record_type}",
        f"Output directory: {OUTPUT_DIR}",
    ):
        logging.debug(detail)

    # The filter is appended last so the key order of the query is unchanged
    if record_type == 'serial':
        type_filter = {'issuance': 'serial'}
    else:  # periodical
        type_filter = {'genre': 'periodicals'}
    params = {'recordIdentifier': f"{prefix}*", 'limit': 250, **type_filter}

    total_results = get_result_count(params)
    if total_results > 0:
        logging.info(f"Found {total_results} {record_type} records for prefix {prefix}")

    logging.debug(f"Query parameters: {params}")

    # Nothing matched (or the count could not be obtained): nothing to page
    if total_results == 0:
        logging.debug(f"Skipping prefix {prefix} - no results")
        return

    fetch_all_pages(params, total_results)


def fetch_all_pages(params, total_results):
    """Fetch all pages of results, using cursor pagination if needed.

    When ``total_results`` fits in a single page (``params['limit']``, 250 if
    absent) the query is issued once and the response is handed to
    ``process_and_save_records``. Otherwise pages are requested with a
    ``cursor`` parameter, starting at ``'*'`` and following each response's
    ``nextCursor`` element, and every page is saved with
    ``save_mods_records``.

    Args:
        params: Base query parameters. The dict is not modified; pagination
            works on a copy.
        total_results: Number of records the query is expected to return.

    Returns:
        None. Pagination stops when all expected records were handled, when
        no ``nextCursor`` is returned, when a request or XML parse fails, or
        when the cursor stops advancing.
    """
    logging.debug(f"Initial parameters: {params}")

    if total_results <= params.get('limit', 250):
        logging.debug("Single page fetch - no pagination needed")
        xml_data = fetch_mods_data(params)
        if xml_data:
            process_and_save_records(xml_data)
        return

    # Copy so the caller's dict never picks up a 'cursor' key
    page_params = dict(params, cursor='*')

    records_processed = 0
    batch_number = 0
    last_progress_report = 0
    progress_interval = 1000  # Only show progress every 1000 records

    while records_processed < total_results:
        batch_number += 1
        logging.debug(f"Fetching batch {batch_number}")

        xml_data = fetch_mods_data(page_params)
        if not xml_data:
            logging.error("Failed to fetch data, breaking pagination loop")
            break

        # One malformed page must not abort the whole run with a traceback
        try:
            root = ET.fromstring(xml_data)
        except ET.ParseError as e:
            logging.error(f"Failed to parse XML for batch {batch_number}: {e}")
            logging.debug("XML Parse Error Details:", exc_info=True)
            break
        next_cursor = root.find(".//{*}nextCursor")

        saved, skipped, errors = save_mods_records(xml_data, OUTPUT_DIR)
        records_processed += saved + skipped

        # Only show progress in console at intervals
        if records_processed - last_progress_report >= progress_interval:
            logging.info(f"Progress: {records_processed}/{total_results} records ({(records_processed/total_results*100):.1f}%)")
            last_progress_report = records_processed

        # Detailed batch information goes to debug
        logging.debug(f"Batch {batch_number} complete:")
        logging.debug(f"- Records saved: {saved}")
        logging.debug(f"- Records skipped: {skipped}")

        for error in errors:
            logging.error(f"Batch {batch_number} error: {error}")

        if next_cursor is None or records_processed >= total_results:
            logging.debug("Pagination complete")
            break

        # An empty or unchanged cursor would request the same page again;
        # if that page adds no progress the loop would never terminate.
        if not next_cursor.text or next_cursor.text == page_params['cursor']:
            logging.warning("Cursor did not advance, stopping pagination")
            break

        page_params['cursor'] = next_cursor.text
        logging.debug(f"Next cursor: {page_params['cursor']}")


def systematic_fetch(start_digits='99', digit_positions=4):
    """Walk every numeric prefix and fetch its serial and periodical records.

    Each prefix is ``start_digits`` followed by one of the
    ``10 ** digit_positions`` possible digit combinations, visited in
    ascending order.

    Args:
        start_digits: Fixed leading part shared by every prefix.
        digit_positions: Number of varying digits appended to it.
    """
    total_prefixes = 10 ** digit_positions
    logging.info(f"Starting systematic fetch with {total_prefixes} prefixes...")

    combinations = product(range(10), repeat=digit_positions)
    for current_prefix, digits in enumerate(combinations, start=1):
        prefix = start_digits + ''.join(str(digit) for digit in digits)

        # Console sees the first, the last and every tenth prefix;
        # everything else is logged at debug level only.
        is_milestone = (
            current_prefix == 1
            or current_prefix == total_prefixes
            or current_prefix % 10 == 0
        )
        if is_milestone:
            logging.info(f"Processing prefix group: {prefix[:-1]}x ({current_prefix}/{total_prefixes})")
        else:
            logging.debug(f"Processing prefix: {prefix}")

        for record_type in ('serial', 'periodical'):
            fetch_with_prefix(prefix, record_type)

    logging.info("Systematic fetch complete")


def extract_record_identifier(record):
    """Pick an identifier for a MODS record, trying several sources in turn.

    Order of preference:
      1. ``recordIdentifier`` with ``source='MH:ALMA'`` (returned as is)
      2. any ``recordIdentifier`` (returned with a ``record_`` prefix)
      3. the first ``titleInfo/title``, passed through ``sanitize_filename``

    Returns ``"untitled_record"`` when none of these has text and
    ``"error_record"`` when the lookup itself raises.
    """
    def text_at(xpath):
        # Text of the first element matching xpath, or None if absent/empty
        node = record.find(xpath)
        if node is None or not node.text:
            return None
        return node.text

    try:
        alma_id = text_at(".//{*}recordInfo/{*}recordIdentifier[@source='MH:ALMA']")
        if alma_id:
            logging.debug(f"Found ALMA identifier: {alma_id}")
            return alma_id

        generic_id = text_at(".//{*}recordInfo/{*}recordIdentifier")
        if generic_id:
            identifier = f"record_{generic_id}"
            logging.debug(f"Using generic record identifier: {identifier}")
            return identifier

        title = text_at(".//{*}titleInfo/{*}title")
        if title:
            identifier = sanitize_filename(title)
            logging.debug(f"Using sanitized title as identifier: {identifier}")
            return identifier

        logging.warning("No valid identifier found, using default")
        return "untitled_record"
    except Exception as exc:
        logging.error(f"Error extracting identifier: {str(exc)}")
        logging.debug("Identifier extraction error details:", exc_info=True)
        return "error_record"

def sanitize_filename(title):
    """Create a safe filename stem from a title.

    Every character that is not a word character, whitespace or a hyphen
    becomes ``_``; each run of hyphens/whitespace collapses to a single
    ``_``; the result is cut to 100 characters and stripped of leading and
    trailing underscores.

    Args:
        title: Free-text title.

    Returns:
        The sanitized stem, or ``"untitled_record"`` when nothing usable is
        left (empty title, or one made only of punctuation/whitespace), so
        that callers never end up with an empty file name such as ``.xml``.
    """
    safe_title = re.sub(r'[^\w\s-]', '_', title)
    safe_title = re.sub(r'[-\s]+', '_', safe_title)
    # Truncate first, then strip, so a cut that lands on '_' is cleaned up too
    safe_title = safe_title[:100].strip('_')
    return safe_title or "untitled_record"

def _write_text_atomically(filepath, text):
    """Write text to filepath via a temporary sibling file and a rename."""
    tmp_path = f"{filepath}.part"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(text)
        os.replace(tmp_path, filepath)
    finally:
        # Still present only when the write or the rename failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def save_mods_records(xml_string, OUTPUT_DIR):
    """Save individual MODS records as separate XML files.

    Every ``mods`` element (in any namespace) found in ``xml_string`` is
    pretty-printed and written to ``<OUTPUT_DIR>/<identifier>.xml``, where
    the identifier comes from ``extract_record_identifier``. When the
    module-level ``OVERWRITE_FILES`` is false, records whose file already
    exists are skipped.

    Files are written to a temporary name and then renamed into place: an
    interrupted write would otherwise leave a truncated ``.xml`` file that
    later runs skip as "already existing".

    Args:
        xml_string: XML document containing zero or more MODS records.
        OUTPUT_DIR: Directory receiving the files; created if missing.

    Returns:
        Tuple ``(saved_count, skipped_count, errors)`` where ``errors`` is a
        list of human-readable messages. Failures are reported through that
        list and the log rather than raised.
    """
    errors = []
    saved_count = 0
    skipped_count = 0

    try:
        root = ET.fromstring(xml_string)
        records = root.findall(".//{*}mods")

        if not records:
            msg = "No MODS records found in XML"
            logging.warning(msg)
            errors.append(msg)
            return 0, 0, errors

        # Do not rely on the caller having created the directory beforehand
        if OUTPUT_DIR:
            os.makedirs(OUTPUT_DIR, exist_ok=True)

        for i, record in enumerate(records, 1):
            try:
                record_id = extract_record_identifier(record)
                filename = f"{record_id}.xml"
                filepath = os.path.join(OUTPUT_DIR, filename)

                if not OVERWRITE_FILES and os.path.exists(filepath):
                    logging.debug(f"Skipping existing file: {filename}")
                    skipped_count += 1
                    continue

                record_str = minidom.parseString(ET.tostring(record)).toprettyxml(indent="  ")
                # Drop the whitespace-only lines the pretty-printer leaves behind
                record_str = '\n'.join(line for line in record_str.split('\n') if line.strip())

                if not record_str.startswith('<?xml'):
                    record_str = '<?xml version="1.0" encoding="UTF-8"?>\n' + record_str

                _write_text_atomically(filepath, record_str)

                saved_count += 1
                logging.debug(f"Saved record {i}: {filename}")

            except Exception as e:
                # One bad record must not stop the rest of the batch
                error_msg = f"Error processing record {i}: {str(e)}"
                logging.error(error_msg)
                logging.debug("Record processing error details:", exc_info=True)
                errors.append(error_msg)

        # Summary only goes to debug
        logging.debug(f"Batch complete - Saved: {saved_count}, Skipped: {skipped_count}")

        return saved_count, skipped_count, errors

    except Exception as e:
        error_msg = f"Fatal error processing records: {str(e)}"
        logging.error(error_msg)
        logging.debug("Fatal error details:", exc_info=True)
        errors.append(error_msg)
        return saved_count, skipped_count, errors

def process_and_save_records(xml_string):
    """Save the records of one response into ``OUTPUT_DIR`` and log a summary.

    Delegates to ``save_mods_records``, reports the saved/skipped totals at
    INFO level and any collected errors at ERROR level.

    Returns:
        The ``(saved_count, skipped_count, errors)`` triple produced by
        ``save_mods_records``.
    """
    outcome = save_mods_records(xml_string, OUTPUT_DIR)
    saved_count, skipped_count, errors = outcome

    logging.info(f"Processed batch: {saved_count} saved, {skipped_count} skipped")
    if errors:
        logging.error(f"Errors: {errors}")

    return saved_count, skipped_count, errors



## Main Analysis

In [4]:
# Entry point of the notebook: make sure the working directories exist, then
# run the crawl.

# Create output directory for XML files, if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Create logs directory if it doesn't exist
os.makedirs('logs', exist_ok=True)

logging.info("Let's get started")

# Quick trial run: to test against a couple of specific prefixes, uncomment
# the lines below (and comment out the systematic_fetch call further down).
#test_prefixes = ['990000924', '990000925']
#for prefix in test_prefixes:
#   fetch_with_prefix(prefix, 'serial')
#   fetch_with_prefix(prefix, 'periodical')

# Full crawl: every 4-digit extension of the '99' prefix (10,000 prefixes),
# each queried once for serials and once for periodicals. Comment this line
# out when only the trial run above is wanted.
systematic_fetch(start_digits='99', digit_positions=4)

logging.info("Done")

Let's get started
Starting systematic fetch with 10000 prefixes...
Processing prefix group: 99000x (1/10000)
Found 4759 serial records for prefix 990000
Progress: 1000/4759 records (21.0%)
Progress: 2000/4759 records (42.0%)
Progress: 3000/4759 records (63.0%)
Progress: 4000/4759 records (84.1%)
Found 1972 periodical records for prefix 990000
Progress: 1000/1972 records (50.7%)
Found 66270 serial records for prefix 990001
Progress: 1000/66270 records (1.5%)
Progress: 2000/66270 records (3.0%)
Progress: 3000/66270 records (4.5%)
Progress: 4000/66270 records (6.0%)
Progress: 5000/66270 records (7.5%)
Progress: 6000/66270 records (9.1%)
Progress: 7000/66270 records (10.6%)
Progress: 8000/66270 records (12.1%)
Progress: 9000/66270 records (13.6%)
Progress: 10000/66270 records (15.1%)
Progress: 11000/66270 records (16.6%)
Progress: 12000/66270 records (18.1%)
Progress: 13000/66270 records (19.6%)
Progress: 14000/66270 records (21.1%)
Progress: 15000/66270 records (22.6%)
Progress: 16000/662