# Cross-Reference Harvard Data with CrossRef Data

## Import statements

In [1]:
import pandas as pd
import json
import os
import glob
import logging
import sys
from datetime import datetime
from typing import List, Dict, Any, Set, Optional
from pathlib import Path

## Set up global variables

In [2]:
# Path prefix for timestamped log files.
# NOTE(review): setup_logging() builds its log path from an identical
# hard-coded literal rather than reading this constant - confirm which one
# is meant to be authoritative.
LOGGING_FILE_NAME = "logs/harvard_crossref_match_"
# Excel workbook holding the Harvard journal records (read by main()).
HARVARD_MD_FILE = "Output Data/Harvard Data with LCC Subjects and only ISSN.xlsx"
# Directory scanned for cached CrossRef "*.json" files (read by main()).
CROSSREF_MD_DIR = "crossref_cache"
# Base name (without extension) of the final Excel output written by main().
OUTPUT_MD_FILE_ROOT = "harvard_crossref_match_final"

## Set up Logging

In [3]:
# Set up logging first
def setup_logging():
    """Configure the root logger with a UTF-8 file handler and a console handler.

    Any handlers already attached to the root logger are detached *and closed*
    first, so calling this function repeatedly (for example by re-running a
    notebook cell) neither duplicates output nor leaks open log files.

    The file handler records everything from DEBUG upwards with timestamps in
    a new ``logs/harvard_crossref_match_<YYYYmmdd_HHMMSS>.log`` file; the
    console handler prints bare messages from INFO upwards to ``sys.stdout``.

    Returns:
        The configured root logger.
    """
    # Create logs directory if it doesn't exist
    os.makedirs('logs', exist_ok=True)

    # Terse output for the console, timestamped detail for the file
    brief_formatter = logging.Formatter('%(message)s')
    verbose_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    log_filename = f"logs/harvard_crossref_match_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

    # Detach existing handlers. Closing them releases the file descriptor of
    # any FileHandler left over from a previous call; removeHandler() alone
    # would leave that file open for the life of the process.
    logger = logging.getLogger()
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

    # File handler with UTF-8 encoding
    file_handler = logging.FileHandler(log_filename, encoding='utf-8')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(verbose_formatter)

    # Console handler with system encoding
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(brief_formatter)

    # The root logger must pass DEBUG through so the file handler can see it;
    # each handler then applies its own threshold.
    logger.setLevel(logging.DEBUG)
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    return logger

## Set up for Processing

In [4]:
def create_checkpoint_dir():
    """Make sure the checkpoint folder exists and hand back its path.

    Returns:
        A relative ``Path`` to the "Harvard Analysis Checkpoints" directory,
        which is created in the current working directory when absent.
    """
    target = Path("Harvard Analysis Checkpoints")
    # exist_ok keeps repeated runs from failing once the folder is there
    target.mkdir(exist_ok=True)
    return target

## Normalize the ISSN

In [5]:
# Characters deleted during normalization: the colon of an "ISSN:" prefix, the
# ASCII hyphen, and the Unicode hyphen/dash/minus look-alikes that commonly
# replace it in copied or spreadsheet-sourced data.
_ISSN_STRIP_TABLE = str.maketrans('', '', ':-\u2010\u2011\u2012\u2013\u2014\u2212')
_ISSN_DIGITS = '0123456789'


def normalize_issn(issn: str) -> Optional[str]:
    """Normalize an ISSN to its bare 8-character form.

    The value is upper-cased, an ``ISSN`` prefix (with optional colon) is
    dropped, and all whitespace and hyphen/dash characters are removed.

    Args:
        issn: The raw ISSN. Non-string values are converted with ``str()``.

    Returns:
        The normalized ISSN (for example ``'1234567X'``), or ``None`` when the
        result is not a structurally valid ISSN:

        - exactly 8 characters,
        - the first 7 are ASCII digits,
        - the last is an ASCII digit or ``'X'``.

        The check digit itself is not verified.
    """
    if not isinstance(issn, str):
        issn = str(issn)

    # Upper-case first so a lowercase 'issn' prefix and 'x' check character
    # are treated like their upper-case forms.
    text = issn.upper().replace('ISSN', '')
    # str.split() with no argument splits on every whitespace character
    # (tabs, newlines, non-breaking spaces), not just the ASCII space.
    text = ''.join(text.split())
    text = text.translate(_ISSN_STRIP_TABLE)

    # 1. Should be 8 characters after normalization
    if len(text) != 8:
        return None

    # 2. First 7 characters must be ASCII digits. str.isdigit() is avoided
    #    because it also accepts non-ASCII digits (full-width, superscript,
    #    ...) that can never match another source's ISSN.
    if any(ch not in _ISSN_DIGITS for ch in text[:7]):
        return None

    # 3. Last character must be a digit or 'X'
    if text[7] not in _ISSN_DIGITS and text[7] != 'X':
        return None

    return text

def validate_issn_list(issns: Set[str]) -> Set[str]:
    """Normalize every ISSN in ``issns`` and keep only the ones that validate.

    Args:
        issns: Raw ISSN strings.

    Returns:
        The set of normalized ISSNs; entries that ``normalize_issn`` rejects
        are left out.
    """
    candidates = (normalize_issn(raw) for raw in issns)
    return {cleaned for cleaned in candidates if cleaned}

## CrossRef diagnostics

In [6]:
def save_crossref_diagnostic(issn_to_data: Dict[str, List[Dict]], checkpoint_dir: Path):
    """Save a flattened view of the CrossRef data to Excel for diagnostics.

    One spreadsheet row is written per (ISSN, CrossRef file) pair to
    ``crossref_data_diagnostic.xlsx`` inside ``checkpoint_dir``.

    Args:
        issn_to_data: Mapping of ISSN to a list of entries shaped like
            ``{'data': <parsed JSON>, 'filename': <str>}``.
        checkpoint_dir: Directory the workbook is written to.

    Returns:
        None. When there is nothing to write, a warning is logged and no file
        is created.
    """
    logger = logging.getLogger()

    # Create a list to hold the flattened data
    flattened_data = []

    for issn, data_list in issn_to_data.items():
        for data in data_list:
            # A cached file may have no 'message' object (load_crossref_data
            # reads it with .get('message', {})), so indexing it directly
            # would abort the whole diagnostic with a KeyError.
            message = data['data'].get('message') or {}
            counts = message.get('counts') or {}
            flattened_data.append({
                'issn': issn,
                'filename': data['filename'],
                'publisher': message.get('publisher', ''),
                'title': message.get('title', ''),
                'all_issns': message.get('ISSN', []),  # Now stored as a list
                'total_dois': counts.get('total-dois', 0)
            })

    if not flattened_data:
        logger.warning("No CrossRef data to save for diagnostics")
        return

    df = pd.DataFrame(flattened_data)

    # Lists cannot be written to Excel cells, so render them as strings.
    # Every row is inspected: checking only the first row would miss a column
    # whose first value happens not to be a list.
    for col in df.columns:
        if df[col].map(lambda value: isinstance(value, list)).any():
            df[col] = df[col].map(
                lambda value: str(value) if isinstance(value, list) else value
            )

    output_file = checkpoint_dir / "crossref_data_diagnostic.xlsx"
    df.to_excel(output_file, index=False)
    logger.info(f"Saved CrossRef diagnostic data to {output_file}")


## Load the Data

In [7]:
def load_harvard_data(excel_path: str) -> pd.DataFrame:
    """Read the Harvard journal spreadsheet and log a small ISSN sample.

    Args:
        excel_path: Path of the Excel workbook to read.

    Returns:
        The workbook contents as a DataFrame. For the first five rows the raw
        and normalized ISSN values are logged as a sanity check.
    """
    log = logging.getLogger()
    log.info(f"Loading Harvard data from {excel_path}")
    harvard = pd.read_excel(excel_path)
    log.info(f"Loaded {len(harvard)} rows from Harvard data")

    # Preview a handful of rows so normalization problems show up early
    issn_fields = ('issn', 'issn_l', 'issn_print', 'issn_other_online')
    log.info("Sample of Harvard ISSN data:")
    for position, record in harvard.head(5).iterrows():
        raw_issns = {}
        for field in issn_fields:
            if field in record and pd.notna(record[field]):
                raw_issns[field] = record[field]
        normalized_issns = extract_issns_from_row(record)
        log.info(f"Row {position}:")
        log.info(f"  Raw ISSNs: {raw_issns}")
        log.info(f"  Normalized ISSNs: {normalized_issns}")
    return harvard


def load_crossref_data(json_dir: str) -> Dict[str, List[Dict]]:
    """Load all CrossRef JSON files and index them by normalized ISSN.

    ISSNs are taken both from the file name (the stem split on ``'; '``) and
    from the ``message.ISSN`` list inside the JSON. The same file entry is
    stored under every valid ISSN found for it.

    Args:
        json_dir: Directory containing the cached ``*.json`` files.

    Returns:
        A dictionary mapping each normalized ISSN to a list of
        ``{'data': <parsed JSON>, 'filename': <base name>}`` entries. Files
        that cannot be read or parsed are logged and skipped.
    """
    logger = logging.getLogger()
    issn_to_data: Dict[str, List[Dict]] = {}

    # Scan the directory once and sort, so the order of entries stored under
    # each ISSN is reproducible from run to run.
    json_files = sorted(glob.glob(os.path.join(json_dir, "*.json")))
    logger.info(f"Found {len(json_files)} JSON files to process")

    for json_path in json_files:
        filename = os.path.basename(json_path)
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # splitext removes only the trailing extension; a plain
            # str.replace would also strip '.json' from inside the name.
            raw_filename_issns = set(os.path.splitext(filename)[0].split('; '))
            raw_json_issns = set(data.get('message', {}).get('ISSN', []))

            filename_issns = validate_issn_list(raw_filename_issns)
            json_issns = validate_issn_list(raw_json_issns)
            all_issns = filename_issns.union(json_issns)

            # Log the raw values under the "Raw" labels so that rejected
            # ISSNs are actually visible when debugging.
            logger.debug(f"Processing {json_path}:")
            logger.debug(f"  - Raw filename ISSNs: {raw_filename_issns}")
            logger.debug(f"  - Raw JSON ISSNs: {raw_json_issns}")
            logger.debug(f"  - Normalized ISSNs: {all_issns}")

            if not all_issns:
                logger.warning(f"No valid ISSNs found in {json_path}")
            else:
                logger.debug(f"Found {len(all_issns)} valid ISSNs in {json_path}")

            for issn in all_issns:
                issn_to_data.setdefault(issn, []).append({
                    'data': data,
                    'filename': filename
                })

        except Exception as e:
            # Best effort: one unreadable or malformed file must not stop
            # the rest of the cache from loading.
            logger.error(f"Error processing {json_path}: {str(e)}")

    logger.info(f"Loaded data for {len(issn_to_data)} unique ISSNs")
    return issn_to_data



## Extract ISSN from Harvard Data

In [8]:
def extract_issns_from_row(row: pd.Series) -> Set[str]:
    """Collect the normalized ISSNs found in a row's ISSN columns.

    The columns 'issn', 'issn_l', 'issn_print' and 'issn_other_online' are
    examined. Columns that are missing or hold NaN are skipped, as are values
    that ``normalize_issn`` rejects.
    """
    found = set()

    for column in ('issn', 'issn_l', 'issn_print', 'issn_other_online'):
        if column not in row:
            continue
        if not pd.notna(row[column]):
            continue
        cleaned = normalize_issn(str(row[column]).strip())
        if cleaned:
            found.add(cleaned)

    return found


## Process CrossRef Matches

In [9]:
def extract_year_range(breakdown_data: List) -> tuple:
    """Return the (earliest, latest) year in dois-by-issued-year data.

    Args:
        breakdown_data: Sequence of two-item ``(year, count)`` entries.

    Returns:
        ``(min_year, max_year)``, or ``(None, None)`` when the input is empty
        or ``None``.
    """
    if breakdown_data:
        issued = [entry_year for entry_year, _count in breakdown_data]
        earliest, latest = min(issued), max(issued)
    else:
        earliest = latest = None
    return earliest, latest

def _subject_label(subject):
    """Return a hashable label for one CrossRef subject entry."""
    # NOTE(review): CrossRef may deliver subjects as objects (e.g. with a
    # 'name' key) rather than plain strings - confirm against the cached
    # files. Dicts are unhashable and would otherwise crash set insertion.
    if isinstance(subject, dict):
        return subject.get('name') or str(subject)
    return subject


def process_crossref_matches(harvard_row: pd.Series, matched_data: List[Dict]) -> Dict:
    """Merge all CrossRef records matched to one Harvard row into one summary.

    Args:
        harvard_row: The Harvard row being matched. It is not read here and is
            kept only so the call signature stays unchanged.
        matched_data: Entries shaped like
            ``{'data': <parsed JSON>, 'filename': <str>}``.

    Returns:
        A dictionary with the ``crossref_*`` output fields. List fields hold
        de-duplicated values in sorted order, so results (and in particular
        the first title/publisher) are reproducible between runs; DOI counts
        are collected as strings. ``crossref_start_year`` and
        ``crossref_end_year`` are the overall min/max year, or ``None`` when
        no record carries a year breakdown.
    """
    publishers = set()
    titles = set()
    current_dois = set()
    backfile_dois = set()
    total_dois = set()
    subjects = set()
    all_issns = set()
    filenames = set()
    start_years = []
    end_years = []

    for match in matched_data:
        # Cached files without a 'message' object are tolerated when loading,
        # so treat a missing message as an empty record instead of raising.
        data = match['data'].get('message') or {}

        publishers.add(data.get('publisher', ''))
        titles.add(data.get('title', ''))

        counts = data.get('counts') or {}
        current_dois.add(str(counts.get('current-dois', 0)))
        backfile_dois.add(str(counts.get('backfile-dois', 0)))
        total_dois.add(str(counts.get('total-dois', 0)))

        # Add each subject individually to the set
        subjects.update(_subject_label(s) for s in data.get('subjects') or [])
        all_issns.update(data.get('ISSN') or [])
        filenames.add(match['filename'])

        breakdown = (data.get('breakdowns') or {}).get('dois-by-issued-year', [])
        start_year, end_year = extract_year_range(breakdown)
        if start_year:
            start_years.append(start_year)
        if end_year:
            end_years.append(end_year)

    # Remove empty strings from sets
    publishers.discard('')
    titles.discard('')
    subjects.discard('')

    def ordered(values):
        # key=str keeps sorting safe if a JSON null (None) slipped into a set
        return sorted(values, key=str)

    return {
        'crossref_publishers': ordered(publishers),
        'crossref_titles': ordered(titles),
        'crossref_current_dois': ordered(current_dois),
        'crossref_backfile_dois': ordered(backfile_dois),
        'crossref_total_dois': ordered(total_dois),
        'crossref_subjects': ordered(subjects),
        'crossref_issns': ordered(all_issns),
        'crossref_filenames': ordered(filenames),
        'crossref_start_year': min(start_years) if start_years else None,
        'crossref_end_year': max(end_years) if end_years else None
    }



## Do the Work

In [10]:
def _empty_crossref_info() -> Dict:
    """Return the CrossRef output fields for a row without any match."""
    return {
        'crossref_publishers': [],
        'crossref_titles': [],
        'crossref_current_dois': [],
        'crossref_backfile_dois': [],
        'crossref_total_dois': [],
        'crossref_subjects': [],
        'crossref_issns': [],
        'crossref_filenames': [],
        'crossref_start_year': None,
        'crossref_end_year': None
    }


def _save_checkpoint(results: List[Dict], checkpoint_file: Path) -> None:
    """Write all rows accumulated so far to ``checkpoint_file`` as Excel."""
    checkpoint_df = pd.DataFrame(results)

    # Lists cannot be stored in Excel cells. Every row of every object column
    # is checked, because a column such as 'format' holds lists only in the
    # CrossRef-only rows that come after the Harvard rows.
    for col in checkpoint_df.columns:
        column = checkpoint_df[col]
        if column.dtype != object:
            continue
        if column.map(lambda value: isinstance(value, list)).any():
            checkpoint_df[col] = column.map(
                lambda value: str(value) if isinstance(value, list) else value
            )

    checkpoint_df.to_excel(checkpoint_file, index=False)


def process_data(harvard_df: pd.DataFrame, crossref_data: Dict[str, List[Dict]], 
                checkpoint_dir: Path, batch_size: int = 1000) -> pd.DataFrame:
    """Match Harvard rows against CrossRef data and append unmatched CrossRef records.

    The output holds one row per Harvard record (with empty ``crossref_*``
    fields when nothing matched), followed by one row per CrossRef file that
    was not matched to any Harvard record. A CrossRef file is emitted at most
    once, even when it is indexed under several ISSNs.

    A checkpoint workbook with *all* rows accumulated so far is written after
    every batch, so checkpoint files grow as processing advances.

    Args:
        harvard_df: Harvard journal records.
        crossref_data: Mapping of normalized ISSN to CrossRef entries shaped
            like ``{'data': <parsed JSON>, 'filename': <str>}``.
        checkpoint_dir: Directory that receives the checkpoint workbooks.
        batch_size: Number of Harvard rows per batch; must be at least 1.

    Returns:
        The combined DataFrame (list-valued cells are kept as Python lists),
        or ``None`` when there are no rows at all.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently skip every
        # Harvard row.
        raise ValueError("batch_size must be at least 1")

    logger = logging.getLogger()
    results = []
    total_rows = len(harvard_df)
    processed_crossref_issns = set()
    # File names of CrossRef records that already appear in the output
    emitted_filenames = set()

    # First, process all Harvard records
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        batch = harvard_df.iloc[start_idx:end_idx]

        logger.info(f"Processing Harvard batch {start_idx} to {end_idx} of {total_rows}")

        for _, row in batch.iterrows():
            # extract_issns_from_row already returns normalized ISSNs, so
            # they can be looked up directly. Sorting makes the merge order
            # reproducible.
            row_issns = sorted(extract_issns_from_row(row))
            matched_issns = [issn for issn in row_issns if issn in crossref_data]

            matched_data = []
            for issn in matched_issns:
                matched_data.extend(crossref_data[issn])
            processed_crossref_issns.update(matched_issns)

            if matched_data:
                emitted_filenames.update(match['filename'] for match in matched_data)
                crossref_info = process_crossref_matches(row, matched_data)
            else:
                # Include Harvard record with empty CrossRef fields
                crossref_info = _empty_crossref_info()
            results.append({**row.to_dict(), **crossref_info})

        # Save checkpoint after each batch
        if results:
            checkpoint_file = checkpoint_dir / f"harvard_crossref_match_checkpoint_{end_idx}.xlsx"
            _save_checkpoint(results, checkpoint_file)
            logger.info(f"Saved checkpoint file: {checkpoint_file}")

    # Now process remaining CrossRef records that weren't matched to any Harvard record
    logger.info(f"Processing unmatched CrossRef records")
    unmatched_list = sorted(set(crossref_data.keys()) - processed_crossref_issns)
    logger.info(f"Found {len(unmatched_list)} unmatched CrossRef ISSNs")

    unmatched_batch_size = 5000  # Can be adjusted based on memory constraints

    # Taken from the input frame rather than from results[0], so CrossRef-only
    # rows are still produced when there are no Harvard rows.
    harvard_columns = [col for col in harvard_df.columns
                       if not str(col).startswith('crossref_')]

    for start_idx in range(0, len(unmatched_list), unmatched_batch_size):
        end_idx = min(start_idx + unmatched_batch_size, len(unmatched_list))

        logger.info(f"Processing unmatched CrossRef batch {start_idx} to {end_idx} of {len(unmatched_list)}")

        for issn in unmatched_list[start_idx:end_idx]:
            # Skip files already in the output: a file matched through one of
            # its ISSNs is also indexed under its other ISSNs and must not be
            # reported again as "unmatched".
            pending = [match for match in crossref_data[issn]
                       if match['filename'] not in emitted_filenames]
            if not pending:
                continue
            emitted_filenames.update(match['filename'] for match in pending)

            crossref_info = process_crossref_matches(pd.Series(dtype=object), pending)

            # Empty Harvard fields, then fill in what CrossRef can provide
            new_row = {col: None for col in harvard_columns}
            new_row['issn'] = issn
            if crossref_info['crossref_titles']:
                new_row['title'] = crossref_info['crossref_titles'][0]
            if crossref_info['crossref_publishers']:
                new_row['publisher'] = crossref_info['crossref_publishers'][0]
            new_row['start_year'] = crossref_info['crossref_start_year']
            new_row['end_year'] = crossref_info['crossref_end_year']

            # Add "online" format
            new_row['format'] = ['online']

            results.append({**new_row, **crossref_info})

        # Save checkpoint after each batch of unmatched CrossRef records
        if results:
            checkpoint_file = checkpoint_dir / f"harvard_crossref_match_checkpoint_unmatched_{end_idx}.xlsx"
            _save_checkpoint(results, checkpoint_file)
            logger.info(f"Saved unmatched CrossRef checkpoint file: {checkpoint_file}")

    # Return the final DataFrame
    if results:
        return pd.DataFrame(results)
    return None


## Main

In [11]:
def main():
    """Run the full Harvard/CrossRef matching analysis.

    Loads both data sources, writes the CrossRef diagnostic workbook, matches
    the records, logs match statistics and saves the final workbook into the
    checkpoint directory. Any exception is logged with its traceback rather
    than propagated.
    """
    logger = setup_logging()
    checkpoint_dir = create_checkpoint_dir()

    try:
        # Load Harvard data
        harvard_df = load_harvard_data(HARVARD_MD_FILE)
        logger.info(f"Loaded {len(harvard_df)} Harvard records")

        # Load CrossRef data
        crossref_data = load_crossref_data(CROSSREF_MD_DIR)
        logger.info(f"Loaded CrossRef data with {len(crossref_data)} unique ISSNs")

        # Log some sample matches for debugging. This is only a preview, so a
        # missing 'issn' column must not abort the analysis.
        if 'issn' in harvard_df.columns:
            sample_issns = list(harvard_df['issn'].dropna().head())
            logger.info("Checking sample ISSNs for matches:")
            for issn in sample_issns:
                normalized = normalize_issn(str(issn))
                if normalized in crossref_data:
                    logger.info(f"Found match for ISSN {issn} (normalized: {normalized})")
                else:
                    logger.info(f"No match for ISSN {issn} (normalized: {normalized})")

        # Save diagnostic data
        save_crossref_diagnostic(crossref_data, checkpoint_dir)

        # Process data with checkpoints
        final_df = process_data(harvard_df, crossref_data, checkpoint_dir)

        if final_df is not None and not final_df.empty:
            # Render list cells as strings: Excel cannot store lists, and the
            # "> 2" length test below relies on an empty list being '[]'.
            for col in final_df.columns:
                if final_df[col].dtype == object:
                    final_df[col] = final_df[col].map(
                        lambda value: str(value) if isinstance(value, list) else value
                    )

            # process_data emits one row per Harvard record first, followed by
            # the CrossRef-only rows, so the split is positional. The title
            # column cannot be used: CrossRef-only rows get a title filled in.
            harvard_count = min(len(harvard_df), len(final_df))
            crossref_only_count = len(final_df) - harvard_count
            harvard_rows = final_df.iloc[:harvard_count]
            match_count = int(
                (harvard_rows['crossref_filenames'].astype(str).str.len() > 2).sum()
            )  # > 2 because empty list is '[]'

            logger.info(f"Found {match_count} records with CrossRef matches out of {harvard_count} Harvard records")
            logger.info(f"Added {crossref_only_count} CrossRef-only records (not in Harvard data)")
            logger.info(f"Total records in output: {len(final_df)}")

            # Save final output
            final_output = checkpoint_dir / f"{OUTPUT_MD_FILE_ROOT}.xlsx"
            final_df.to_excel(final_output, index=False)
            logger.info(f"Analysis complete. Final output saved to: {final_output}")
        else:
            logger.warning("No matches found in the data - this is unexpected if checkpoint files exist")
            logger.info("Please check:")
            logger.info("1. The Harvard ISSN columns contain valid ISSNs")
            logger.info("2. The CrossRef data contains matching ISSNs")
            logger.info("3. The ISSN normalization is working as expected")

    except Exception as e:
        # Top-level boundary: record the full traceback in the log file
        logger.error(f"Error in main process: {str(e)}", exc_info=True)

# Run the analysis only when this module is executed as the main program,
# not when it is imported.
if __name__ == "__main__":
    main()


Loading Harvard data from Output Data/Harvard Data with LCC Subjects and only ISSN.xlsx
Loaded 303615 rows from Harvard data
Sample of Harvard ISSN data:
Row 0:
  Raw ISSNs: {'issn': '0333-6883', 'issn_l': '0333-6883'}
  Normalized ISSNs: {'03336883'}
Row 1:
  Raw ISSNs: {'issn': '0041-7750', 'issn_l': '0041-7750'}
  Normalized ISSNs: {'00417750'}
Row 2:
  Raw ISSNs: {'issn': '0334-4738'}
  Normalized ISSNs: {'03344738'}
Row 3:
  Raw ISSNs: {'issn': '0073-4217', 'issn_l': '0073-4217'}
  Normalized ISSNs: {'00734217'}
Row 4:
  Raw ISSNs: {'issn': '0792-7932'}
  Normalized ISSNs: {'07927932'}
Loaded 303615 Harvard records
Found 180222 JSON files to process
Loaded data for 180253 unique ISSNs
Loaded CrossRef data with 180253 unique ISSNs
Checking sample ISSNs for matches:
No match for ISSN 0333-6883 (normalized: 03336883)
No match for ISSN 0041-7750 (normalized: 00417750)
No match for ISSN 0334-4738 (normalized: 03344738)
No match for ISSN 0073-4217 (normalized: 00734217)
No match for ISS