# Add LCC Subject Headings to the Harvard Metadata File

## Import what's needed

In [1]:
import sys
import subprocess

def install_package(package):
    """Install ``package`` with pip into the interpreter running this notebook.

    Args:
        package: Requirement specifier handed to ``pip install``
            (for example ``'xlsxwriter'``).

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    import importlib

    # ``sys.executable -m pip`` targets the kernel's own environment rather than
    # whichever ``pip`` happens to be first on PATH.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

    # The import system caches directory listings; refresh them so a package
    # installed a moment ago can be imported without restarting the kernel.
    importlib.invalidate_caches()

# Third-party packages this notebook uses that may be missing from a fresh
# environment; each one is installed on demand if it cannot be imported.
required_packages = ['xlsxwriter']
for package in required_packages:
    try:
        # Probe availability by importing; only ImportError triggers an install.
        __import__(package)
    except ImportError:
        print(f"Installing {package}...")
        install_package(package)
        print(f"{package} installed successfully!")

import pandas as pd
import re
import os
import logging
from datetime import datetime

Installing xlsxwriter...
xlsxwriter installed successfully!


## Set up some logging

In [2]:
def setup_logging():
    """Configure the root logger with a timestamped log file and a console stream.

    Handlers already attached to the root logger are detached *and closed*
    first, so calling this repeatedly (for example when a notebook cell is
    re-run) neither duplicates output nor leaks open log files.

    Returns:
        logging.Logger: The root logger, set to DEBUG, with two handlers:

        * a file handler writing every record, in the verbose
          ``time - level - message`` format, to
          ``logs/mods_analysis_<YYYYmmdd_HHMMSS>.log`` (UTF-8 encoded);
        * a console handler showing INFO and above as bare messages.
    """
    # Create logs directory if it doesn't exist
    os.makedirs('logs', exist_ok=True)

    # Create formatters for different levels of detail
    brief_formatter = logging.Formatter('%(message)s')
    verbose_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    # One log file per call, named after the current local time
    log_filename = f"logs/mods_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

    # Clear any existing handlers. Removing alone would leave the previous
    # run's log file open, so close each handler as it is detached.
    logger = logging.getLogger()
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

    # File handler - gets everything with full detail. An explicit UTF-8
    # encoding keeps non-ASCII record text from failing on platforms whose
    # default encoding cannot represent it.
    file_handler = logging.FileHandler(log_filename, encoding='utf-8')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(verbose_formatter)

    # Console handler - gets just the important stuff briefly
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(brief_formatter)

    # Root logger passes everything through; each handler applies its own level
    logger.setLevel(logging.DEBUG)
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    return logger

## Let's take a look at some LCC codes

In [3]:
def extract_lcc_codes(lcc):
    """Split an LCC call number into its class letter and two-letter subclass.

    Args:
        lcc: The LCC string, or a missing value (``None`` / NaN).

    Returns:
        tuple: ``(lcc_class, subclass)``. ``lcc_class`` is the leading
        uppercase letter and ``subclass`` the leading two uppercase letters;
        each is ``None`` when the string does not start that way. Both are
        ``None`` for a missing value.
    """
    if pd.isna(lcc):
        return None, None

    def _leading(pattern):
        # Captured prefix when the pattern matches at the start, else None.
        found = re.match(pattern, lcc)
        return found.group(1) if found else None

    # Class = first capital letter; subclass = first two capital letters.
    return _leading(r'^([A-Z])'), _leading(r'^([A-Z]{2})')

def create_lcc_mapping(mapping_df, logger):
    """Build subject lookup tables keyed by LCC class and by LCC subclass.

    Args:
        mapping_df: DataFrame read through its 'Class', 'Subclass' and
            'Subject' columns, one mapping entry per row.
        logger: Logger that receives progress (INFO) and per-entry (DEBUG)
            messages.

    Returns:
        tuple: ``(class_mapping, subclass_mapping)``, both dicts mapping a code
        to the row's 'Subject'. A row contributes to ``class_mapping`` when its
        'Class' is present, and to ``subclass_mapping`` when its 'Subclass' is
        present and exactly two characters long. When a code appears on several
        rows, the last row wins.
    """
    logger.info("Creating LCC mappings...")
    logger.debug(f"Processing {len(mapping_df)} mapping entries")

    class_mapping, subclass_mapping = {}, {}

    for _, entry in mapping_df.iterrows():
        # Single-letter class
        class_code = entry['Class']
        if pd.notna(class_code):
            subject = entry['Subject']
            class_mapping[class_code] = subject
            logger.debug(f"Added class mapping: {class_code} -> {subject}")

        # Two-letter subclass; codes of any other length are skipped
        subclass_code = entry['Subclass']
        if pd.notna(subclass_code) and len(subclass_code) == 2:
            subject = entry['Subject']
            subclass_mapping[subclass_code] = subject
            logger.debug(f"Added subclass mapping: {subclass_code} -> {subject}")

    logger.info(f"Created mappings for {len(class_mapping)} classes and {len(subclass_mapping)} subclasses")
    return class_mapping, subclass_mapping


## Read In and Process the Harvard Metadata File

In [4]:
def _excel_column_width(series, padding=1, max_width=255):
    """Return a worksheet column width (in characters) that fits ``series``.

    The width is the longer of the column header and the longest cell rendered
    as text, plus ``padding``, capped at ``max_width`` (Excel's documented
    maximum column width is 255 characters).
    """
    longest_cell = series.astype(str).str.len().max()
    if pd.isna(longest_cell):
        # No measurable cells (e.g. an empty sheet): size to the header alone
        # instead of passing NaN through as a width.
        longest_cell = 0
    return min(max(int(longest_cell), len(str(series.name))) + padding, max_width)


def process_harvard_metadata(harvard_file, mapping_file, output_file):
    """Annotate Harvard metadata with LCC subject names and save it as XLSX.

    Reads both workbooks, derives a class / subclass subject list for every
    value in the metadata's 'lcc' column, stores it in a new 'lcc_subjects'
    column and writes the result to ``output_file`` on a sheet named
    'Harvard Metadata' with columns sized to their content.

    Args:
        harvard_file: Path of the metadata workbook; it must contain an 'lcc'
            column.
        mapping_file: Path of the workbook passed to ``create_lcc_mapping``
            (read through its 'Class', 'Subclass' and 'Subject' columns).
        output_file: Destination for the XLSX output. When it is a path, its
            parent directory is created if missing.

    Returns:
        pandas.DataFrame: The metadata plus 'lcc_subjects'. Each value is the
        string form of the subject list (class subject first, then subclass
        subject), or ``''`` when nothing matched or the record failed.

    Raises:
        KeyError: If the metadata has no 'lcc' column.

    Note:
        A failure on an individual record is logged and counted rather than
        raised, so one bad value does not abort the whole run.
    """
    progress_interval = 10000  # emit a progress line every this many records

    # Set up logging
    logger = setup_logging()

    # Read the files
    logger.info(f"Reading Harvard metadata from {harvard_file}")
    harvard_df = pd.read_excel(harvard_file)
    logger.debug(f"Read {len(harvard_df)} records from Harvard metadata")

    logger.info(f"Reading LCC mapping from {mapping_file}")
    mapping_df = pd.read_excel(mapping_file)
    logger.debug(f"Read {len(mapping_df)} mapping entries")

    # Create LCC mappings
    class_mapping, subclass_mapping = create_lcc_mapping(mapping_df, logger)

    # Process each LCC code
    logger.info("Processing LCC codes...")
    subjects_list = []
    processed_count = 0  # records handled without an exception
    mapped_count = 0     # records that received at least one subject
    error_count = 0

    for index, lcc in enumerate(harvard_df['lcc']):
        try:
            lcc_class, subclass = extract_lcc_codes(lcc)

            # Build hierarchical subject list: class first, then subclass
            subjects = []
            if lcc_class and lcc_class in class_mapping:
                subjects.append(class_mapping[lcc_class])
                # Only log the class letter and mapping, not the full LCC
                logger.debug(f"Class {lcc_class}: {class_mapping[lcc_class]}")
            if subclass and subclass in subclass_mapping:
                subjects.append(subclass_mapping[subclass])
                # Only log the subclass and mapping, not the full LCC
                logger.debug(f"Subclass {subclass}: {subclass_mapping[subclass]}")
        except Exception as e:
            # Simplify error logging to avoid problematic characters
            logger.error(f"Error processing record {index}: {str(e)}")
            subjects = None
            error_count += 1
        else:
            processed_count += 1
            if subjects:
                mapped_count += 1

        # Exactly one entry per record keeps the list aligned with the frame
        subjects_list.append(subjects if subjects else None)

        if (index + 1) % progress_interval == 0:
            logger.info(f"Processed {index + 1} records...")

    # Add the new column, converting each list to its string representation
    # for Excel (empty string when there are no subjects)
    harvard_df['lcc_subjects'] = [str(s) if s else '' for s in subjects_list]

    # Save to new file as XLSX
    logger.info(f"Saving processed data to {output_file}")

    # Make sure the destination folder exists so the save cannot fail on a
    # missing directory after all the processing above has been done.
    if isinstance(output_file, (str, os.PathLike)):
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Create Excel writer object with xlsxwriter engine
    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        harvard_df.to_excel(writer, index=False, sheet_name='Harvard Metadata')

        # Auto-adjust column widths
        worksheet = writer.sheets['Harvard Metadata']
        for idx, col in enumerate(harvard_df.columns):
            worksheet.set_column(idx, idx, _excel_column_width(harvard_df[col]))

    # Log final statistics
    logger.info("Processing complete!")
    logger.info(f"Total records processed: {len(harvard_df)}")
    logger.info(f"Successful mappings: {processed_count}")
    logger.info(f"Records with at least one subject: {mapped_count}")
    logger.info(f"Errors encountered: {error_count}")

    return harvard_df

## Run the Processing

In [5]:
# Inputs: the metadata workbook and the class/subclass lookup table.
# Output: the metadata workbook with the added subjects column.
harvard_file = 'Output Data/mods_analysis_final_20250214_142225.xlsx'
mapping_file = 'lcc_mapping.xlsx'
output_file = 'Output Data/Harvard Data with LCC Subjects.xlsx'

processed_df = process_harvard_metadata(
    harvard_file=harvard_file,
    mapping_file=mapping_file,
    output_file=output_file,
)

# Show the LCC value and derived subjects for the first few records
print("\nSample results:")
for i, row in processed_df.head().iterrows():
    print(f"\nLCC: {row['lcc']}")
    print(f"Subjects: {row['lcc_subjects']}")

Reading Harvard metadata from Output Data/mods_analysis_final_20250214_142225.xlsx
Reading LCC mapping from lcc_mapping.xlsx
Creating LCC mappings...
Created mappings for 21 classes and 184 subclasses
Processing LCC codes...
Processed 10000 records...
Processed 20000 records...
Processed 30000 records...
Processed 40000 records...
Processed 50000 records...
Processed 60000 records...
Processed 70000 records...
Processed 80000 records...
Processed 90000 records...
Processed 100000 records...
Processed 110000 records...
Processed 120000 records...
Processed 130000 records...
Processed 140000 records...
Processed 150000 records...
Processed 160000 records...
Processed 170000 records...
Processed 180000 records...
Processed 190000 records...
Processed 200000 records...
Processed 210000 records...
Processed 220000 records...
Processed 230000 records...
Processed 240000 records...
Processed 250000 records...
Processed 260000 records...
Processed 270000 records...
Processed 280000 records...



Sample results:

LCC: nan
Subjects: 

LCC: nan
Subjects: 

LCC: nan
Subjects: 

LCC: nan
Subjects: 

LCC: nan
Subjects: 
