# Fetch Clean CrossRef Journal Data

In [1]:
import sys
import subprocess

def install_package(package):
    """Install *package* with pip, run through the current interpreter.

    Uses ``sys.executable -m pip`` so the package lands in the same
    environment that is executing this code. Raises
    ``subprocess.CalledProcessError`` if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Third-party modules that must be importable before the main imports run;
# any that cannot be imported are installed on the fly.
required_packages = ['backoff']
for package in required_packages:
    try:
        __import__(package)
    except ImportError:
        pass
    else:
        # Already importable; nothing to install for this one.
        continue
    print(f"Installing {package}...")
    install_package(package)
    print(f"{package} installed successfully!")

import pandas as pd
import requests
import json
import time
import os
from urllib.parse import quote
import backoff  # You'll need to pip install backoff
from pathlib import Path
import logging
from typing import Optional, Dict, List
import numpy as np

# Set up logging: configure the root logger at INFO level and create the
# module-level logger used by CrossRefClient and main() below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def _is_permanent_http_error(exc: Exception) -> bool:
    """Return True when *exc* is an HTTP client error that a retry cannot fix.

    4xx responses (other than 408 Request Timeout and 429 Too Many Requests)
    will fail the same way on every attempt, so backing off and retrying
    them only wastes time. Exceptions without a response (connection
    errors, timeouts) are always considered retryable.
    """
    # Use getattr rather than truthiness: a requests.Response is falsy for
    # any non-2xx status, which is exactly the case we want to inspect.
    response = getattr(exc, "response", None)
    status = getattr(response, "status_code", None)
    if status is None:
        return False
    return 400 <= status < 500 and status not in (408, 429)


class CrossRefClient:
    """Client for making polite, cached requests to the CrossRef journals API.

    Each successful response is stored as ``<cache_dir>/<issn>.json`` and
    reused on later calls, so an interrupted run can be resumed without
    re-downloading what was already fetched.

    Args:
        email: Contact address sent in the User-Agent header and as the
            ``mailto`` query parameter.
        cache_dir: Directory for cached responses; created if missing
            (including parent directories).
        timeout: Seconds to wait for the API before the request fails.
        request_delay: Seconds to sleep after each successful API response.
    """

    def __init__(self, email: str, cache_dir: str = "crossref_cache",
                 timeout: float = 30.0, request_delay: float = 1.0):
        self.email = email
        self.timeout = timeout
        self.request_delay = request_delay
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested cache location works on first use.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': f'JournalDataCleaner/1.0 (mailto:{email})'
        })

    def _get_cache_path(self, issn: str) -> Path:
        """Return the cache file path for *issn*.

        The ISSN is percent-encoded so that characters such as ``/`` cannot
        point outside the cache directory; ordinary ISSNs such as
        ``1234-5678`` map to the same file name as before.
        """
        return self.cache_dir / f"{quote(issn, safe='')}.json"

    def _is_cached(self, issn: str) -> bool:
        """Return True if a cache file exists for *issn*."""
        return self._get_cache_path(issn).exists()

    def _save_to_cache(self, issn: str, data: Dict) -> None:
        """Write *data* to the cache file for *issn*.

        The JSON is written to a temporary sibling file and then renamed
        into place, so an interrupted write never leaves a truncated cache
        entry behind. Raises ``OSError`` if the file cannot be written.
        """
        path = self._get_cache_path(issn)
        tmp_path = path.with_name(path.name + '.tmp')
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        tmp_path.replace(path)

    def _load_from_cache(self, issn: str) -> Optional[Dict]:
        """Return the cached response for *issn*, or None if unavailable.

        A missing file is the normal "not cached" case and is silent; an
        unreadable or corrupt file is logged and treated as not cached.
        """
        path = self._get_cache_path(issn)
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            return None
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.warning(f"Ignoring unreadable cache entry {path}: {exc}")
            return None

    @backoff.on_exception(
        backoff.expo,
        requests.exceptions.RequestException,
        max_tries=8,
        giveup=_is_permanent_http_error,
    )
    def get_journal_metadata(self, issn: str) -> Optional[Dict]:
        """Return CrossRef journal metadata for *issn*, using the cache first.

        Args:
            issn: The ISSN to look up; surrounding whitespace is ignored.

        Returns:
            The decoded JSON response, or None when *issn* is empty/NaN or
            CrossRef answers 404 for it.

        Raises:
            requests.exceptions.RequestException: if the request still fails
                after the retries, or fails with a non-retryable 4xx status.
        """
        if not issn or pd.isna(issn):
            return None

        # Clean ISSN
        issn = str(issn).strip()
        if not issn:
            return None

        # Check cache first. A corrupt entry loads as None and falls through
        # to a fresh request instead of being reported as "no metadata".
        cached = self._load_from_cache(issn)
        if cached is not None:
            return cached

        # safe='' so a stray "/" in the value cannot change the endpoint.
        url = f"https://api.crossref.org/journals/{quote(issn, safe='')}"
        params = {'mailto': self.email}

        logger.info(f"Requesting {issn} from API")
        # Without a timeout a stalled connection would block the run forever.
        response = self.session.get(url, params=params, timeout=self.timeout)

        # Handle response
        if response.status_code == 404:
            logger.warning(f"ISSN {issn} not found")
            return None

        response.raise_for_status()
        data = response.json()

        # Cache successful response. A cache failure must not discard data
        # that was fetched successfully, so it is logged rather than raised.
        try:
            self._save_to_cache(issn, data)
        except OSError as exc:
            logger.warning(f"Could not cache response for {issn}: {exc}")

        # Polite waiting
        time.sleep(self.request_delay)

        return data

def clean_issns(df: pd.DataFrame) -> List[str]:
    """Collect every distinct ISSN mentioned in *df*.

    ISSNs are read from the ``pissn`` and ``eissn`` columns (one value per
    cell) and from ``additionalIssns`` (comma-separated values per cell).
    Missing cells are skipped, surrounding whitespace is removed, and
    entries that are empty after stripping are dropped.

    Args:
        df: Frame containing ``pissn``, ``eissn`` and ``additionalIssns``
            columns.

    Returns:
        The unique ISSN strings in sorted order, so repeated runs process
        them in the same sequence.

    Raises:
        KeyError: if one of the three columns is missing.
    """
    candidates = []

    # Single-valued columns
    for column in ('pissn', 'eissn'):
        candidates.extend(df[column].dropna().unique())

    # Comma-separated column
    for cell in df['additionalIssns'].dropna():
        candidates.extend(str(cell).split(','))

    # Strip first, then drop empties: filtering before stripping would let
    # whitespace-only entries through as ''.
    cleaned = {str(candidate).strip() for candidate in candidates}
    cleaned.discard('')

    return sorted(cleaned)

def main():
    """Fetch CrossRef metadata for every ISSN in the title file.

    Reads ``CrossRef Data/titleFile.csv``, extracts the unique ISSNs, asks
    the CrossRef client for each one (errors are logged and skipped), and
    writes all non-empty responses to ``crossref_journal_metadata.json``.
    """
    # Load configuration
    contact_email = "amy.kirchhoff@ithaka.org"  # Replace with your email

    # Read CSV file
    logger.info("Reading CSV file...")
    title_frame = pd.read_csv('CrossRef Data/titleFile.csv', encoding='utf-8')

    # Clean and extract ISSNs
    logger.info("Extracting ISSNs...")
    issns = clean_issns(title_frame)
    logger.info(f"Found {len(issns)} unique ISSNs")

    # Initialize client
    client = CrossRefClient(contact_email)

    # Process each ISSN, reporting progress every 1000 items
    results = []
    for count, issn in enumerate(issns, start=1):
        if count % 1000 == 0:
            logger.info("Processed " + str(count) + " issn")
        try:
            metadata = client.get_journal_metadata(issn)
        except Exception as e:
            # One bad ISSN must not abort the whole run.
            logger.error(f"Error processing ISSN {issn}: {str(e)}")
            continue
        if metadata:
            results.append(metadata)

    # Save results
    output_file = "crossref_journal_metadata.json"
    logger.info(f"Saving {len(results)} results to {output_file}")
    with open(output_file, 'w') as f:
        json.dump(results, f, indent=2)

    logger.info("Done")

# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

INFO:__main__:Reading CSV file...
INFO:__main__:Extracting ISSNs...
INFO:__main__:Found 180223 unique ISSNs
INFO:__main__:Processed 1000 issn
INFO:__main__:Processed 2000 issn
INFO:__main__:Processed 3000 issn
INFO:__main__:Processed 4000 issn
INFO:__main__:Processed 5000 issn
INFO:__main__:Processed 6000 issn
INFO:__main__:Processed 7000 issn
INFO:__main__:Processed 8000 issn
INFO:__main__:Processed 9000 issn
INFO:__main__:Processed 10000 issn
INFO:__main__:Processed 11000 issn
INFO:__main__:Processed 12000 issn
INFO:__main__:Processed 13000 issn
INFO:__main__:Processed 14000 issn
INFO:__main__:Processed 15000 issn
INFO:__main__:Processed 16000 issn
INFO:__main__:Processed 17000 issn
INFO:__main__:Processed 18000 issn
INFO:__main__:Processed 19000 issn
INFO:__main__:Processed 20000 issn
INFO:__main__:Processed 21000 issn
INFO:__main__:Processed 22000 issn
INFO:__main__:Processed 23000 issn
INFO:__main__:Processed 24000 issn
INFO:__main__:Processed 25000 issn
INFO:__main__:Processed 26