## SCImago vs OpenAlex coverage (1999-2024) via ISSN OR Title

This notebook measures how much of `data/processed/scimagojr_communication_journal_1999_2024.csv` is covered by `data/processed/openalex_merged.parquet`. A SCImago row counts as covered when any of its ISSNs, or its normalized title, also appears in OpenAlex. The matching mirrors the logic of `scripts/03_analysis/scim_openalex_journal_coverage.py`, with no year filter applied.

Outputs (to `outputs/reports/`):
- `scim_openalex_coverage_summary_1999_2024.csv`
- `scim_openalex_unmatched_journals_1999_2024.csv`


In [None]:
# Imports and config
from __future__ import annotations
import os
import ast
import json
import subprocess
from pathlib import Path
from typing import List, Optional, Set, Tuple

import pandas as pd


def find_project_root() -> Path:
    """Locate the directory that holds the project's ``data`` folder.

    Resolution order:

    1. The git top-level directory, when ``git rev-parse --show-toplevel``
       succeeds and that directory contains a ``data`` entry.
    2. The closest directory, starting at the current working directory and
       moving upward, that contains ``data/processed``.
    3. The current working directory itself when neither lookup succeeds.

    Returns:
        The resolved root as a ``Path``.
    """
    try:
        git_top = Path(
            subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], text=True).strip()
        )
        if git_top.joinpath('data').exists():
            return git_top
    except Exception:
        # Best effort only: a missing git binary or a non-repository
        # directory simply means we fall through to the directory walk.
        pass

    here = Path.cwd()
    search_path = (here, *here.parents)
    return next(
        (folder for folder in search_path if folder.joinpath('data', 'processed').exists()),
        here,
    )

# Every input and output location is derived from the detected project root.
ROOT = find_project_root()
SCIM_CSV = ROOT.joinpath('data', 'processed', 'scimagojr_communication_journal_1999_2024.csv')
OPENALEX_PARQUET = ROOT.joinpath('data', 'processed', 'openalex_merged.parquet')

# The reports directory is created up front so the final cell can write to it.
OUT_DIR = ROOT.joinpath('outputs', 'reports')
OUT_DIR.mkdir(parents=True, exist_ok=True)

SUMMARY_CSV = OUT_DIR.joinpath('scim_openalex_coverage_summary_1999_2024.csv')
UNMATCHED_CSV = OUT_DIR.joinpath('scim_openalex_unmatched_journals_1999_2024.csv')


In [None]:
# Utilities (normalize & explode like script)

def normalize_issn(value: str) -> Optional[str]:
    """Reduce a raw ISSN-like value to a compact, upper-case key.

    The value is stringified, trimmed and upper-cased; every character that
    is not alphanumeric (hyphens, spaces, ...) is then removed.

    Args:
        value: Raw ISSN. Non-string values are passed through ``str()``.

    Returns:
        The compact key, or ``None`` when the input is ``None``, is one of the
        placeholders ``''``, ``'-'``, ``'NA'``, ``'NAN'``, or has fewer than
        7 characters left after cleaning. No upper length bound or check-digit
        validation is applied.
    """
    if value is None:
        return None
    text = str(value).strip().upper()
    if text in ('', '-', 'NA', 'NAN'):
        return None
    compact = ''.join(filter(str.isalnum, text))
    return compact if len(compact) >= 7 else None


def split_multi_issn(raw: str) -> List[str]:
    """Split a multi-ISSN field into distinct normalized ISSN keys.

    The separators ``;``, ``,``, ``|``, ``/`` and a single space are all
    treated as delimiters. Each token is passed through ``normalize_issn``
    and tokens it rejects are dropped.

    Args:
        raw: Raw field content. Non-string values are passed through ``str()``.

    Returns:
        Normalized ISSNs in first-seen order without duplicates; an empty list
        when ``raw`` is ``None`` or nothing usable remains.
    """
    if raw is None:
        return []
    # Map every alternative separator onto ';' in a single pass.
    unified = str(raw).translate(str.maketrans(',|/ ', ';;;;'))
    tokens = (piece.strip() for piece in unified.split(';'))
    normalized = (normalize_issn(token) for token in tokens if token)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(issn for issn in normalized if issn))


def normalize_title(value: str) -> Optional[str]:
    """Reduce a journal title to a lower-case, alphanumeric-only match key.

    The value is stringified, trimmed and lower-cased, then every character
    that is not alphanumeric (spaces, punctuation, ...) is removed.

    Args:
        value: Raw title. Non-string values are passed through ``str()``.

    Returns:
        The match key, or ``None`` when the input is missing (``None``, float
        NaN, ``pd.NA``, ``pd.NaT``) or contains no alphanumeric characters.
    """
    # Missing scalars must be rejected before str(): otherwise NaN would be
    # turned into the literal key 'nan' and could match a real title.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return None
    s = str(value).strip().lower()
    if not s:
        return None
    s = ''.join(ch for ch in s if ch.isalnum())
    return s or None


def detect_openalex_issn_columns(df_head: pd.DataFrame) -> List[str]:
    """Pick the columns of an OpenAlex frame that hold ISSN values.

    Args:
        df_head: Any frame exposing the OpenAlex column names; only
            ``df_head.columns`` is inspected.

    Returns:
        The known ISSN column names that are present, in the priority order
        listed below. When none of them is present, every column whose
        lower-cased name contains ``'issn'`` is returned instead (frame order).
    """
    known_names = (
        'host_venue.issn', 'host_venue.issn_l', 'host_venue_issn', 'host_venue_issn_l',
        'primary_location.source.issn', 'primary_location.source.issn_l',
        'primary_location_source_issn', 'primary_location_source_issn_l',
        'issn', 'issn_l', 'journal_issn', 'journal_issn_l',
        'issn_list', 'host_venue.issn_list', 'primary_location.source.issn_list',
    )
    available = df_head.columns
    matches = [name for name in known_names if name in available]
    if matches:
        return matches
    # No well-known name found: fall back to a substring search.
    return [name for name in available if 'issn' in name.lower()]


def detect_openalex_title_columns(df_head: pd.DataFrame) -> List[str]:
    """Pick the columns of an OpenAlex frame that hold venue/journal titles.

    Args:
        df_head: Any frame exposing the OpenAlex column names; only
            ``df_head.columns`` is inspected.

    Returns:
        The known title column names that are present, in the priority order
        listed below. When none is present, a heuristic selects every column
        whose lower-cased name contains ``'display_name'``, ``'title'`` or
        ``'venue'``, or ends with ``'.name'``. Duplicates are removed while
        keeping first-seen order.
    """
    known_names = (
        'host_venue.display_name', 'host_venue.title', 'host_venue_name',
        'primary_location.source.display_name', 'primary_location.source.title',
        'primary_location_source_display_name', 'primary_location_source_title',
        'source_display_name', 'journal_title', 'venue', 'venue_name', 'source_title', 'publication_name',
    )
    matches = [name for name in known_names if name in df_head.columns]
    if not matches:
        # NOTE(review): this heuristic is broad -- a plain 'title' or
        # 'display_name' column would also be selected, and such a column may
        # not hold journal names. Confirm against the parquet schema.
        fragments = ('display_name', 'title', 'venue')
        for column in df_head.columns:
            lowered = column.lower()
            if lowered.endswith('.name') or any(part in lowered for part in fragments):
                matches.append(column)
    return list(dict.fromkeys(matches))


def explode_issn_columns(df: pd.DataFrame, issn_columns: List[str]) -> pd.DataFrame:
    """Return one output row per (input row, distinct normalized ISSN).

    A cell may hold a single value, a container of values (list, tuple, set,
    NumPy array, ...) or the string form of a Python list such as
    ``"['1234-5678', '8765-4321']"``. Every candidate is passed through
    ``normalize_issn``; rejected candidates are dropped and the ISSNs of a
    row are de-duplicated across all inspected columns.

    Args:
        df: Source frame.
        issn_columns: Columns to inspect; names missing from ``df`` are ignored.

    Returns:
        A frame with the columns of ``df`` plus ``_issn_norm``. Rows without
        any usable ISSN are omitted. When none of ``issn_columns`` exists in
        ``df`` the result is empty.
    """
    present = [c for c in issn_columns if c in df.columns]
    if not present:
        return df.assign(_issn_norm=pd.Series(dtype=object)).loc[[]]

    def cell_values(val) -> list:
        """Expand one cell into the raw candidate values it contains."""
        # Containers have to be recognised BEFORE calling pd.isna(): on a
        # list/array pd.isna() returns an element-wise array whose truth
        # value is ambiguous and raises for more than one element.
        if pd.api.types.is_list_like(val) and not isinstance(val, dict):
            return [
                v for v in val
                if not (pd.api.types.is_scalar(v) and pd.isna(v))
            ]
        if pd.isna(val):
            return []
        if isinstance(val, str) and val.startswith('[') and val.endswith(']'):
            try:
                parsed = ast.literal_eval(val)
            except Exception:
                # Not a Python literal: treat the text as a single value.
                parsed = None
            if isinstance(parsed, (list, tuple)):
                return list(parsed)
        return [val]

    def row_issns(values) -> List[str]:
        """Collect the distinct normalized ISSNs of one row, in first-seen order."""
        collected: List[str] = []
        for val in values:
            for candidate in cell_values(val):
                n = normalize_issn(candidate)
                if n:
                    collected.append(n)
        return list(dict.fromkeys(collected))

    # Iterate the selected columns in lockstep instead of df.apply(axis=1):
    # values keep their per-column dtype and an empty frame needs no special case.
    per_row = [row_issns(values) for values in zip(*(df[c] for c in present))]
    exploded = df.assign(_issn_list=pd.Series(per_row, index=df.index, dtype=object))
    exploded = exploded.explode('_issn_list').rename(columns={'_issn_list': '_issn_norm'})
    # Rows whose list was empty explode to NaN and are dropped here.
    return exploded.dropna(subset=['_issn_norm'])


def explode_title_columns(df: pd.DataFrame, title_columns: List[str]) -> pd.DataFrame:
    """Return one output row per (input row, distinct normalized title).

    A cell may hold a single value, a container of values (list, tuple, set,
    NumPy array, ...) or the string form of a Python list. Every candidate is
    passed through ``normalize_title``; rejected candidates are dropped and
    the titles of a row are de-duplicated across all inspected columns.

    Args:
        df: Source frame.
        title_columns: Columns to inspect; names missing from ``df`` are ignored.

    Returns:
        A frame with the columns of ``df`` plus ``_title_norm``. Rows without
        any usable title are omitted. When none of ``title_columns`` exists
        in ``df`` the result is empty.
    """
    present = [c for c in title_columns if c in df.columns]
    if not present:
        return df.assign(_title_norm=pd.Series(dtype=object)).loc[[]]

    def cell_values(val) -> list:
        """Expand one cell into the raw candidate values it contains."""
        # Containers have to be recognised BEFORE calling pd.isna(): on a
        # list/array pd.isna() returns an element-wise array whose truth
        # value is ambiguous and raises for more than one element.
        if pd.api.types.is_list_like(val) and not isinstance(val, dict):
            # Missing elements are skipped so they cannot be stringified
            # into a bogus title key.
            return [
                v for v in val
                if not (pd.api.types.is_scalar(v) and pd.isna(v))
            ]
        if pd.isna(val):
            return []
        if isinstance(val, str) and val.startswith('[') and val.endswith(']'):
            try:
                parsed = ast.literal_eval(val)
            except Exception:
                # Not a Python literal (e.g. a title written in brackets):
                # treat the text as a single value.
                parsed = None
            if isinstance(parsed, (list, tuple)):
                return list(parsed)
        return [val]

    def row_titles(values) -> List[str]:
        """Collect the distinct normalized titles of one row, in first-seen order."""
        collected: List[str] = []
        for val in values:
            for candidate in cell_values(val):
                n = normalize_title(candidate)
                if n:
                    collected.append(n)
        return list(dict.fromkeys(collected))

    # Iterate the selected columns in lockstep instead of df.apply(axis=1):
    # values keep their per-column dtype and an empty frame needs no special case.
    per_row = [row_titles(values) for values in zip(*(df[c] for c in present))]
    exploded = df.assign(_title_list=pd.Series(per_row, index=df.index, dtype=object))
    exploded = exploded.explode('_title_list').rename(columns={'_title_list': '_title_norm'})
    # Rows whose list was empty explode to NaN and are dropped here.
    return exploded.dropna(subset=['_title_norm'])


In [None]:
# Load SCImago merged CSV and prepare fields
# Everything is read as text so ISSNs keep their leading zeros.
scim = pd.read_csv(SCIM_CSV, dtype=str, low_memory=False)

# Case-insensitive lookup: lower-cased header -> actual header in the CSV.
cols_lc = {name.lower(): name for name in scim.columns}
issn_col, title_col, sourceid_col, type_col = (
    cols_lc.get(key) for key in ('issn', 'title', 'sourceid', 'type')
)

if issn_col is None:
    # No exact 'issn' header: accept the first header that mentions it.
    issn_col = next((name for name in scim.columns if 'issn' in name.lower()), None)
if issn_col is None:
    raise RuntimeError('Could not detect ISSN column in SCImago CSV')

# Matching keys: the list of normalized ISSNs and, when a title column
# exists, the normalized title.
scim['_issn_list'] = scim[issn_col].apply(split_multi_issn)
if title_col and title_col in scim.columns:
    scim['_title_norm_scim'] = scim[title_col].apply(normalize_title)

print('SCImago rows:', len(scim))
print('Columns resolved:', {'issn': issn_col, 'title': title_col, 'sourceid': sourceid_col, 'type': type_col})


SCImago rows: 786
Columns resolved: {'issn': 'Issn', 'title': 'Title', 'sourceid': 'Sourceid', 'type': 'Type'}


In [None]:
# Load OpenAlex minimally and build sets
# Probe the column names, then re-read only the columns needed for matching.
# NOTE(review): this probe appears to load the whole parquet before .head(5)
# trims it; a schema-only read (e.g. via pyarrow) would avoid that, but would
# add a direct pyarrow import -- confirm before changing.
head = pd.read_parquet(OPENALEX_PARQUET, columns=None, engine='pyarrow').head(5)
issn_cols = detect_openalex_issn_columns(head)
title_cols = detect_openalex_title_columns(head)

cols_to_read = list(dict.fromkeys(issn_cols + title_cols))
if not cols_to_read:
    raise RuntimeError('No ISSN or Title columns found in OpenAlex parquet')

oa = pd.read_parquet(
    OPENALEX_PARQUET,
    columns=[name for name in cols_to_read if name in head.columns],
    engine='pyarrow',
)

# One row per normalized ISSN / title; an empty frame when no column was detected.
if issn_cols:
    issn_exploded = explode_issn_columns(oa, issn_cols)
else:
    issn_exploded = oa.assign(_issn_norm=pd.Series(dtype=object)).loc[[]]

if title_cols:
    title_exploded = explode_title_columns(oa, title_cols)
else:
    title_exploded = oa.assign(_title_norm=pd.Series(dtype=object)).loc[[]]

# Lookup sets used by the coverage check.
openalex_issn_set: Set[str] = (
    set() if issn_exploded.empty else set(issn_exploded['_issn_norm'].astype(str).tolist())
)
openalex_title_set: Set[str] = (
    set() if title_exploded.empty else set(title_exploded['_title_norm'].astype(str).tolist())
)

print('OpenAlex columns:', {'issn_cols': issn_cols, 'title_cols': title_cols})
print('OpenAlex ISSN values:', len(openalex_issn_set), 'Title values:', len(openalex_title_set))


OpenAlex columns: {'issn_cols': ['primary_location.source.issn', 'primary_location.source.issn_l'], 'title_cols': ['primary_location.source.display_name']}
OpenAlex ISSN values: 49094 Title values: 42927


In [None]:
# Compute coverage

def covered_row(row) -> bool:
    """Tell whether one SCImago row is matched in OpenAlex.

    A row is covered when any entry of its ``_issn_list`` is in
    ``openalex_issn_set``, or when its ``_title_norm_scim`` is in
    ``openalex_title_set``. Both fields are read with ``row.get`` so a row
    lacking either of them is simply not matched on that criterion.

    Args:
        row: A mapping-like row (e.g. a ``pd.Series``) of the SCImago frame.

    Returns:
        ``True`` when the row matches by ISSN or by title, else ``False``.
    """
    issn_keys = row.get('_issn_list', []) or []
    if any(key in openalex_issn_set for key in issn_keys):
        return True
    title_key = row.get('_title_norm_scim')
    return bool(title_key and title_key in openalex_title_set)

# Flag every SCImago row on a copy so the loaded frame is left untouched.
work = scim.copy()
work['_is_covered'] = work.apply(covered_row, axis=1)

total = int(len(work))
covered_cnt = int(work['_is_covered'].sum())
unmatched_cnt = int(total - covered_cnt)
# Guard against an empty input file.
rate = (covered_cnt / total) if total > 0 else 0.0

summary_df = pd.DataFrame([{
    'total_journals': total,
    'covered_journals': covered_cnt,
    'unmatched_journals': unmatched_cnt,
    'coverage_rate': round(rate, 6),
}])

unmatched = work[~work['_is_covered']].copy()
unmatched['issn_parsed'] = unmatched['_issn_list'].apply(lambda xs: ','.join(xs) if isinstance(xs, list) else '')

# Report columns: use the header names resolved when the CSV was loaded
# rather than hard-coded spellings, so differently-cased headers
# (e.g. 'title' instead of 'Title') still reach the unmatched report.
rep_cols = []
for c in [title_col, sourceid_col, type_col, issn_col, 'issn_parsed']:
    if c is not None and c in unmatched.columns and c not in rep_cols:
        rep_cols.append(c)
if rep_cols:
    unmatched_out = unmatched[rep_cols]
else:
    unmatched_out = unmatched

summary_df.to_csv(SUMMARY_CSV, index=False)
unmatched_out.to_csv(UNMATCHED_CSV, index=False)

print('Saved:', SUMMARY_CSV)
print('Saved:', UNMATCHED_CSV)


Saved: /Users/yann.jy/InvisibleResearch/outputs/reports/scim_openalex_coverage_summary_1999_2024.csv
Saved: /Users/yann.jy/InvisibleResearch/outputs/reports/scim_openalex_unmatched_journals_1999_2024.csv
