# JOCS

## OpenAlex

using DBLP metadata, targeting DOI and Title at fallback

In [None]:
import json
import os
import time
from pathlib import Path
import pandas as pd
import requests
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from dotenv import load_dotenv
load_dotenv()


DBLP_DIR = Path(os.environ["DBLP_DIR"])
OPENALEX_DIR = Path(os.environ["OPENALEX_DIR"])
IN_PATH = DBLP_DIR/"interim"/"jocs"/"jocs_dblp_dois.jsonl"
OUT_RAW_JSONL = OPENALEX_DIR/"raw"/"jocs"/"JoCS_openalex_works.jsonl"
# OUT_CORE_PARQUET = OPENALEX_DIR/"raw"/"jocs"/"JoCS_openalex_works.parquet"

OPENALEX_BASE = os.getenv("OA_BASE")
USER_AGENT = os.getenv("USER_OA")

BATCH_SIZE = 50
SLEEP_SEC = 1.0

session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))

def normalize_doi(doi):
    if not doi:
        return None
    doi = str(doi).strip().lower()
    for prefix in ("https://doi.org/", "http://doi.org/", "doi:"):
        if doi.startswith(prefix):
            doi = doi[len(prefix):]
    return doi

def openalex_get(url, params=None):
    headers = {"User-Agent": USER_AGENT}
    try:
        r = session.get(url, headers=headers, params=params, timeout=60)
        if r.status_code == 404:
            return None
        r.raise_for_status()
        return r.json()
    except Exception as e:
        print(f"Request error: {e}")
        return None

rows = []
with open(IN_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        data = json.loads(line)
        rows.append({
            "dblp_id": data.get("key"),
            "doi": data.get("doi"),
            "title": data.get("title"),
            "year": data.get("year"),
            "volume": data.get("volume")
        })

df = pd.DataFrame(rows)
df["year_int"] = pd.to_numeric(df["year"], errors='coerce')
df = df[df['year_int'].between(2010, 2026)].copy()
df["normalized_doi"] = df["doi"].apply(normalize_doi)

doi_to_indices = {}
for idx, doi in df["normalized_doi"].items():
    if doi:
        doi_to_indices.setdefault(doi, []).append(idx)

all_dois = list(doi_to_indices.keys())
work_by_doi = {}

print(f"Querying {len(all_dois)} DOIs...")
for i in tqdm(range(0, len(all_dois), BATCH_SIZE)):
    batch = all_dois[i:i+BATCH_SIZE]
    filt = "doi:" + "|".join(batch)
    params = {"filter": filt, "per-page": 200}
    
    data = openalex_get(f"{OPENALEX_BASE}/works", params=params)
    if data:
        for w in data.get("results", []):
            nd = normalize_doi(w.get("doi"))
            if nd:
                work_by_doi[nd] = w
    time.sleep(SLEEP_SEC)
    
# Fallback
def search_by_title(title, year):
    if not title:
        return None
    params = {"search": title.strip(), "per-page": 5}
    if year and not pd.isna(year):
        params["filter"] = f"from_publication_date:{int(year)}-01-01,to_publication_date:{int(year)}-12-31"
    
    res = openalex_get(f"{OPENALEX_BASE}/works", params=params)
    if res and res.get("results"):
        return res.get("results")[0]
    return None

missing = list(set(all_dois) - set(work_by_doi.keys()))
print(f"Falling back to title search for {len(missing)} items...")
for doi in tqdm(missing):
    idx = doi_to_indices[doi][0]
    row = df.loc[idx]
    w = search_by_title(row["title"], row["year_int"])
    if w:
        work_by_doi[doi] = w
    time.sleep(0.5)

OUT_RAW_JSONL.parent.mkdir(parents=True, exist_ok=True)
with open(OUT_RAW_JSONL, "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        nd = row["normalized_doi"]
        item = {
            "dblp_id": row["dblp_id"],
            "dblp_title": row["title"],
            "dblp_year": row["year"],
            "dblp_doi": row["doi"],
            "normalized_doi": nd,
            "openalex_work": work_by_doi.get(nd),
        }
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

core_rows = []
with open(OUT_RAW_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        w = rec.get("openalex_work")
        if not w:
            continue

        loc = w.get("primary_location") or {}
        src = loc.get("source") or {}

        core_rows.append({
            "dblp_id": rec["dblp_id"],
            "dblp_title": rec["dblp_title"],
            "dblp_year": rec["dblp_year"],
            "work_id": w.get("id"),
            "display_name": w.get("display_name"),
            "cited_by_count": w.get("cited_by_count"),
            "is_oa": (w.get("open_access") or {}).get("is_oa"),
            "source_name": src.get("display_name"),
            "topics": json.dumps(w.get("topics", []), ensure_ascii=False)
        })

if core_rows:
    # pd.DataFrame(core_rows).to_parquet(OUT_CORE_PARQUET, index=False)
    print(f"Finished. Saved {len(core_rows)} records.")

Querying 2106 DOIs...


  0%|          | 0/43 [00:00<?, ?it/s]

Request error: HTTPSConnectionPool(host='api.openalex.org', port=443): Read timed out.


100%|██████████| 43/43 [02:08<00:00,  2.99s/it]


Falling back to title search for 75 items...


100%|██████████| 75/75 [01:24<00:00,  1.12s/it]


Finished. Saved 2092 records.


## Filling missing author's Affiliation Data 

using GROBID Parsed Data

In [None]:
import json
import os
import re
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()

OPENALEX_DIR = Path(os.environ["OPENALEX_DIR"])

OPENALEX_JSONL = OPENALEX_DIR /"raw"/"jocs"/"JoCS_openalex_works.jsonl"
GROBID_DIR = Path(os.getenv("JOCS_PARSED_GROBID"))
OUTPUT_FILE = Path(os.environ["JOCS_ENRICHED_OA_GROBID_DIR"]) / "JoCS_openalex_enriched.jsonl"

def normalize_name(name):
    if not name: return ""
    name = name.replace(".", " ").lower()
    return re.sub(r'\s+', ' ', name).strip()

def extract_doi_from_filename(filename):
    """
    Filename: '2023_10.1016_j.jocs.2023.102166.json'
    DOI: '10.1016/j.jocs.2023.102166'
    """
    # Regex for DOI pattern
    match = re.search(r'10\.\d{4,}_.*(?=\.json)', filename)
    if match:
        doi_part = match.group(0)
        # Pehle underscore ko slash mein badlein
        return doi_part.replace("_", "/", 1).lower().strip()
    return None

# Load GROBID data
grobid_lookup = {}
print("Loading GROBID JSON files...")
for fpath in list(GROBID_DIR.glob("*.json")):
    doi = extract_doi_from_filename(fpath.name)
    if doi:
        with open(fpath, 'r', encoding='utf-8') as f:
            grobid_lookup[doi] = json.load(f)

print(f"Loaded {len(grobid_lookup)} DOIs from GROBID.")

# Process OpenAlex
merged_results = []
authors_filled = 0
doi_matches = 0
total_papers = 0 

print("Merging data...")
with open(OPENALEX_JSONL, 'r', encoding='utf-8') as f_in:
    for line in f_in:
        record = json.loads(line)
        total_papers += 1
        doi = record.get("normalized_doi", "").lower().strip()
        oa_work = record.get("openalex_work")

        # DOI Match Check
        if oa_work and doi in grobid_lookup:
            doi_matches += 1
            grobid_data = grobid_lookup[doi]
            authorships = oa_work.get("authorships", [])

            for auth_obj in authorships:
                # Initialize for matching papers
                auth_obj["raw_pdf_affiliation"] = None

                # Gap check
                has_oa = any([
                    len(auth_obj.get("institutions", [])) > 0,
                    len(auth_obj.get("raw_affiliation_strings", [])) > 0,
                    len(auth_obj.get("affiliations", [])) > 0
                ])

                if has_oa: continue 

                # Name Matching
                oa_disp = normalize_name(auth_obj.get("author", {}).get("display_name", ""))
                oa_raw = normalize_name(auth_obj.get("raw_author_name", ""))
                
                for g_auth in grobid_data.get("authors", []):
                    g_name = normalize_name(g_auth.get("name", ""))
                    if (oa_disp == g_name or oa_raw == g_name or oa_disp in g_name or g_name in oa_disp):
                        affils = g_auth.get("affiliations", [])
                        auth_obj["raw_pdf_affiliation"] = " ; ".join(affils) if affils else "Empty in PDF"
                        authors_filled += 1
                        break
        
        merged_results.append(record)

OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f_out:
    for res in merged_results:
        f_out.write(json.dumps(res, ensure_ascii=False) + "\n")

print(f"DOI Matches found: {doi_matches}/{total_papers}")
print(f"Gaps filled in 'raw_pdf_affiliation': {authors_filled}")

Loading GROBID JSON files...
Loaded 2095 DOIs from GROBID.
Merging data...
DOI Matches found: 2091/2106
Gaps filled in 'raw_pdf_affiliation': 710


## Audit

CSV of missing authors

In [None]:
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()

RAW_PATH = Path(os.getenv("JOCS_ENRICHED_OA_GROBID"))
OUTPUT_CSV = Path(r"D:\ITMO Big Data & ML School\semester 3\ri3_repo\data\data_files\openalex\interim\jocs\jocs_all_years_missing_authors.csv")

missing_data = []

if not RAW_PATH.exists():
    print(f"Error: {RAW_PATH} not found.")
else:
    total_authors = 0
    missing_authors = 0
    papers_with_missing = 0

    with open(RAW_PATH, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue

            rec = json.loads(line)
            oa_work = rec.get("openalex_work")
            if not oa_work:
                continue

            authorships = oa_work.get("authorships", [])
            missing_authors_list = []
            missing_count = 0

            for auth in authorships:
                total_authors += 1

                institutions = auth.get("institutions", []) or []
                raw_aff = auth.get("raw_affiliation_strings", []) or []
                affils = auth.get("affiliations", []) or []
                pdf_aff = auth.get("raw_pdf_affiliation")

                bad_pdf_vals = {
                    None,
                    "Not found in PDF",
                    "Affiliation listed but empty in PDF",
                    "Empty in PDF",
                    0,
                }

                has_any = (
                    len(institutions) > 0
                    or len(raw_aff) > 0
                    or len(affils) > 0
                    or (pdf_aff not in bad_pdf_vals)
                )

                if not has_any:
                    missing_authors += 1
                    missing_count += 1
                    author_name = (
                        auth.get("author", {}).get("display_name") 
                        or auth.get("raw_author_name", "Unknown")
                    )
                    missing_authors_list.append(author_name)

            if missing_count > 0:
                papers_with_missing += 1
                doi = rec.get("normalized_doi") or rec.get("dblp_doi") or "NO_DOI"
                title = rec.get("dblp_title", "No title")
                
                missing_data.append({
                    "year": rec.get("dblp_year"),
                    "doi": doi,
                    "title": title,
                    "missing_authors": "; ".join(missing_authors_list),
                    "total_authors": len(authorships),
                    "missing_count": missing_count
                })

    print("AUTHOR AFFILIATION AUDIT")
    print(f"Total authors seen:          {total_authors:,}")
    print(f"Authors with NO affiliation: {missing_authors:,}")

    df = pd.DataFrame(missing_data)
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"\nExported {len(missing_data)} papers to {OUTPUT_CSV}")
    print(f"Total missing authors: {df['missing_count'].sum()}")


AUTHOR AFFILIATION AUDIT
Total authors seen:          7,716
Authors with NO affiliation: 43

Exported 16 papers to D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\interim\jocs\jocs_all_years_missing_authors.csv
Total missing authors: 43


#### Removing 2026 papers as they do not have any openalex data

Final File

In [None]:
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
import json

load_dotenv()

CSV_FILE = Path(r"D:\ITMO Big Data & ML School\semester 3\ri3_repo\data\data_files\openalex\interim\jocs\jocs_all_years_missing_authors.csv")

ENRICHED_JSONL = Path(os.getenv("JOCS_ENRICHED_OA_GROBID"))
FINAL_JSONL = Path(os.getenv("JOCS_FINAL"))

EXCLUDE_DOIS = {"10.1016/j.jocs.2024.102462"}

df_missing = pd.read_csv(CSV_FILE)
bad_dois = set(df_missing[~df_missing["doi"].isin(EXCLUDE_DOIS)]["doi"])

complete_records = []
proceedings_count = 0
oa_null_count = 0
total_papers = 0

with open(ENRICHED_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip(): continue
            
        rec = json.loads(line)
        total_papers += 1
        doi = rec.get("normalized_doi") or rec.get("dblp_doi")
        
        # Filter proceedings
        if doi in bad_dois and doi not in EXCLUDE_DOIS:
            proceedings_count += 1
            continue
            
        # Filter OpenAlex null
        if rec.get("openalex_work") is None:
            oa_null_count += 1
            continue
            
        complete_records.append(rec)

with open(FINAL_JSONL, "w", encoding="utf-8") as f:
    for rec in complete_records:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print(f"JOCS Final Dataset: {len(complete_records)}")
print(f"Removed: {proceedings_count} proceedings, {oa_null_count} nulls")


JOCS Final Dataset: 2077
Removed: 15 proceedings, 14 nulls


Title + Keywords + abstract

Now checking Grobid Data to check the trash data

In [3]:
import os
import json
import csv
import sys
from pathlib import Path
from dotenv import load_dotenv

# Initialize environment variables
load_dotenv()

# Configuration Constants
PARSED_PATH = Path(os.getenv("JOCS_PARSED_GROBID", ""))
OUTPUT_CSV = "jocs_trash_files_only.csv"

def process_metadata_health(folder_path):
    """
    Analyzes JSON metadata completeness and exports empty records to CSV.
    
    Args:
        folder_path (Path): Path to the directory containing parsed JSON files.
    """
    if not folder_path.exists():
        print(f"Error: Directory not found at {folder_path}")
        sys.exit(1)

    # Initialize statistics for the 8 identified metadata situations
    # Logic: (Has Title, Has Keywords, Has Abstract)
    stats = {
        (True, True, True): 0,    # Sit 1: Perfect
        (False, True, True): 0,   # Sit 2: No Title
        (True, False, True): 0,   # Sit 3: No Keywords
        (True, True, False): 0,   # Sit 4: No Abstract
        (False, False, True): 0,  # Sit 5: Only Abstract
        (False, True, False): 0,  # Sit 6: Only Keywords
        (True, False, False): 0,  # Sit 7: Only Title
        (False, False, False): 0  # Sit 8: Trash
    }

    csv_records = []
    total_processed = 0

    print(f"Starting analysis in: {folder_path}\n")

    for file_path in folder_path.glob("*.json"):
        total_processed += 1
        
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, IOError) as e:
            print(f"Warning: Could not process {file_path.name} - {e}")
            continue

        # Extract and validate fields
        raw_title = data.get('title', "")
        clean_title = str(raw_title).replace('\n', ' ').strip()
        
        has_title = bool(clean_title)
        has_keywords = bool(data.get('keywords'))
        has_abstract = bool(str(data.get('abstract') or "").strip())

        # Update statistical counters
        situation_key = (has_title, has_keywords, has_abstract)
        if situation_key in stats:
            stats[situation_key] += 1

        # Collect data for Situation 8 CSV export
        if not has_title and not has_keywords and not has_abstract:
            csv_records.append({
                "filename": file_path.name,
                "extracted_title": clean_title if clean_title else "EMPTY / NOT PARSED",
                "status": "Situation 8 (Full Empty)"
            })

    # Generate Console Report Table
    print_health_report(stats, total_processed)

    # Export CSV for Situation 8
    if csv_records:
        export_to_csv(csv_records, OUTPUT_CSV)
    else:
        print("No files matching Situation 8 criteria were found for export.")

def print_health_report(stats, total):
    """Prints a formatted summary table of metadata health situations."""
    header = f"{'Sit.':<5} | {'Title':<8} | {'Keywords':<10} | {'Abstract':<10} | {'Count':<8} | {'Status'}"
    separator = "-" * len(header)
    
    print("\nMETADATA HEALTH REPORT")
    print(separator)
    print(header)
    print(separator)

    rows = [
        (1, True, True, True, "PERFECT"),
        (2, False, True, True, "FIXABLE (No Title)"),
        (3, True, False, True, "COMMON (No Keywords)"),
        (4, True, True, False, "REVIEW (No Abstract)"),
        (5, False, False, True, "BAD (Only Abstract)"),
        (6, False, True, False, "BAD (Only Keywords)"),
        (7, True, False, False, "BAD (Only Title)"),
        (8, False, False, False, "TRASH (Empty)")
    ]

    for sit, t, k, a, status in rows:
        t_marker = "[OK]" if t else "[MISSING]"
        k_marker = "[OK]" if k else "[MISSING]"
        a_marker = "[OK]" if a else "[MISSING]"
        count = stats.get((t, k, a), 0)
        print(f"{sit:<5} | {t_marker:<8} | {k_marker:<10} | {a_marker:<10} | {count:<8} | {status}")

    print(separator)
    print(f"TOTAL FILES PROCESSED: {total}\n")

def export_to_csv(records, filename):
    """Writes identified empty files to a CSV for cleanup audit."""
    keys = ["filename", "extracted_title", "status"]
    try:
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(records)
        print(f"Successfully exported {len(records)} records to {filename}")
        print(f"Full Path: {os.path.abspath(filename)}")
    except IOError as e:
        print(f"Error writing CSV: {e}")

if __name__ == "__main__":
    process_metadata_health(PARSED_PATH)

Starting analysis in: D:\ITMO Big Data & ML School\semester 3\RI3\parsed\jocs_grobid\all


METADATA HEALTH REPORT
--------------------------------------------------------------
Sit.  | Title    | Keywords   | Abstract   | Count    | Status
--------------------------------------------------------------
1     | [OK]     | [OK]       | [OK]       | 1987     | PERFECT
2     | [MISSING] | [OK]       | [OK]       | 5        | FIXABLE (No Title)
3     | [OK]     | [MISSING]  | [OK]       | 36       | COMMON (No Keywords)
4     | [OK]     | [OK]       | [MISSING]  | 0        | REVIEW (No Abstract)
5     | [MISSING] | [MISSING]  | [OK]       | 12       | BAD (Only Abstract)
6     | [MISSING] | [OK]       | [MISSING]  | 2        | BAD (Only Keywords)
7     | [OK]     | [MISSING]  | [MISSING]  | 0        | BAD (Only Title)
8     | [MISSING] | [MISSING]  | [MISSING]  | 53       | TRASH (Empty)
--------------------------------------------------------------
TOTAL FILES PROCESSED: 2095

Successfully 

Removinf the 53 Trash files form the openalex final data<br>
As these are editorial, preface or an update of a paper

In [6]:
import json
import pandas as pd
import os
import sys
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()

# Path Configuration
INPUT_CSV = "jocs_trash_files_only.csv"
INPUT_JSONL = Path(os.getenv("JOCS_FINAL"))
OUTPUT_JSONL = INPUT_JSONL.parent / "final_jocs_openalex.jsonl"

def normalize_doi_from_filename(filename):
    """
    Converts filename format to normalized DOI format.
    Example: '2010_10.1016_j.jocs.2010.04.003.json' -> '10.1016/j.jocs.2010.04.003'
    """
    # Remove file extension
    name_no_ext = os.path.splitext(filename)[0]
    
    if '_' in name_no_ext:
        # Split at the first underscore to remove the year prefix
        _, doi_part = name_no_ext.split('_', 1)
        # Replace the first underscore in the remaining string with a forward slash
        normalized_doi = doi_part.replace('_', '/', 1)
        return normalized_doi
    
    return name_no_ext

def generate_final_dataset():
    """
    Filters the input JSONL file by removing records present in the exclusion CSV.
    Writes the resulting clean data to a new JSONL file.
    """
    if not os.path.exists(INPUT_CSV):
        print(f"Error: Required exclusion file '{INPUT_CSV}' not found.")
        sys.exit(1)

    if not INPUT_JSONL.exists():
        print(f"Error: Input dataset '{INPUT_JSONL}' not found.")
        sys.exit(1)

    # Load exclusion list into a set for O(1) lookup performance
    try:
        df_trash = pd.read_csv(INPUT_CSV)
        exclusion_set = set(df_trash['filename'].apply(normalize_doi_from_filename))
        print(f"Exclusion set initialized with {len(exclusion_set)} DOIs.")
    except Exception as e:
        print(f"Error reading exclusion CSV: {e}")
        sys.exit(1)

    records_removed = 0
    records_retained = 0

    # Stream process the JSONL to handle large data volumes
    try:
        with open(INPUT_JSONL, 'r', encoding='utf-8') as f_in, \
             open(OUTPUT_JSONL, 'w', encoding='utf-8') as f_out:
            
            for line_number, line in enumerate(f_in, 1):
                try:
                    record = json.loads(line)
                    doi = record.get('normalized_doi', "").strip()
                    
                    if doi in exclusion_set:
                        records_removed += 1
                        continue
                    
                    # Write clean records back to the new file
                    f_out.write(json.dumps(record, ensure_ascii=False) + '\n')
                    records_retained += 1
                except json.JSONDecodeError:
                    print(f"Warning: Skipping malformed JSON on line {line_number}")
                    continue

        print(f"Output File:      {OUTPUT_JSONL.name}")
        print(f"Records Excluded: {records_removed}")
        print(f"Records Retained: {records_retained}")

    except Exception as e:
        print(f"An error occurred during file processing: {e}")

if __name__ == "__main__":
    generate_final_dataset()

Exclusion set initialized with 53 DOIs.
Output File:      final_jocs_openalex.jsonl
Records Excluded: 40
Records Retained: 2037


In [None]:
import json
from difflib import SequenceMatcher

def string_similarity(a, b):
    # Normalize: lowercase and strip spaces
    a = str(a).lower().strip()
    b = str(b).lower().strip()
    return SequenceMatcher(None, a, b).ratio()

input_file = r"D:\ITMO Big Data & ML School\semester 3\ri3_repo\data\data_files\openalex\processed\jocs\final_jocs_openalex.jsonl"
mismatch_count = 0
total_records = 0
mismatch_examples = []

print("Scanning for Title Mismatches...")

with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        total_records += 1
        record = json.loads(line)
        
        # 1. DBLP Title
        dblp_t = record.get('dblp_title', '')
        
        # 2. OpenAlex Title
        oa_work = record.get('openalex_work', {})
        oa_t = oa_work.get('title', '') if oa_work else ""
        
        # Similarity Score
        score = string_similarity(dblp_t, oa_t)
        
        # Agar similarity 80% se kam hai toh masla hai
        if score < 0.8:
            mismatch_count += 1
            if len(mismatch_examples) < 5: # Top 5 examples for debugging
                mismatch_examples.append({
                    "doi": record.get('dblp_doi'),
                    "dblp": dblp_t,
                    "openalex": oa_t,
                    "score": round(score, 2)
                })

print(f"Total Records Scanned: {total_records}")
print(f"Mismatched Records Found: {mismatch_count}")

if mismatch_examples:
    print("\nSample Mismatches (First 5):")
    for ex in mismatch_examples:
        print(f"DOI: {ex['doi']}")
        print(f"  DBLP: {ex['dblp']}")
        print(f"  O.A.: {ex['openalex']}")

Scanning for Title Mismatches...
Total Records Scanned: 2037
Mismatched Records Found: 29

Sample Mismatches (First 5):
DOI: 10.1016/J.JOCS.2024.102471
  DBLP: POD-Galerkin reduced order model coupled with neural networks to solve flow in porous media.
  O.A.: Accelerating phase field simulations through a hybrid adaptive Fourier neural operator with U-net backbone
DOI: 10.1016/J.JOCS.2024.102492
  DBLP: Analytical and numerical methods for the solution to the rigid punch contact integral equations.
  O.A.: Thorough investigation of exact wave solutions in nonlinear thermoelasticity theory under the influence of gravity using advanced analytical methods
DOI: 10.1016/J.JOCS.2025.102705
  DBLP: Efficient numerical simulation of variable-order fractional diffusion processes with a memory kernel.
  O.A.: Influence of moving heat sources on thermoviscoelastic behavior of rotating nanorods: a nonlocal Klein–Gordon perspective with fractional heat conduction
DOI: 10.1016/J.JOCS.2024.102496
  

### Deleting this Mismatch, came from OpenALex

In [None]:
import json
from difflib import SequenceMatcher
import os

def string_similarity(a, b):
    # Standard normalization for comparison
    a = str(a).lower().strip()
    b = str(b).lower().strip()
    if not a or not b: return 0
    return SequenceMatcher(None, a, b).ratio()

# Paths
input_file = r"D:\ITMO Big Data & ML School\semester 3\ri3_repo\data\data_files\openalex\processed\jocs\final_jocs_openalex.jsonl"
output_file = r"D:\ITMO Big Data & ML School\semester 3\ri3_repo\data\data_files\openalex\processed\jocs\gold_jocs_clean.jsonl"

total_records = 0
retained_records = 0
deleted_dois = []

print("Starting Professional Purge...")

with open(input_file, 'r', encoding='utf-8') as f_in, \
     open(output_file, 'w', encoding='utf-8') as f_out:
    
    for line in f_in:
        total_records += 1
        record = json.loads(line)
        
        # Extract titles
        dblp_t = record.get('dblp_title', '')
        oa_work = record.get('openalex_work', {})
        oa_t = oa_work.get('title', '') if isinstance(oa_work, dict) else ""
        
        # Strict Similarity Check
        score = string_similarity(dblp_t, oa_t)
        
        if score >= 0.8:
            f_out.write(json.dumps(record) + "\n")
            retained_records += 1
        else:
            deleted_dois.append(record.get('dblp_doi'))

print(f"Analysis Complete.")
print(f"Total Scanned:   {total_records}")
print(f"Purged Records:  {len(deleted_dois)}")
print(f"Retained Gold:   {retained_records}")

Starting Professional Purge...
Analysis Complete.
Total Scanned:   2037
Purged Records:  29
Retained Gold:   2008
