In [None]:
import os
import pandas as pd
import time
from openai import OpenAI
from tqdm import tqdm

# Load OpenAI API key from environment variables
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# --------------------------------------------------
# 1. Load Data
# --------------------------------------------------
file_path = "./goldStandardPapers/standards_v2_140125.csv"
output_file = "reclassified_papers_checkpoint.csv"
checkpoint_interval = 50  # Save progress every N papers

df = pd.read_csv(file_path)

# Labels offered to the model: every distinct non-null value already present
# in the data, minus the "unknown" placeholder.
unique_catalysis_types = [
    label
    for label in df["catalysis_type"].dropna().unique().tolist()
    if label != "unknown"
]
unique_application_themes = [
    label
    for label in df["application_theme"].dropna().unique().tolist()
    if label != "unknown"
]

# Resume from an existing checkpoint, or start a fresh results frame.
# `df_checkpoint` carries the input columns plus the two `new_*` result
# columns; `processed_indices` holds the row labels that already have a result.
if os.path.exists(output_file):
    print(f"🔄 Resuming from checkpoint: {output_file}")
    df_checkpoint = pd.read_csv(output_file)

    # Rows are matched to the input by position only, so a checkpoint written
    # for a different input would silently attach results to the wrong papers.
    if len(df_checkpoint) != len(df):
        raise ValueError(
            f"Checkpoint '{output_file}' has {len(df_checkpoint)} rows but "
            f"'{file_path}' has {len(df)}; delete or rename the checkpoint "
            "before re-running."
        )

    for column in ("new_catalysis_type", "new_application_theme"):
        if column not in df_checkpoint.columns:
            df_checkpoint[column] = None
        # A column that was entirely empty is read back as float64; force
        # object dtype so string labels can be assigned into it later.
        df_checkpoint[column] = df_checkpoint[column].astype(object)

    # A row counts as processed once it has any catalysis result,
    # including "unknown".
    processed_indices = set(
        df_checkpoint.index[df_checkpoint["new_catalysis_type"].notna()]
    )
else:
    df_checkpoint = df.copy()
    for column in ("new_catalysis_type", "new_application_theme"):
        df_checkpoint[column] = None
    processed_indices = set()

# --------------------------------------------------
# 2. OpenAI Prompting Function
# --------------------------------------------------
def _extract_label(content, field, valid_labels):
    """
    Pull the value reported for `field` out of the model's reply and map it
    onto one of `valid_labels`; return "unknown" if it is absent or not valid.
    """
    # Drop markdown bold markers and compare case-insensitively so replies such
    # as "**Catalysis Type:** Electro" still resolve to the dataset's own label.
    normalized = content.replace("*", "").casefold()
    marker = f"{field}:".casefold()

    position = normalized.find(marker)
    if position == -1:
        return "unknown"

    # Only the rest of the marker's line is considered to be the answer.
    first_line = normalized[position + len(marker):].split("\n")[0]
    candidate = first_line.strip(" \t\r`[]\"'.")

    canonical = {}
    for label in valid_labels:
        # First label wins if two labels differ only by case.
        canonical.setdefault(str(label).casefold(), label)

    return canonical.get(candidate, "unknown")


def query_openai_classification(title, abstract):
    """
    Query OpenAI API to classify the catalysis type and application theme based on title and abstract.

    Args:
        title: Paper title; may be missing (None/NaN).
        abstract: Paper abstract; may be missing (None/NaN).

    Returns:
        A `(catalysis_type, application_theme)` tuple. Each element is one of
        the labels in `unique_catalysis_types` / `unique_application_themes`,
        or "unknown" when the reply does not name a valid label. When both
        title and abstract are missing or blank, `("unknown", "unknown")` is
        returned without calling the API.

    Raises:
        Whatever the OpenAI client raises for a failed request; errors are
        not caught here.
    """
    title = "" if pd.isna(title) else str(title).strip()
    abstract = "" if pd.isna(abstract) else str(abstract).strip()

    if not title and not abstract:
        return "unknown", "unknown"  # Skip if no information is available

    prompt = f"""
    You are a scientific classification assistant.
    Given the title and abstract of a scientific paper, classify it into the most appropriate **Catalysis Type** and **Application Theme**.

    ### Instructions:
    - You **must** choose one **Catalysis Type** from this list: {', '.join(unique_catalysis_types)}.
    - You **must** choose one **Application Theme** from this list: {', '.join(unique_application_themes)}.
    - If the current classification is 'unknown', do your best to place it into one of the known categories.
    - Do **not** make up new categories.

    ### Input Paper:
    **Title:** {title}
    **Abstract:** {abstract}

    ### Output Format:
    - Catalysis Type: [Chosen Type]
    - Application Theme: [Chosen Theme]
    """

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=300,
        temperature=0.3
    )

    # `message.content` can be None (e.g. a refusal or filtered reply);
    # treat that as an empty reply instead of failing on `.strip()`.
    content = (response.choices[0].message.content or "").strip()

    # Extract classified categories, restricted to the known label sets
    catalysis_type = _extract_label(content, "Catalysis Type", unique_catalysis_types)
    application_theme = _extract_label(content, "Application Theme", unique_application_themes)

    return catalysis_type, application_theme

# --------------------------------------------------
# 3. Process Each Paper and Save Checkpoints
# --------------------------------------------------
# Classify every row that has no result yet, writing the results into
# `df_checkpoint` and saving it to `output_file` periodically.
progress_bar = tqdm(total=len(df), desc="Classifying Papers")

try:
    for index, row in df.iterrows():
        if index in processed_indices:
            progress_bar.update(1)
            continue  # Skip already processed rows

        title = row["title"]
        abstract = row["abstract"]

        new_catalysis, new_theme = query_openai_classification(title, abstract)

        df_checkpoint.at[index, "new_catalysis_type"] = new_catalysis
        df_checkpoint.at[index, "new_application_theme"] = new_theme

        # A missing title is NaN (a float) and cannot be sliced; show an empty
        # preview for such rows instead of raising TypeError.
        title_preview = title[:50] if isinstance(title, str) else ""
        progress_bar.set_postfix({"Last Processed": f"{title_preview}...", "Catalysis": new_catalysis, "Theme": new_theme})
        progress_bar.update(1)

        # Save checkpoint every N iterations (based on the row label, so this
        # assumes the default 0..n-1 integer index from `read_csv`).
        if (index + 1) % checkpoint_interval == 0:
            df_checkpoint.to_csv(output_file, index=False)
            print(f"💾 Checkpoint saved at {index + 1} papers.")
finally:
    # Final save. Runs on normal completion and also when the loop is aborted
    # by an error or interrupt, so results since the last periodic checkpoint
    # are not lost.
    df_checkpoint.to_csv(output_file, index=False)
    progress_bar.close()

print(f"\n✅ Classification complete! Results saved to '{output_file}'.")

🔄 Resuming from checkpoint: reclassified_papers_checkpoint.csv


Classifying Papers:  46%|████▌     | 1782/3854 [01:23<01:36, 21.40it/s, Last Processed=Converting surface-oxidized cobalt phosphides into..., Catalysis=electro, Theme=water]
Classifying Papers:  47%|████▋     | 1799/3854 [00:47<01:31, 22.50it/s, Last Processed=Graphene, related two-dimensional crystals, and hy..., Catalysis=photo, Theme=water]            

💾 Checkpoint saved at 1800 papers.


Classifying Papers:  48%|████▊     | 1849/3854 [01:37<11:21,  2.94it/s, Last Processed=Statistical Evaluation of HTS Assays for Enzymatic..., Catalysis=bio, Theme=enzyme]             

💾 Checkpoint saved at 1850 papers.


Classifying Papers:  49%|████▉     | 1900/3854 [02:25<30:28,  1.07it/s, Last Processed=Recent Advances in MOF-based Nanocatalysts for Pho..., Catalysis=photo, Theme=co2 utilisation]  

💾 Checkpoint saved at 1900 papers.


Classifying Papers:  51%|█████     | 1950/3854 [03:16<24:48,  1.28it/s, Last Processed=Nanoparticles as Emerging Labels in Electrochemica..., Catalysis=electro, Theme=enzyme]        

💾 Checkpoint saved at 1950 papers.


Classifying Papers:  52%|█████▏    | 2000/3854 [04:06<22:56,  1.35it/s, Last Processed=Formation of Polarized, Functional Artificial Cell..., Catalysis=bio, Theme=enzyme]                   

💾 Checkpoint saved at 2000 papers.


Classifying Papers:  53%|█████▎    | 2050/3854 [04:52<35:12,  1.17s/it, Last Processed=Aligning Electronic and Protonic Energy Levels of ..., Catalysis=photo, Theme=water]                  

💾 Checkpoint saved at 2050 papers.


Classifying Papers:  54%|█████▍    | 2100/3854 [05:36<32:23,  1.11s/it, Last Processed=A Unified Treatment of the Relationship Between Li..., Catalysis=homo, Theme=enzyme]            

💾 Checkpoint saved at 2100 papers.


Classifying Papers:  56%|█████▌    | 2150/3854 [06:19<20:26,  1.39it/s, Last Processed=An Isolable Phosphaethynolatoborane and Its Reacti..., Catalysis=not_catalysis, Theme=unknown]  

💾 Checkpoint saved at 2150 papers.


Classifying Papers:  57%|█████▋    | 2200/3854 [07:06<31:41,  1.15s/it, Last Processed=Creation of Al‐Enriched Mesoporous ZSM‐5 Nanoboxes..., Catalysis=hetero, Theme=biomass]        

💾 Checkpoint saved at 2200 papers.


Classifying Papers:  58%|█████▊    | 2250/3854 [07:52<23:09,  1.15it/s, Last Processed=H<sub>2</sub>‐free Synthesis of Aromatic, Cyclic a..., Catalysis=hetero, Theme=co2 utilisation]       

💾 Checkpoint saved at 2250 papers.


Classifying Papers:  60%|█████▉    | 2300/3854 [08:46<44:54,  1.73s/it, Last Processed=Double emulsion production in glass capillary micr..., Catalysis=not_catalysis, Theme=co2 utilisation]

💾 Checkpoint saved at 2300 papers.


Classifying Papers:  61%|██████    | 2350/3854 [09:34<19:50,  1.26it/s, Last Processed=Droplet Interfaced Parallel and Quantitative Micro..., Catalysis=not_catalysis, Theme=enzyme]         

💾 Checkpoint saved at 2350 papers.


Classifying Papers:  62%|██████▏   | 2400/3854 [10:19<18:26,  1.31it/s, Last Processed=An Investigation into the Stability of Graphitic C..., Catalysis=photo, Theme=co2 utilisation]        

💾 Checkpoint saved at 2400 papers.


Classifying Papers:  64%|██████▎   | 2450/3854 [10:58<20:13,  1.16it/s, Last Processed=Strongly Enhanced Antibacterial Action of Copper O..., Catalysis=hetero, Theme=unknown]               

💾 Checkpoint saved at 2450 papers.


Classifying Papers:  65%|██████▍   | 2500/3854 [11:41<21:58,  1.03it/s, Last Processed=Expanding the Color Space in the Two-Color Heterog..., Catalysis=photo, Theme=biomass]          

💾 Checkpoint saved at 2500 papers.


Classifying Papers:  65%|██████▌   | 2519/3854 [11:59<15:35,  1.43it/s, Last Processed=Unraveling the Role of Lithium in Enhancing the Hy..., Catalysis=hetero, Theme=water]           