In [None]:
import os

import pandas as pd
from openai import OpenAI
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Initialize the OpenAI client.
# The key is taken from the OPENAI_API_KEY environment variable when it is set,
# so the secret does not have to be pasted into (and possibly committed with)
# this file. The literal below is only a placeholder fallback.
openai_api_key = os.environ.get("OPENAI_API_KEY") or "Add Your OpenAI API KEY Here."

client = OpenAI(api_key=openai_api_key)

def _parse_yes_no(reply):
    """Map a raw model reply to the label 'Yes' or 'No'.

    The first standalone word "yes" or "no" (case-insensitive) decides the
    label. A reply containing neither word, or an empty/None reply, maps
    to 'No'.
    """
    text = (reply or "").lower()
    # Split on every non-letter so "Yes." or "Answer: no" tokenize cleanly and
    # substrings such as "eyes" or "not" are never mistaken for an answer.
    words = "".join(ch if ch.isalpha() else " " for ch in text).split()
    verdict = next((word for word in words if word in ("yes", "no")), "no")
    return "Yes" if verdict == "yes" else "No"


def LCI_title_classification(df, *, model="gpt-4o", llm_client=None):
    """Classify each table title as an LCI inventory table or not (Yes/No).

    One chat-completion request is sent per row, built from a fixed system
    message, a fixed set of few-shot examples and the row's title.

    Args:
        df: DataFrame with a "Table_title" column. It is modified in place:
            a "LLM_prediction" column is added (or overwritten).
        model: Name of the chat model to query.
        llm_client: Object exposing ``chat.completions.create(...)``. When
            None, the module-level ``client`` is used.

    Returns:
        The same DataFrame object, with "LLM_prediction" holding "Yes" or
        "No" for every row.

    Raises:
        KeyError: If ``df`` has no "Table_title" column.

    Note:
        Any exception raised while querying the model or reading its reply is
        printed and the row is labelled "No", so a failed request is
        indistinguishable from a genuine "No" in the output column.
    """
    if llm_client is None:
        llm_client = client

    # The system message and the few-shot examples do not depend on the row,
    # so they are built once instead of on every iteration.
    system_msg = (
        "You are an expert in Life Cycle Assessment (LCA). "
        "Determine whether the given context represents a life cycle inventory "
        "(LCI) or input-output table. Respond strictly with only 'Yes' or 'No'."
    )

    # NOTE: the leading whitespace inside this literal is part of the prompt
    # text and is kept exactly as it was.
    examples = """
        Context:
        Table S9 The inventory data for PV/CCU-CH3OH technical route.
        Answer: Yes

        Context:
        Table S1 details the input-output data of the four ethylene glycol production routes.
        Answer: Yes

        Context:
        Table 1. The plant-level mass and energy balances for manufacturing ethylene.
        Answer: Yes

        Context:
        Table 4 Summary of the main economic indicators of the production process.
        Answer: No

        Context:
        Table 2. Comparison of the environmental impact categories for different production scenarios.
        Answer: No

        Only report "Yes" or "No" — do not include explanations.
        Question: Given the table title and context, is it an LCI inventory table?
        """

    responses = []

    for context in df["Table_title"]:
        query = f"""
        Context:
        {context}
        Answer strictly with "Yes" or "No".
        """

        try:
            response = llm_client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": examples + "\n\n" + query},
                ],
                temperature=0.0,
            )
            answer = _parse_yes_no(response.choices[0].message.content)
        except Exception as e:
            # Deliberate best-effort: one failed request must not abort the
            # whole run. str() guards against non-string titles (e.g. NaN),
            # which cannot be sliced.
            print(f"Error processing '{str(context)[:50]}...': {e}")
            answer = "No"

        responses.append(answer)

    df["LLM_prediction"] = responses
    return df

# Load the dataset
df = pd.read_csv("training_data.csv")

# Rename label column
df = df.rename(columns={'is LCI inventory table?': 'label'})

# Normalise label spelling ("yes", " No ", ...) to exactly 'Yes' / 'No'.
# Rows whose label is missing or unrecognised are dropped explicitly here,
# rather than becoming NaN and flowing into the stratified split and metrics.
labels = df['label'].map(lambda v: v.strip().capitalize() if isinstance(v, str) else v)
is_valid = labels.isin(['Yes', 'No'])
if not is_valid.all():
    print(f"Dropping {int((~is_valid).sum())} row(s) with a missing or unrecognized label.")
df = df.loc[is_valid].copy()
df['label'] = labels[is_valid]

# Split the dataset for 80% train and 20% test
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# The classifier adds a column in place; work on an independent copy of the
# split so pandas does not warn about assigning into a slice of `df`.
test_df = test_df.copy()

# Run model on test set
classified_df = LCI_title_classification(test_df)

# Evaluate
y_true = classified_df['label']
y_pred = classified_df['LLM_prediction']

# Macro average: unweighted mean of the per-class ('Yes' / 'No') scores.
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')

print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1-score:  {f1:.3f}")

print("\n=== Detailed Report ===")
print(classification_report(y_true, y_pred, digits=3))