# Data Preparation: Synthetic Greenwashing Claims via GPT-4o

## Overview
Instead of using rigid templates, we use an OpenAI **GPT-4o**-family model (`gpt-4o-mini` in the code below) to generate high-quality synthetic training data.
Compared with templates, this yields more linguistic diversity and mimics real-world financial reporting styles more closely.

## Goal
Generate a dataset of roughly 600 labeled examples, split evenly between two classes:
1. **Specific (Label 1):** Claims with clear numbers, baselines, dates, and concrete units.
2. **Vague (Label 0):** Claims using aspirational language ("aim", "strive", "commit") without verifiable targets.

In [None]:
import os
import re
import time
from getpass import getpass

import pandas as pd
from openai import OpenAI
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

# Directory that receives the generated CSV files (relative to the working directory).
OUTPUT_DIR = "../inputs"

# Create it up front; exist_ok makes re-running this cell harmless.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

print("Libraries loaded.")

In [None]:
# Ask for the key interactively (getpass does not echo the input) instead of
# hard-coding the secret in the notebook.
api_key = getpass(prompt="Enter your OpenAI API Key: ")

# Shared client used by every generation call below.
client = OpenAI(api_key=api_key)

In [None]:
# Leading bullet ("-", "*", "•") or enumeration ("1.", "2)") followed by
# whitespace. The prompt forbids these, but the model does not always comply,
# and the markers must not leak into the training text.
_LIST_MARKER_RE = re.compile(r"^(?:[-*\u2022]|\d+[.)])\s+")


def _build_prompt(label_type, n):
    """Return the user prompt asking for ``n`` sentences of the given class."""
    kind = label_type.strip().lower() if isinstance(label_type, str) else label_type

    if kind == "specific":
        return (
            f"Generate {n} distinct sentences typical of corporate sustainability reports. "
            "These must be SPECIFIC (Label 1). "
            "They must contain quantitative targets, exact years, baselines, or specific monetary amounts. "
            "Examples: 'Reduce Scope 1 emissions by 40% by 2030 vs 2019.', 'Invested 50 million EUR in solar parks.' "
            "Return only the sentences, one per line, with no numbering or bullet points."
        )
    if kind == "vague":
        return (
            f"Generate {n} distinct sentences typical of corporate sustainability reports. "
            "These must be VAGUE (Label 0). "
            "Use hedging language like 'aim', 'strive', 'intend', 'hope', or 'committed to exploring'. "
            "Avoid any concrete numbers, percentages, or deadlines. "
            "Examples: 'We aim to reduce our carbon footprint.', 'We are committed to exploring renewable solutions.' "
            "Return only the sentences, one per line, with no numbering or bullet points."
        )
    # Falling back to one class for an unknown label would silently produce
    # sentences that the caller then stores under the wrong label.
    raise ValueError(
        f"label_type must be 'Specific' or 'Vague', got {label_type!r}"
    )


def _parse_sentences(raw_text):
    """Split a model reply into clean sentences, one per non-empty line."""
    sentences = []
    # `raw_text or ""` covers a reply whose content is None.
    for line in (raw_text or "").splitlines():
        cleaned = _LIST_MARKER_RE.sub("", line.strip()).strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences


def generate_batch(label_type, n=10, *, model="gpt-4o-mini", temperature=0.9):
    """
    Generates a list of synthetic sentences for a specific class (Specific or Vague).

    Relies on the module-level ``client`` (an OpenAI client) created earlier
    in the notebook.

    Args:
        label_type (str): Either "Specific" or "Vague" (case-insensitive).
        n (int): Number of sentences to request in this batch. The model may
            return fewer or more lines than requested.
        model (str): Chat model to query.
        temperature (float): Sampling temperature; higher values add more variety.

    Returns:
        list: A list of strings, where each string is a generated sentence with
        surrounding whitespace and any leading list marker removed. Empty when
        ``n`` is not positive or when the API call fails.

    Raises:
        ValueError: If ``label_type`` is neither "Specific" nor "Vague".
    """
    # Validate the label first so a typo is reported even for n <= 0.
    prompt_content = _build_prompt(label_type, n)

    # Nothing to generate: skip the API round-trip entirely.
    if n <= 0:
        return []

    try:
        # Call the OpenAI API
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert financial data generator."},
                {"role": "user", "content": prompt_content}
            ],
            temperature=temperature
        )
        raw_text = response.choices[0].message.content
    except Exception as e:
        # Deliberately best-effort: one failed request should cost a single
        # batch, not abort a long-running generation loop.
        print(f"An error occurred: {e}")
        return []

    return _parse_sentences(raw_text)

# Smoke test: request a tiny batch and print whatever comes back.
print("Testing generation with 3 specific examples:")
test_sentences = generate_batch(label_type="Specific", n=3)
for sentence in test_sentences:
    print(f"- {sentence}")

In [None]:
# Configuration for the full dataset generation
TOTAL_SAMPLES = 600
BATCH_SIZE = 20  # sentences requested per API call
batches_per_class = (TOTAL_SAMPLES // 2) // BATCH_SIZE

all_data = []

print(f"Starting generation of {TOTAL_SAMPLES} samples...")

# Both classes run through the same loop; only the prompt type and the numeric
# label differ (1 = Specific, 0 = Vague).
for label_type, label in (("Specific", 1), ("Vague", 0)):
    for _ in tqdm(range(batches_per_class), desc=f"Generating {label_type} Claims"):
        sentences = generate_batch(label_type, BATCH_SIZE)
        all_data.extend({"text": s, "label": label} for s in sentences)
        # Short pause between requests to stay clear of API rate limits.
        time.sleep(0.5)

# Convert to a Pandas DataFrame. Naming the columns explicitly keeps the frame
# well-formed ("text"/"label" present) even if every batch came back empty.
df = pd.DataFrame(all_data, columns=["text", "label"])

# Repeating the same prompt can yield identical sentences. Drop exact
# duplicates so the same text cannot end up in both the training and the
# evaluation split later on.
raw_count = len(df)
df = df.drop_duplicates(subset="text", keep="first")
duplicates_removed = raw_count - len(df)
if duplicates_removed:
    print(f"Removed {duplicates_removed} duplicate sentences.")

# Shuffle the dataset so specific and vague examples are mixed
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Successfully generated {len(df)} samples.")
if len(df) < TOTAL_SAMPLES:
    # Failed batches, short replies, or removed duplicates all reduce the count.
    print(f"Warning: only {len(df)} of the {TOTAL_SAMPLES} requested samples are available.")
display(df.head())

In [None]:
# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# Stratifying on 'label' keeps the Specific/Vague proportions of `df` in both splits.
train_df, eval_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# Destination files inside the output directory
train_path = os.path.join(OUTPUT_DIR, "train_synthetic.csv")
eval_path = os.path.join(OUTPUT_DIR, "eval_synthetic.csv")

# Write both splits as CSV without the DataFrame index column
for split_frame, split_path in ((train_df, train_path), (eval_df, eval_path)):
    split_frame.to_csv(split_path, index=False)

print(f"Training data saved to: {train_path}")
print(f"Evaluation data saved to: {eval_path}")