# Notebook 1: Synthetic Data Generation

## Overview
This notebook generates synthetic training data for greenwashing detection using GPT-4o.

We create two classes:
- **Specific (Label 1)**: Concrete claims with numbers, dates, and measurable targets
- **Vague (Label 0)**: General statements with hedging words and no concrete commitments

## Output
- `../inputs/train_synthetic.csv` - 80% of the generated data, used for training
- `../inputs/eval_synthetic.csv` - 20% of the generated data, held out for evaluation

In [None]:
import os
import re
import time

import pandas as pd
from openai import OpenAI
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# API client; the key is read from the OPENAI_API_KEY environment variable
# so that no credential is stored in the notebook.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Directory that receives the generated CSV files (created if it does not exist).
OUTPUT_DIR = "../inputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Libraries loaded successfully.")

## Configuration

In [None]:
# Generation parameters
SAMPLES_PER_CLASS = 300  # target number of sentences per label
BATCH_SIZE = 20  # sentences requested per API call; interpolated into the prompts below

# Prompts for each class.
# These are f-strings so the number of sentences requested from the model always
# matches BATCH_SIZE (the batch-count arithmetic later in the notebook relies on it).
# The prompt text must not contain literal curly braces.
PROMPTS = {
    "Specific": f"""
Generate {BATCH_SIZE} example sentences that asset managers might write in sustainability reports.
These should be SPECIFIC claims containing:
- Concrete numbers (percentages, amounts)
- Specific dates or deadlines (e.g., 'by 2030')
- Measurable units (tons CO2, GWh, EUR million)

Examples:
- 'Reduce carbon emissions by 50% by 2030 compared to 2020 baseline.'
- 'Invested EUR 2.5 billion in renewable energy projects in 2023.'

Return only the sentences, one per line, no numbering.
""",
    "Vague": f"""
Generate {BATCH_SIZE} example sentences that asset managers might write in sustainability reports.
These should be VAGUE claims containing:
- Hedging words (may, might, aim, intend, seek)
- General aspirations without concrete targets
- No specific numbers or dates

Examples:
- 'We aim to enhance our sustainability practices over time.'
- 'Our goal is to become a leader in responsible investing.'

Return only the sentences, one per line, no numbering.
"""
}

print(f"Target: {SAMPLES_PER_CLASS * 2} total samples")
print(f"Batch size: {BATCH_SIZE} sentences per API call")

## Data Generation

We use GPT-4o to generate synthetic sentences in batches.

In [None]:
# Leading list markers the model may add despite the "no numbering" instruction:
# bullets ("- ", "* ", "• ") or short numbering ("1. ", "12) "). Whitespace is required
# after the marker so real content such as "2.5 billion" or "-5%" is left intact.
_LIST_MARKER = re.compile(r"^(?:[-*•]+|\d{1,3}[.)])\s+")


def generate_batch(class_type, batch_size):
    """
    Generate a batch of sentences using GPT-4o.

    The number of sentences requested from the model is stated in the prompt
    text itself; ``batch_size`` bounds how many cleaned sentences are returned.

    Args:
        class_type: 'Specific' or 'Vague' (a key of PROMPTS)
        batch_size: Maximum number of sentences to return. Values that are
            None or not positive disable the cap.

    Returns:
        List of generated sentences with surrounding whitespace, leading list
        markers and one pair of wrapping quotes removed. The list is empty when
        the response carries no text.

    Raises:
        KeyError: If class_type is not a key of PROMPTS.
    """
    prompt = PROMPTS[class_type]

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an expert in corporate sustainability reporting."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.9
    )

    # The message content is nullable in the API response; treat a missing body
    # as "no sentences" instead of failing on None.strip().
    text = response.choices[0].message.content or ""

    sentences = []
    for line in text.splitlines():
        sentence = _LIST_MARKER.sub("", line.strip()).strip()
        # The prompt's own examples are wrapped in quotes, so the model may copy
        # that format; drop one matching pair around the whole sentence.
        if len(sentence) >= 2 and sentence[0] == sentence[-1] and sentence[0] in "'\"":
            sentence = sentence[1:-1].strip()
        if sentence:
            sentences.append(sentence)

    # Never hand back more than the caller asked for.
    if batch_size is not None and batch_size > 0:
        sentences = sentences[:batch_size]

    return sentences

print("Generation function ready.")

In [None]:
# Calculate number of batches needed (ceiling division)
batches_per_class = (SAMPLES_PER_CLASS + BATCH_SIZE - 1) // BATCH_SIZE

# A single call can yield fewer usable sentences than requested (short replies,
# repeated sentences), so allow extra calls -- but bound them so the cell always ends.
max_batches_per_class = 2 * batches_per_class

# Class name (key of PROMPTS) -> integer label stored in the dataset
class_labels = {"Specific": 1, "Vague": 0}

all_data = []
seen_texts = set()  # exact-duplicate filter shared by both classes

print(f"Starting generation of {SAMPLES_PER_CLASS * 2} samples...")

for class_type, label in class_labels.items():
    collected = 0
    # The bar counts accepted sentences, so it shows progress towards the target
    with tqdm(total=SAMPLES_PER_CLASS, desc=f"Generating {class_type} Claims") as progress:
        for _ in range(max_batches_per_class):
            if collected >= SAMPLES_PER_CLASS:
                break
            for s in generate_batch(class_type, BATCH_SIZE):
                if collected >= SAMPLES_PER_CLASS:
                    break  # keep both classes at exactly the target size
                if s in seen_texts:
                    continue  # skip repeats so later deduplication cannot unbalance classes
                seen_texts.add(s)
                all_data.append({"text": s, "label": label})
                collected += 1
                progress.update(1)
            time.sleep(0.5)  # brief pause between API calls
    if collected < SAMPLES_PER_CLASS:
        print(
            f"Warning: collected only {collected}/{SAMPLES_PER_CLASS} unique "
            f"{class_type} sentences after {max_batches_per_class} API calls."
        )

# Convert to DataFrame (explicit columns keep the schema even if nothing was generated)
df = pd.DataFrame(all_data, columns=["text", "label"])

# Shuffle the dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Successfully generated {len(df)} samples.")

## Data Quality Validation

Verify that the generated data meets our quality standards.

In [None]:
print("="*60)
print("DATA QUALITY INSPECTION")
print("="*60)

# Check for duplicates first, so every figure printed below describes the
# data that is actually carried forward to the split.
duplicates = df.duplicated(subset=['text']).sum()
print(f"\nDuplicates found: {duplicates}")
if duplicates > 0:
    print("Removing duplicates...")
    df = df.drop_duplicates(subset=['text']).reset_index(drop=True)
    print(f"Dataset size after deduplication: {len(df)}")

# Show random samples from each class
samples_to_show = 5
class_headings = {
    1: "\nSPECIFIC Claims (Label 1) - Should have numbers/dates/concrete targets:",
    0: "\nVAGUE Claims (Label 0) - Should have hedging words:",
}
for label, heading in class_headings.items():
    print(heading)
    class_rows = df[df['label'] == label]
    if class_rows.empty:
        print("  (no samples)")
        continue
    # Cap the sample size: asking for more rows than exist raises ValueError.
    # The seed makes the displayed rows reproducible on re-run.
    shown = class_rows.sample(min(samples_to_show, len(class_rows)), random_state=42)
    for text in shown['text']:
        print(f"  - {text}")

# Check class balance; reindex so a missing class shows up as 0 instead of a KeyError
label_counts = df['label'].value_counts().reindex([0, 1], fill_value=0)
print(f"\nClass Distribution:")
print(label_counts)
if label_counts.loc[1] > 0:
    balance_ratio = label_counts.loc[0] / label_counts.loc[1]
    print(f"Balance ratio: {balance_ratio:.2f}:1")
else:
    print("Balance ratio: undefined (no Label 1 samples)")

## Train/Evaluation Split

Split data into training (80%) and evaluation (20%) sets with stratification.

In [None]:
# Stratified 80/20 split: both subsets keep the label proportions of `df`
train_df, eval_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Destination files inside OUTPUT_DIR
train_path = os.path.join(OUTPUT_DIR, "train_synthetic.csv")
eval_path = os.path.join(OUTPUT_DIR, "eval_synthetic.csv")

# Write both files before reporting on either of them
for split_df, split_path in ((train_df, train_path), (eval_df, eval_path)):
    split_df.to_csv(split_path, index=False)

# Per-split summary: location, size and per-class counts
for banner, split_df, split_path in (
    ("Training data saved to: ", train_df, train_path),
    ("\nEvaluation data saved to: ", eval_df, eval_path),
):
    print(f"{banner}{split_path}")
    print(f"  - Size: {len(split_df)} samples")
    for class_id in (0, 1):
        print(f"  - Class {class_id}: {(split_df['label']==class_id).sum()}")