# Code that cleans names via LLM

In [11]:
import csv
import json
import re
import os
import sys
import openai

In [16]:
def read_names_csv(filepath, id_col, name_col):
    """Read a semicolon-separated CSV file and extract ID and name columns."""
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter=';')
        for row in reader:
            id_value = row[id_col].strip()
            name = row[name_col].strip()
            if name:  # Only add non-empty names
                data.append((id_value, name))
    return data

def create_batch(all_pairs, batch_size, batch_num):
    """Create a single batch of the specified size and batch number."""
    start_idx = batch_num * batch_size
    end_idx = min(start_idx + batch_size, len(all_pairs))
    
    # Check if the batch number is valid
    if start_idx >= len(all_pairs):
        print(f"Error: Batch {batch_num} is out of range.")
        return None
    
    # Get just the specified batch
    batch_data = all_pairs[start_idx:end_idx]
    return batch_data

def make_prompt(id_name_pairs):
    """Create the instruction prompt for name formatting"""
    instruction = (
        "You are an assistant that helps format names from a danish database."
        "You will receive lines in the format `ID; Name`. "
        "For each, return a JSON object with the following fields:\n"
        "- id: The original ID\n"
        "- formattedName: The name with proper spacing and capitalization. Don't fix spelling - only *very obvious* truncations at beginning/end of names (e.g. Anderse→Andersen, ohan→Johan) "
        "and replace non-letter characters with the most likely letter (e.g., 7 could be V).\n"
        "- abbreviatedName: Name with all parts except the last name abbreviated\n"
        "- alreadyAbbreviated: Boolean indicating if the name was already abbreviated\n\n"
        "- You must output exactly one JSON object for each input line. No more, no less. The output must be a valid JSON array containing only the requested objects with the specified fields. Any other format or additional information is not acceptable\n"
        "Return the results as a JSON array.\n\n"
        "Here are the names:\n"
    )
    
    # Construct the list of names
    names_text = ""
    for id_, name in id_name_pairs:
        names_text += f"{id_}; {name}\n"
    
    return instruction + names_text

def call_api(prompt, api_key, model="gpt-4.1"):
    """Call OpenAI API with the prompt using the new client format"""
    from openai import OpenAI
    
    # Initialize client with the API key
    client = OpenAI(api_key=api_key)
    
    # System instruction can be included in the prompt
    system_instruction = "You are a helpful assistant that formats names and returns data in JSON format."
    full_prompt = f"{system_instruction}\n\n{prompt}"
    
    # Call the API using the responses endpoint
    response = client.responses.create(
        model=model,
        input=full_prompt,
        temperature=0.2
    )
    
    return response.output_text

def validate_json(result):
    """Validate that the response is proper JSON"""
    try:
        json_result = json.loads(result)
        print(f"JSON parsed successfully with {len(json_result)} entries")
        return json_result
    except json.JSONDecodeError as je:
        print(f"Invalid JSON. Error: {je}")
        print(f"First 200 chars of response: {result[:200]}...")
        return None

def clean_json_string(text):
    """
    Remove Markdown code block formatting from a string.
    Handles both ```json and plain ``` delimiters.
    """
    # Remove opening ```json or ``` marker and any whitespace after it
    text = re.sub(r'^\s*```(?:json)?\s*', '', text)
    
    # Remove closing ``` marker and any whitespace before it
    text = re.sub(r'\s*```\s*$', '', text)
    
    return text

def process_batch(input_file, output_file_base, batch_size, batch_num, id_col, name_col, api_key, model):
    """Process a specific batch from CSV file through API and save results to a numbered output file"""
    # Create a uniquely numbered output file
    output_file = f"{output_file_base}_{batch_num}.json"
    
    # 1. Read all names from the CSV
    all_names = read_names_csv(input_file, id_col, name_col)
    
    # 2. Create the specific batch
    batch_data = create_batch(all_names, batch_size, batch_num)
    if batch_data is None:
        return False
    
    # 3. Create prompt
    prompt = make_prompt(batch_data)
    
    # 4. Call API
    result = call_api(prompt, api_key, model)
    
    result = clean_json_string(result)
    
    # 5. Validate JSON
    json_result = validate_json(result)
    
    # 6. Write results
    with open(output_file, 'w', encoding='utf-8') as out:
        if json_result:
            out.write(json.dumps(json_result, indent=2))
            return True
        else:
            out.write(f"INVALID JSON:\n{result}")
            return False

def process_multiple_batches(input_file, output_file_base, batch_size, total_batches, id_col, name_col, api_key, model, start_batch=0):
    """
    Process multiple batches and stop if any batch fails validation
    
    Parameters:
    -----------
    start_batch : int, optional (default=0)
        The batch number to start processing from
    """
    for batch_num in range(start_batch, total_batches):
        print(f"Processing batch {batch_num}...")
        success = process_batch(input_file, output_file_base, batch_size, batch_num, id_col, name_col, api_key, model)
        
        if not success:
            print(f"ERROR: Batch {batch_num} failed validation!")
            if batch_num > start_batch:
                print(f"Last successfully completed batch was: {batch_num - 1}")
            else:
                print("No batches were successfully completed in this run.")
            return batch_num - 1 if batch_num > start_batch else None
    
    print(f"All batches from {start_batch} to {total_batches-1} processed successfully!")
    return total_batches - 1

In [18]:
# Create the output directory if it doesn't exist
output_file_base = "input/raw_data_clean.csv"
output_dir = os.path.dirname(output_file_base)
if output_dir and not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

# Configuration
input_file = "input/raw_data.csv"  # Replace with your input file path
batch_size = 62  # Number of names in each batch
total_batches = 100  # Total number of batches to process
id_col = "id"  # Column name for ID
name_col = "name"  # Column name for Name
api_key = # insert valid api_key
model = "gpt-4.1"  # API model to use
start_batch = 0  # Start from batch 11 (the one that failed)

# Run the batch processing
last_successful_batch = process_multiple_batches(
    input_file, output_file_base, batch_size, total_batches,
    id_col, name_col, api_key, model, start_batch
)

# Print summary
if last_successful_batch is not None:
    print(f"Processing ended. Last successful batch: {last_successful_batch}")
else:
    print(f"Processing ended. No batches after {start_batch} were processed successfully.")

Processing batch 0...
JSON parsed successfully with 62 entries
Processing batch 1...
Error: Batch 1 is out of range.
ERROR: Batch 1 failed validation!
Last successfully completed batch was: 0
Processing ended. Last successful batch: 0
