# Dataset Schema Update Script

In [12]:
import os
from pathlib import Path
from datasets import load_dataset, Dataset
from dotenv import load_dotenv

# Read settings from the .env file two directory levels above the working directory.
env_file = Path('../..') / '.env'
load_dotenv(env_file)

# Target dataset repository and the access token used for every Hub call below.
DATASET_REPO = "Cantina/intent-full-data-20251106"
HF_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')

# Fail fast: nothing later in the notebook works without a token.
if not HF_TOKEN:
    raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")

for status_line in ("Environment configured", f"Dataset: {DATASET_REPO}"):
    print(f"✓ {status_line}")

✓ Environment configured
✓ Dataset: Cantina/intent-full-data-20251106


## Load the Dataset from Hugging Face

In [None]:
# Download the "train" split of the configured repository using the token.
print("Loading dataset from Hugging Face...")
dataset = load_dataset(DATASET_REPO, split="train", token=HF_TOKEN)

# Report size and schema as loaded, before any transformation is applied.
row_total = len(dataset)
print(f"✓ Loaded {row_total} rows")
print("\nCurrent schema:")
print(dataset.features)

# Show the first record as a quick look at the raw contents.
print("\nFirst row sample:")
print(dataset[0])

## Transform Dataset Schema

Rename columns, remove old ones, and add annotation tracking fields.

In [None]:
# Step 1: Rename columns (old name -> new name, applied in this order)
print("Renaming columns...")
column_renames = {
    "new_room_unified_format_input": "input",
    "gpt5-results-20251104": "output",
}
for old_name, new_name in column_renames.items():
    dataset = dataset.rename_column(old_name, new_name)
# Confirmations are printed only after every rename has succeeded.
for old_name, new_name in column_renames.items():
    print(f"✓ Renamed: {old_name} -> {new_name}")

# Step 2: Drop the columns that are no longer wanted
print("\nRemoving columns...")
columns_to_remove = [
    "unified_format_output_enriched_fixed",
    "gpt5-results-20250905",
]
dataset = dataset.remove_columns(columns_to_remove)
removed_summary = ', '.join(columns_to_remove)
print(f"✓ Removed: {removed_summary}")

# Step 3: Add annotation columns
print("\nAdding annotation columns...")
def add_annotation_columns(example):
    """Set the annotation-tracking fields on one row to their initial values.

    The row is modified in place and the same object is returned:
    ``manually_reviewed`` becomes ``False``, ``manually_reviewed_ts``
    becomes ``0`` and ``last_updated_ts`` becomes an empty string.
    """
    initial_values = (
        ('manually_reviewed', False),
        ('manually_reviewed_ts', 0),
        ('last_updated_ts', ''),
    )
    for field_name, initial in initial_values:
        example[field_name] = initial
    return example

# Apply the per-row initializer to every example in the dataset.
dataset = dataset.map(add_annotation_columns)

# Announce each new column together with the type label reported for it.
added_columns = (
    ("manually_reviewed", "bool"),
    ("manually_reviewed_ts", "int64"),
    ("last_updated_ts", "string"),
)
for added_name, type_label in added_columns:
    print(f"✓ Added: {added_name} ({type_label})")

print("\n✓ Transformation complete!")
print("\nFinal schema:")
print(dataset.features)

## Validate Transformed Dataset

In [None]:
# Validate the transformed dataset before the destructive cleanup and push
# cells that follow.
print("Final columns:")
for col_name in dataset.column_names:
    print(f"  ✓ {col_name}")

print(f"\nTotal rows: {len(dataset)}")

# Columns the transform cell must have produced. If any is missing, that cell
# was skipped or run out of order, so stop here instead of letting the later
# cells delete the repository contents and push an untransformed dataset.
required_columns = ['input', 'output', 'manually_reviewed', 'manually_reviewed_ts', 'last_updated_ts']
missing_columns = [col for col in required_columns if col not in dataset.column_names]
if missing_columns:
    raise ValueError(
        f"Transformed dataset is missing expected columns: {', '.join(missing_columns)}"
    )

print("\nSample row:")
if len(dataset) == 0:
    # Indexing row 0 of an empty dataset would raise IndexError.
    print("  (dataset is empty)")
else:
    sample = dataset[0]
    # 'prompt_name' is not created by this notebook, so it is shown only when present.
    for key in ['prompt_name', *required_columns]:
        if key in sample:
            value = sample[key]
            # Truncate long strings so the preview stays readable.
            if isinstance(value, str) and len(value) > 100:
                value = value[:100] + "..."
            print(f"  {key}: {value}")

## Clean Up Repository

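Delete every existing file in the repository (except `.gitattributes`) so that nothing from the old schema remains. This step is destructive: after it runs, the repository holds no data files until the push step below succeeds.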

In [None]:
from huggingface_hub import HfApi, CommitOperationDelete

api = HfApi()

# NOTE: this removes every file except .gitattributes in a single commit, so
# the repository has no data files at HEAD until the push cell below succeeds.
print("Performing complete cleanup of repository...")
try:
    # Enumerate what currently exists in the dataset repository.
    existing_paths = api.list_repo_files(DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
    print(f"Files in repo: {len(existing_paths)} files")

    # Everything is removed except .gitattributes.
    doomed_paths = [path for path in existing_paths if path != '.gitattributes']

    if not doomed_paths:
        print("No files to delete")
    else:
        print(f"\nDeleting {len(doomed_paths)} files:")
        print("\n".join(f"  - {path}" for path in doomed_paths))

        # One commit carrying a delete operation per file.
        api.create_commit(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            operations=[CommitOperationDelete(path_in_repo=path) for path in doomed_paths],
            token=HF_TOKEN,
            commit_message="Delete all files before schema update",
        )

        print("\n✓ All files deleted!")

except Exception as err:
    # Show the failure and its traceback, then re-raise so execution stops here.
    print(f"Error during cleanup: {err}")
    import traceback
    traceback.print_exc()
    raise

## Push to Hugging Face

Upload the transformed dataset with the new schema.

In [None]:
# Summarize what is about to be uploaded.
print("Pushing to Hugging Face...")
print(f"  Repository: {DATASET_REPO}")
row_total = len(dataset)
print(f"  Rows: {row_total}")
column_total = len(dataset.column_names)
print(f"  Columns: {column_total}")

# Upload the transformed dataset to the same repository it was loaded from.
push_message = "Transform dataset: rename columns, remove old ones, add annotation fields"
dataset.push_to_hub(DATASET_REPO, token=HF_TOKEN, commit_message=push_message)

print("\n✓ Successfully pushed!")
dataset_url = f"https://huggingface.co/datasets/{DATASET_REPO}"
print(f"\nView at: {dataset_url}")