# Dataset Schema Update Script

In [20]:
import os
from pathlib import Path
from datasets import load_dataset, Dataset
from dotenv import load_dotenv

# Load environment variables from root .env
load_dotenv(Path('../..') / '.env')

# Configuration
DATASET_REPO = "Cantina/dj-image-train-data-20251105_v3"
HF_TOKEN = os.getenv('HUGGINGFACE_TOKEN')

if not HF_TOKEN:
    raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")

print(f"✓ Environment configured")
print(f"✓ Dataset: {DATASET_REPO}")

✓ Environment configured
✓ Dataset: Cantina/dj-image-train-data-20251105_v3


## Load the Dataset from Hugging Face

In [21]:
print("Loading dataset from Hugging Face...")
dataset = load_dataset(
    DATASET_REPO,
    split="train",
    token=HF_TOKEN
)

print(f"✓ Loaded {len(dataset)} rows")
print(f"\nCurrent schema:")
print(dataset.features)

print(f"\nFirst row sample:")
print(dataset[0])

Loading dataset from Hugging Face...


README.md:   0%|          | 0.00/629 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/8.01M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/877k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15023 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1670 [00:00<?, ? examples/s]

✓ Loaded 15023 rows

Current schema:
{'prompt_name': Value('string'), 'new_room_unified_format_input': Value('string'), 'unified_format_output_enriched_fixed': Value('string'), 'gpt5-results-20250905': Value('string'), 'gpt5-results-20251104': Value('string')}

First row sample:
{'prompt_name': 'mention_not_in_history_prompt', 'new_room_unified_format_input': 'ROOM MEMBERS:[\n  {"user_name": "dj-marley", "full_name": "Marley"},\n  {"user_name": "Lucas", "full_name": "Lucas"},\n  {"user_name": "Olivia", "full_name": "Olivia"},\n  {"user_name": "dj-aria", "full_name": "Aria"}\n]\n\nCHAT HISTORY:\n\n\ndj-marley (bot): Just wrapped up my blues-rock playlist, lots of soulful guitar tonight. Hope everyone vibed with it!\ndj-marley (bot): That Stevie Ray Vaughan track always hits different, especially late at night.\nLucas: Totally agree, Marley, SRV\'s solos are next-level. Always makes me nostalgic.\ndj-marley (bot): For real, Lucas! "Texas Flood" is pure art. Glad you liked it!\nOlivia: Yo

## Transform Dataset Schema

Rename columns, remove old ones, and add annotation tracking fields.

In [22]:
# Step 1: Rename columns
print("Renaming columns...")
dataset = dataset.rename_column("new_room_unified_format_input", "input")
dataset = dataset.rename_column("gpt5-results-20251104", "output")
print("✓ Renamed: new_room_unified_format_input -> input")
print("✓ Renamed: gpt5-results-20251104 -> output")

# Step 2: Remove unwanted columns
print("\nRemoving columns...")
columns_to_remove = ["unified_format_output_enriched_fixed", "gpt5-results-20250905"]
dataset = dataset.remove_columns(columns_to_remove)
print(f"✓ Removed: {', '.join(columns_to_remove)}")

# Step 3: Add annotation columns
print("\nAdding annotation columns...")
def add_annotation_columns(example):
    example['manually_reviewed'] = False
    example['manually_reviewed_ts'] = 0
    example['last_updated_ts'] = ''
    return example

dataset = dataset.map(add_annotation_columns)
print("✓ Added: manually_reviewed (bool)")
print("✓ Added: manually_reviewed_ts (int64)")
print("✓ Added: last_updated_ts (string)")

print(f"\n✓ Transformation complete!")
print(f"\nFinal schema:")
print(dataset.features)

Renaming columns...
✓ Renamed: new_room_unified_format_input -> input
✓ Renamed: gpt5-results-20251104 -> output

Removing columns...
✓ Removed: unified_format_output_enriched_fixed, gpt5-results-20250905

Adding annotation columns...


Map:   0%|          | 0/15023 [00:00<?, ? examples/s]

✓ Added: manually_reviewed (bool)
✓ Added: manually_reviewed_ts (int64)
✓ Added: last_updated_ts (string)

✓ Transformation complete!

Final schema:
{'prompt_name': Value('string'), 'input': Value('string'), 'output': Value('string'), 'manually_reviewed': Value('bool'), 'manually_reviewed_ts': Value('int64'), 'last_updated_ts': Value('string')}


## Validate Transformed Dataset

In [23]:
print("Final columns:")
for col_name in dataset.column_names:
    print(f"  ✓ {col_name}")

print(f"\nTotal rows: {len(dataset)}")

print("\nSample row:")
sample = dataset[0]
for key in ['prompt_name', 'input', 'output', 'manually_reviewed', 'manually_reviewed_ts', 'last_updated_ts']:
    if key in sample:
        value = sample[key]
        if isinstance(value, str) and len(value) > 100:
            value = value[:100] + "..."
        print(f"  {key}: {value}")

Final columns:
  ✓ prompt_name
  ✓ input
  ✓ output
  ✓ manually_reviewed
  ✓ manually_reviewed_ts
  ✓ last_updated_ts

Total rows: 15023

Sample row:
  prompt_name: mention_not_in_history_prompt
  input: ROOM MEMBERS:[
  {"user_name": "dj-marley", "full_name": "Marley"},
  {"user_name": "Lucas", "full_n...
  output: {"action": "dj", "requester": "Lucas", "requested_users": ["dj-aria"], "action_metadata": {"prompt":...
  manually_reviewed: False
  manually_reviewed_ts: 0
  last_updated_ts: 


## Clean Up Repository

Delete all existing files (except .gitattributes) to ensure a clean schema update.

In [25]:
from huggingface_hub import HfApi, CommitOperationDelete

api = HfApi()

print("Performing complete cleanup of repository...")
try:
    # List all files in the repo
    repo_files = api.list_repo_files(DATASET_REPO, repo_type="dataset", token=HF_TOKEN)

    print(f"Files in repo: {len(repo_files)} files")

    # Files to delete - everything except .gitattributes
    files_to_delete = [f for f in repo_files if f != '.gitattributes']

    if files_to_delete:
        print(f"\nDeleting {len(files_to_delete)} files:")
        for file in files_to_delete:
            print(f"  - {file}")

        # Delete all files in one commit
        operations = [CommitOperationDelete(path_in_repo=file) for file in files_to_delete]

        api.create_commit(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            operations=operations,
            token=HF_TOKEN,
            commit_message="Delete all files before schema update"
        )

        print("\n✓ All files deleted!")
    else:
        print("No files to delete")

except Exception as e:
    print(f"Error during cleanup: {e}")
    import traceback
    traceback.print_exc()
    raise

Performing complete cleanup of repository...
Files in repo: 1 files
No files to delete


## Push to Hugging Face

Upload the transformed dataset with the new schema.

In [26]:
print("Pushing to Hugging Face...")
print(f"  Repository: {DATASET_REPO}")
print(f"  Rows: {len(dataset)}")
print(f"  Columns: {len(dataset.column_names)}")

dataset.push_to_hub(
    DATASET_REPO,
    token=HF_TOKEN,
    commit_message="Transform dataset: rename columns, remove old ones, add annotation fields"
)

print(f"\n✓ Successfully pushed!")
print(f"\nView at: https://huggingface.co/datasets/{DATASET_REPO}")

Pushing to Hugging Face...
  Repository: Cantina/dj-image-train-data-20251105_v3
  Rows: 15023
  Columns: 6


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            


✓ Successfully pushed!

View at: https://huggingface.co/datasets/Cantina/dj-image-train-data-20251105_v3
