<a href="https://colab.research.google.com/github/b05902062/user-intention/blob/main/GenerateDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!huggingface-cli login

In [None]:
!pip install --upgrade datasets huggingface_hub fsspec

In [None]:
# Destination dataset repository on the Hugging Face Hub. main() reads this
# module-level name when it pushes the transformed dataset, so this statement
# must run before main() is called. Surrounding whitespace is stripped.
new_repo_name = input("Please enter the new Hugging Face repository name (e.g., 'your-username/your-repo'): ").strip()

In [None]:
import re
import json
from datasets import load_dataset, Dataset

# Fixed preamble that function-enabled system prompts start with. When present
# it is dropped so that only the function definitions following it remain.
_SYSTEM_PREFIX = (
    'SYSTEM: You are a helpful assistant with access to the following '
    'functions. Use them if required -\n'
)

# First USER turn and the ASSISTANT reply that follows it. The reply ends at the
# first of: <|endoftext|>, a FUNCTION RESPONSE block, the next USER turn, or the
# end of the text. Leading whitespace before the first "USER:" is tolerated.
_FIRST_TURN_RE = re.compile(
    r'\s*USER: (.*?)\n*\s*ASSISTANT: (.*?)'
    r'(?:<\|endoftext\|>|\n*\s*FUNCTION RESPONSE:|\n*\s*USER:|$)',
    re.DOTALL,
)

# A function call emitted by the assistant. DOTALL lets the payload span
# several lines; \s* tolerates any spacing after the tag.
_FUNCTION_CALL_RE = re.compile(r'<functioncall>\s*(\{.*\})', re.DOTALL)

# The called function's name inside the payload. The payload is not valid JSON
# (the arguments value is wrapped in single quotes), so the name is pulled out
# with a regex instead of json.loads.
_FUNCTION_NAME_RE = re.compile(r'"name"\s*:\s*"(.*?)"')


def transform_example(example):
    """
    Convert one row of the original dataset into the new structure.

    Args:
        example: Mapping with a 'system' string and a 'chat' string. The chat
            is expected to begin with a "USER: " turn followed by an
            "ASSISTANT: " turn.

    Returns:
        A dict with three string fields, or None when the row must be skipped
        (a warning is printed in that case):

        * 'system': the system prompt with the fixed function-access preamble
          removed when present; any other prompt is kept unchanged. Either way
          surrounding whitespace is stripped.
        * 'user': the first user message.
        * 'assistant': for a function call, a JSON string of the form
          {"function": {<name>: {"intent": "true"}}}; otherwise the plain text
          of the first assistant reply.

        None is returned when the chat does not contain a USER turn followed by
        an ASSISTANT turn, or when a function call carries no "name" entry.
    """
    # 1. System prompt: keep only what follows the known preamble, if any.
    system_text = example['system']
    if system_text.startswith(_SYSTEM_PREFIX):
        system_text = system_text[len(_SYSTEM_PREFIX):]

    # 2. First USER message and the ASSISTANT reply right after it.
    chat_content = example['chat']
    chat_match = _FIRST_TURN_RE.match(chat_content)
    if chat_match is None:
        print(f"Warning: Chat only has USER or ASSISTANT. Skipping example. Raw: {chat_content}")
        return None

    user_message = chat_match.group(1).strip()
    assistant_raw = chat_match.group(2).strip()

    # 3. Assistant reply: either a function-call intent or plain text.
    func_call_match = _FUNCTION_CALL_RE.search(assistant_raw)
    if func_call_match is None:
        # Plain reply. The turn regex already stops before <|endoftext|>; the
        # replace is kept as a harmless safeguard.
        assistant = assistant_raw.replace('<|endoftext|>', '').strip()
    else:
        func_call_str = func_call_match.group(1)
        name_match = _FUNCTION_NAME_RE.search(func_call_str)
        if name_match is None:
            print(f"Warning: Function name not found in function call string. Skipping example. Raw: {func_call_str}")
            return None
        assistant = json.dumps({
            "function": {
                name_match.group(1): {
                    "intent": "true"
                }
            }
        })

    return {
        'system': system_text.strip(),
        'user': user_message,
        'assistant': assistant,
    }

def main():
    """
    Load the source dataset, transform every row, and publish the result.

    Rows for which transform_example returns None, or raises, are counted as
    skipped and left out. A short sample of the transformed rows is printed
    before the dataset is pushed to the Hugging Face Hub.

    The destination repository is read from the module-level name
    `new_repo_name`, which must be defined before this function is called.
    Pushing requires an authenticated Hugging Face session; a failed push is
    reported with a printed message rather than raised.
    """
    source_repo = 'glaiveai/glaive-function-calling-v2'

    print(f"Loading dataset: {source_repo}")
    # Only the training split is processed.
    raw_rows = load_dataset(source_repo, split='train')
    print(f"Dataset loaded with {len(raw_rows)} examples.")

    print("Transforming dataset examples...")
    kept = []
    dropped = 0
    for row_index, row in enumerate(raw_rows):
        try:
            converted = transform_example(row)
            if converted is None:
                # The transformer asked for this row to be skipped.
                dropped += 1
            else:
                kept.append(converted)
        except Exception as err:
            # Best effort: one malformed row must not abort the whole run.
            print(f"Error transforming example {row_index}: {err}. Skipping row.")
            dropped += 1

    print(f"Finished transforming {len(kept)} examples. Skipped {dropped} rows due to errors.")

    # Build the new dataset from the surviving rows.
    published = Dataset.from_list(kept)
    print("New dataset created. Sample of transformed data:")
    # Show at most the first three rows for a quick visual check.
    sample_size = min(3, len(published))
    for ordinal in range(1, sample_size + 1):
        print(f"\n--- Transformed Example {ordinal} ---")
        print(json.dumps(published[ordinal - 1], indent=2))

    print(f"\nAttempting to push transformed dataset to Hugging Face Hub: {new_repo_name}")
    # Authentication must already be in place, e.g. via `huggingface-cli login`
    # or a programmatic huggingface_hub.login(token=...).
    try:
        published.push_to_hub(new_repo_name)
        print(f"Dataset successfully pushed to https://huggingface.co/datasets/{new_repo_name}")
    except Exception as err:
        print(f"Failed to push dataset to Hugging Face Hub. Please ensure you are logged in via 'huggingface-cli login' and have write permissions for '{new_repo_name}'. Error: {err}")

# Run the whole pipeline only when this code is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()