# Dataset Creator Example Notebook

This notebook demonstrates how to use the Dataset Creator library to create datasets from GitHub repositories.

In [None]:
# Import required libraries
import json
import os
from pathlib import Path

# Import dataset tools
from config.credentials_manager import CredentialsManager
from github.client import GitHubClient
from huggingface.dataset_creator import DatasetCreator

## Configure credentials

First, we need to set up our credentials for GitHub and Hugging Face.

In [None]:
# Credentials are resolved through the project's CredentialsManager; where it
# loads them from is not shown in this notebook.
credentials = CredentialsManager()

# GitHub: unpack the (username, token) pair and echo only the username, so the
# token itself never ends up in the saved cell output.
github_credentials = credentials.get_github_credentials()
github_username, github_token = github_credentials
print(f"GitHub username: {github_username}")

# Hugging Face: same pattern -- the token is kept in a variable, not printed.
hf_credentials = credentials.get_huggingface_credentials()
hf_username, hf_token = hf_credentials
print(f"HuggingFace username: {hf_username}")

## Create a dataset

Now let's list the files in a directory of the `huggingface/datasets` repository and create a small dataset from the first five of them.

In [None]:
# Build a GitHub client authenticated with the token loaded earlier
github_client = GitHubClient(token=github_token)

# Repository and directory whose contents we want to list
repo_owner, repo_name = "huggingface", "datasets"
files = github_client.get_repository_contents(
    repo_owner,
    repo_name,
    path="src/datasets/packaged_modules/csv",
)

# Preview the names of at most the first five returned entries
for file_info in files[:5]:
    print(f"File: {file_info['name']}")

In [None]:
# Instantiate the dataset builder with its default configuration
dataset_creator = DatasetCreator()

# Build a small sample dataset from at most the first five fetched entries.
# create_dataset returns a (success, dataset) pair.
success, dataset = dataset_creator.create_dataset(
    file_data_list=files[:5],
    dataset_name="github_files_sample",
    description="A sample dataset created from GitHub files",
)

# Report the outcome; len(dataset) is only evaluated when creation succeeded
print(
    f"Successfully created dataset with {len(dataset)} examples"
    if success
    else "Failed to create dataset"
)