# Dataset Management with Weave

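This notebook demonstrates the dataset management capabilities of the Weave framework: loading data from several kinds of sources, working with HuggingFace datasets, streaming large files, merging synthetic with real data, and preprocessing and splitting a dataset. The file paths and dataset identifiers used below (for example `path/to/data.csv` and `owner/dataset-name`) are placeholders; replace them with real locations before running the cells.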

In [None]:
from weave.datasets import (
    DatasetLoader,
    DatasetMerger,
    HuggingFaceDataset,
    StreamingDataset
)
import pandas as pd

## 1. Loading Data from Various Sources

Use a single `DatasetLoader` to load data from three kinds of sources: a local CSV file, a Kaggle dataset, and a SQLite database.

In [None]:
# Locations of the three example sources (placeholders): a local CSV file,
# a Kaggle-hosted file, and a SQLite database.
csv_source = "path/to/data.csv"
kaggle_source = "kaggle://username/dataset/file.csv"
sql_source = "sqlite:///path/to/database.db"

# One loader, created with no configuration, is reused for every source.
# NOTE(review): presumably load() picks the backend from the form of the
# location string (plain path, kaggle://, sqlite:///) — confirm against the
# DatasetLoader documentation.
loader = DatasetLoader()

csv_data = loader.load(csv_source)
kaggle_data = loader.load(kaggle_source)
sql_data = loader.load(sql_source)

# Printed whenever the cell gets this far; it does not itself inspect the
# loaded objects.
print("Data sources loaded successfully!")

## 2. HuggingFace Dataset Integration

Load a dataset from the HuggingFace Hub with `HuggingFaceDataset`, then inspect its features and available splits.

In [None]:
# Options for the HuggingFace wrapper: which split to use and a per-column
# transform mapping.
# NOTE(review): "join_tokens" is presumably the name of a built-in transform
# applied to the "text" column — confirm against HuggingFaceDataset.
hf_options = {
    "split": "train",
    "transforms": {
        "text": "join_tokens"
    }
}
hf_dataset = HuggingFaceDataset(hf_options)

# "owner/dataset-name" is a placeholder dataset identifier.
data = hf_dataset.load("owner/dataset-name")

# Inspect the loaded dataset: first its features, then its splits.
hf_features = data.get_features()
print(f"Features: {hf_features}")

hf_splits = data.get_splits()
print(f"Available splits: {hf_splits}")

## 3. Streaming Large Datasets

Stream a large CSV file with `StreamingDataset`: iterate over it in chunks (sized by the `chunk_size` setting), then take the first five records.

In [None]:
# Streaming configuration; chunk_size is the only option set here.
stream_config = {"chunk_size": 1000}

# The dataset is used as a context manager, so its __exit__ runs when the
# block ends.
with StreamingDataset(stream_config) as stream:
    stream.load("path/to/large_file.csv")

    # Walk the stream chunk by chunk and report how large each chunk is.
    for batch in stream.iter_chunks():
        batch_size = len(batch)
        print(f"Processing chunk of size: {batch_size}")

    # NOTE(review): take() is called after a full pass over iter_chunks();
    # confirm that StreamingDataset can still return the first records at
    # this point rather than reading from an exhausted stream.
    head = stream.take(5)

    # A single print with sep="\n" emits the heading and the records on
    # separate lines, exactly as two consecutive print() calls would.
    print("\nFirst 5 records:", head, sep="\n")

## 4. Merging Synthetic and Real Data

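Combine synthetic and real data with `DatasetMerger`, trying both the `append` and `mix` merge strategies, and then compare the distributions of the two datasets.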

In [None]:
# Two tiny in-memory frames stand in for a real and a synthetic dataset.
# Both use the same columns: "text" and "label".
real_data = pd.DataFrame({
    "text": ["Example 1", "Example 2"],
    "label": [0, 1]
})

synthetic_data = pd.DataFrame({
    "text": ["Synthetic 1", "Synthetic 2"],
    "label": [1, 0]
})

# Value passed as `ratio` to the "mix" strategy. It is defined once so the
# merge call and the printed label below cannot drift apart when tuned.
SYNTHETIC_RATIO = 0.3

merger = DatasetMerger()

# Strategy "append": merge the two frames with the append strategy.
append_result = merger.merge(real_data, synthetic_data, strategy="append")
print("\nAppend strategy:")
print(append_result)

# Strategy "mix": merge using the configured ratio.
# NOTE(review): the label below describes the ratio as the synthetic share of
# the result — confirm that this matches DatasetMerger's definition.
mix_result = merger.merge(
    real_data,
    synthetic_data,
    strategy="mix",
    ratio=SYNTHETIC_RATIO,
)
# The `.0%` format renders 0.3 as "30%", so the label always reflects the
# ratio that was actually used.
print(f"\nMix strategy ({SYNTHETIC_RATIO:.0%} synthetic):")
print(mix_result)

# Compare the real and synthetic datasets.
analysis = merger.analyze_distribution(real_data, synthetic_data)
print("\nDistribution analysis:")
print(analysis)

## 5. Dataset Preprocessing

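Configure a `DatasetLoader` with preprocessing options, preprocess the loaded data, and split the result into train, validation, and test sets (80% / 10% / 10%).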

In [None]:
# Preprocessing options handed to the loader.
# NOTE(review): the exact effect of each key (fill value, duplicate removal,
# per-column type conversions) is defined by DatasetLoader and not visible
# here; "numeric_col" / "category_col" look like placeholder column names.
preprocessing_options = {
    "fill_value": 0,
    "drop_duplicates": True,
    "type_conversions": {
        "numeric_col": "float32",
        "category_col": "category"
    }
}
loader = DatasetLoader(preprocessing_options)

# Two explicit steps: load the file, then preprocess what was loaded.
loaded = loader.load("path/to/data.csv")
data = loaded.preprocess()

# 80% / 10% / 10% train / validation / test split, with shuffling enabled and
# the seed fixed at 42.
split_options = {
    "train_ratio": 0.8,
    "val_ratio": 0.1,
    "test_ratio": 0.1,
    "shuffle": True,
    "seed": 42,
}
splits = data.split(**split_options)

# Report how many records ended up in each split.
print("Dataset splits:")
for split_name, subset in splits.items():
    print(f"{split_name}: {len(subset)} records")