# In [None]:
import datasets
from pathlib import Path
from typing import List, Optional, Union

import numpy as np
from gluonts.dataset.arrow import ArrowWriter
import pyarrow as pa
import pandas as pd

def convert_to_arrow_chunked(
    path: Union[str, Path],
    dataset,
    chunk_size: int = 10000,
    compression: str = "lz4",
) -> None:
    """Write ``dataset`` to an Arrow IPC file, ``chunk_size`` rows at a time.

    Each output row has two fields: ``"start"`` (the first element of the
    row's ``"timestamp"`` value) and ``"target"`` (the row's ``"target"``
    value unchanged). The Arrow schema is taken from the first chunk and
    reused for the rest of the file.

    Args:
        path: Destination file; overwritten if it already exists.
        dataset: Object supporting ``len()`` and slice indexing, where a
            slice can be indexed by column name (``"target"``,
            ``"timestamp"``) to get per-row values. This matches what a
            Hugging Face ``datasets.Dataset`` appears to return -- confirm
            for other dataset types.
        chunk_size: Number of rows converted and written per batch.
        compression: Codec for the record batch buffers (for example
            ``"lz4"`` or ``"zstd"``); ``None`` writes uncompressed data.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``dataset`` is
            empty (there would be no chunk to infer a schema from).
    """
    # Validate before opening the file so a bad call never leaves an empty,
    # unreadable file behind.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size}"
        )

    total_rows = len(dataset)
    if total_rows == 0:
        raise ValueError(
            "dataset is empty; cannot infer an Arrow schema to write"
        )

    writer = None
    rows_done = 0

    # str() lets callers pass either a plain string or a pathlib.Path.
    with pa.OSFile(str(path), "wb") as sink:
        try:
            for offset in range(0, total_rows, chunk_size):
                # Indexed by column name below, so len(chunk) would count
                # columns, not rows.
                chunk = dataset[offset:offset + chunk_size]

                records = [
                    {"start": timestamp_val[0], "target": target_val}
                    for target_val, timestamp_val in zip(
                        chunk["target"], chunk["timestamp"]
                    )
                ]

                # Go through pandas so type inference for the two columns
                # stays the same as before.
                table_chunk = pa.Table.from_pandas(pd.DataFrame(records))

                # The writer is created lazily because the schema is only
                # known once the first chunk has been converted.
                if writer is None:
                    writer = pa.ipc.new_file(
                        sink,
                        table_chunk.schema,
                        options=pa.ipc.IpcWriteOptions(
                            compression=compression
                        ),
                    )

                writer.write(table_chunk)
                rows_done += len(records)
                print(f"Processed {rows_done} / {total_rows} rows")
        finally:
            # Close the writer (which writes the file footer) while the
            # underlying file is still open, even if a chunk failed.
            if writer is not None:
                writer.close()

# Fetch the "train" split of the TSMixup training corpus from the
# autogluon/chronos_datasets repository.
ds = datasets.load_dataset(
    path="autogluon/chronos_datasets",
    name="training_corpus_tsmixup_10m",
    split="train",
)
# Request numpy-formatted values when rows are read back.
ds.set_format(type="numpy")
# Write the dataset to a local Arrow file, 10000 rows per chunk.
convert_to_arrow_chunked(
    path="./tsmixup-data.arrow",
    dataset=ds,
    chunk_size=10000,
)
