In [56]:
import datasets
from pathlib import Path
from typing import List, Optional, Union

import numpy as np
from gluonts.dataset.arrow import ArrowWriter


def convert_to_arrow_chunked(
    path: Union[str, Path],
    dataset,
    chunk_size: int = 10000,
    compression: str = "lz4",
):
    import pyarrow as pa
    import pandas as pd
    
    first_chunk = True
    
    with pa.OSFile(path, 'wb') as f:
        for i in range(0, len(dataset), chunk_size):
            chunk = dataset[i:i + chunk_size]
            chunk_data = []
            
            # Get the values as lists
            ids = chunk['id']
            targets = chunk['target']
            timestamps = chunk['timestamp']
            # Zip the values together and process each row
            for id_val, target_val, timestamp_val in zip(ids, targets, timestamps):
                chunk_data.append({
                    "start": timestamp_val[0],
                    "target": target_val
                })
            
            # Convert to Arrow table
            df_chunk = pd.DataFrame(chunk_data)
            table_chunk = pa.Table.from_pandas(df_chunk)
            
            # Initialize writer with schema from first chunk
            if first_chunk:
                schema = table_chunk.schema
                writer = pa.ipc.new_file(f, schema)
                first_chunk = False
            
            # Write chunk
            writer.write(table_chunk)
            print(f"Processed {i + len(chunk)} / {len(dataset)} rows")
            break
        writer.close()

# # Load and process dataset
# ds = datasets.load_dataset("autogluon/chronos_datasets", "training_corpus_tsmixup_10m", split="train")
# ds.set_format("numpy")



In [57]:
# Convert to arrow format in chunks
convert_to_arrow_chunked("./tsmixup-data.arrow", ds, chunk_size=10000)

Processed 3 / 10000000 rows
Processed 10003 / 10000000 rows
Processed 20003 / 10000000 rows
Processed 30003 / 10000000 rows
Processed 40003 / 10000000 rows
Processed 50003 / 10000000 rows
Processed 60003 / 10000000 rows
Processed 70003 / 10000000 rows
Processed 80003 / 10000000 rows
Processed 90003 / 10000000 rows
Processed 100003 / 10000000 rows
Processed 110003 / 10000000 rows
Processed 120003 / 10000000 rows
Processed 130003 / 10000000 rows
Processed 140003 / 10000000 rows
Processed 150003 / 10000000 rows
Processed 160003 / 10000000 rows
Processed 170003 / 10000000 rows
Processed 180003 / 10000000 rows
Processed 190003 / 10000000 rows
Processed 200003 / 10000000 rows
Processed 210003 / 10000000 rows
Processed 220003 / 10000000 rows
Processed 230003 / 10000000 rows
Processed 240003 / 10000000 rows
Processed 250003 / 10000000 rows
Processed 260003 / 10000000 rows
Processed 270003 / 10000000 rows
Processed 280003 / 10000000 rows
Processed 290003 / 10000000 rows
Processed 300003 / 10000

In [3]:
# Process in chunks
import pyarrow as pa
import pandas as pd

chunk_size = 5000  # Adjust this based on your available RAM
schema = None
first_chunk = True
processed_rows = 0

with pa.OSFile('tsmixup-data.arrow', 'wb') as f:
    # Iterate over the streaming dataset directly
    for chunk in ds.iter(batch_size=chunk_size):
        # Convert to pandas and process
        df_chunk = pd.DataFrame(chunk)
        df_chunk['start'] = df_chunk['timestamp'].apply(lambda x: pd.to_datetime(x[0]))
        df_chunk = df_chunk[['start', 'target']]  # Keep only needed columns
        
        # Convert to Arrow table
        table_chunk = pa.Table.from_pandas(df_chunk)
        
        # Initialize writer with schema from first chunk
        if first_chunk:
            schema = table_chunk.schema
            writer = pa.ipc.new_file(f, schema)
            first_chunk = False
    
        # Write chunk
        writer.write(table_chunk)
        
        # Update and print progress
        processed_rows += len(df_chunk)
        print(f"Processed {processed_rows} rows")
    
    writer.close()

Processed 5000 rows
Processed 10000 rows
Processed 15000 rows
Processed 20000 rows
Processed 25000 rows
Processed 30000 rows
Processed 35000 rows
Processed 40000 rows
Processed 45000 rows
Processed 50000 rows
Processed 55000 rows
Processed 60000 rows
Processed 65000 rows
Processed 70000 rows
Processed 75000 rows
Processed 80000 rows
Processed 85000 rows
Processed 90000 rows
Processed 95000 rows
Processed 100000 rows
Processed 105000 rows
Processed 110000 rows
Processed 115000 rows
Processed 120000 rows
Processed 125000 rows
Processed 130000 rows
Processed 135000 rows
Processed 140000 rows
Processed 145000 rows
Processed 150000 rows
Processed 155000 rows
Processed 160000 rows
Processed 165000 rows
Processed 170000 rows
Processed 175000 rows
Processed 180000 rows
Processed 185000 rows
Processed 190000 rows
Processed 195000 rows
Processed 200000 rows
Processed 205000 rows
Processed 210000 rows
Processed 215000 rows
Processed 220000 rows
Processed 225000 rows
Processed 230000 rows
Processe

In [42]:
!python ../scripts/training/train.py --config ./config.yaml

2024-12-31 23:50:15,273 - /home/arda/Documents/chronos-forecasting/notebooks/../scripts/training/train.py - INFO - Using SEED: 4034910953
2024-12-31 23:50:15,277 - /home/arda/Documents/chronos-forecasting/notebooks/../scripts/training/train.py - INFO - Logging dir: output/run-8
2024-12-31 23:50:15,277 - /home/arda/Documents/chronos-forecasting/notebooks/../scripts/training/train.py - INFO - Loading and filtering 2 datasets for training: ['/home/arda/Documents/chronos-forecasting/data/kernelsynth-data.arrow', '/home/arda/Documents/chronos-forecasting/notebooks/tsmixup-data.arrow']
2024-12-31 23:50:15,277 - /home/arda/Documents/chronos-forecasting/notebooks/../scripts/training/train.py - INFO - Mixing probabilities: [0.9, 0.1]
Traceback (most recent call last):
  File "/home/arda/Documents/chronos-forecasting/notebooks/../scripts/training/train.py", line 702, in <module>
    app()
  File "/home/arda/anaconda3/envs/chronos/lib/python3.10/site-packages/typer/main.py", line 340, in __call__

In [10]:
import pyarrow.parquet as pq
import pyarrow as pa

# Open the Arrow file
with pa.memory_map('tsmixup-data.arrow', 'r') as source:
    reader = pa.ipc.open_file(source)
    table = reader.read_all()

# Convert to pandas DataFrame for easier inspection
df = table.to_pandas()
print("Data shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nData info:")
print(df.info())


Data shape: (11000, 2)

First few rows:
       start                                             target
0 1970-01-01  [0.613463059343371, 0.5711616398408842, 0.5182...
1 1970-01-01  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...
2 1970-01-01  [1.0770299311167582, 4.308119724467033, 1.0770...
3 1970-01-01  [0.004624637833987918, 0.00547248810355237, 0....
4 1970-01-01  [0.4764301971138915, 0.49643825495907135, 0.53...

Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11000 entries, 0 to 10999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   start   11000 non-null  datetime64[ns]
 1   target  11000 non-null  object        
dtypes: datetime64[ns](1), object(1)
memory usage: 172.0+ KB
None


In [6]:
import pyarrow.parquet as pq
import pyarrow as pa

# Open the Arrow file
with pa.memory_map('/home/arda/Documents/chronos-forecasting/data/kernelsynth-data.arrow', 'r') as source:
    reader = pa.ipc.open_file(source)
    table = reader.read_all()

# Convert to pandas DataFrame for easier inspection
df = table.to_pandas()
print("Data shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nData info:")
print(df.info())


Data shape: (100000, 2)

First few rows:
       start                                             target
0 2000-01-01  [0.0847358005868161, 0.08490204073316547, 0.08...
1 2000-01-01  [0.5819332221162387, 0.4920879847852638, 0.327...
2 2000-01-01  [0.7310408977923173, 0.42255510512682065, 0.59...
3 2000-01-01  [0.25454405428260085, 0.26002409188736336, 0.2...
4 2000-01-01  [0.09628723019827937, 0.6191403183181314, -0.1...

Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype        
---  ------  --------------   -----        
 0   start   100000 non-null  datetime64[s]
 1   target  100000 non-null  object       
dtypes: datetime64[s](1), object(1)
memory usage: 1.5+ MB
None
