In [5]:
# =========================
# SETUP
# =========================

# Install dependencies
!pip install gluonts pandas numpy datasets pyarrow -q

# Import common libraries
import pandas as pd
import numpy as np
import os

# Create folder to save outputs
os.makedirs("datasets_csv", exist_ok=True)

print("Setup done ✅")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m286.7/491.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━

In [7]:
# =========================
# M4 Monthly
# =========================

# Import
from gluonts.dataset.repository.datasets import get_dataset

# Download
m4_dataset = get_dataset("m4_monthly")

# Extract and process
train_list = []
for i, entry in enumerate(m4_dataset.train):
    if i >= 100:  # Only 100 samples
        break
    ts = entry['target']
    id_ = entry['item_id']
    df = pd.DataFrame({
        'id': [id_] * len(ts),
        'value': ts,
        'timestamp': pd.date_range(start=entry['start'].to_timestamp(), periods=len(ts), freq='M')
    })
    train_list.append(df)

# Combine
m4_monthly_df = pd.concat(train_list, ignore_index=True)

# Save
m4_monthly_df.to_csv("datasets_csv/m4_monthly.csv", index=False)
print("Saved M4 monthly dataset ✅")


  'timestamp': pd.date_range(start=entry['start'].to_timestamp(), periods=len(ts), freq='M')


Saved M4 monthly dataset ✅


In [9]:
# =========================
# GluonTS Datasets
# =========================

# Function to download and save GluonTS dataset
def save_gluonts_dataset(name, filename_prefix):
    from gluonts.dataset.repository.datasets import get_dataset

    dataset = get_dataset(name)
    dfs = []
    count = 0
    for entry in dataset.train:
        if count >= 100:  # 100 samples only
            break
        ts = entry['target']
        id_ = entry['item_id']
        df = pd.DataFrame({
            'id': [id_] * len(ts),
            'value': ts,
            'timestamp': pd.date_range(start=entry['start'].to_timestamp(), periods=len(ts), freq='H')
        })
        dfs.append(df)
        count += 1
    final_df = pd.concat(dfs, ignore_index=True)
    final_df.to_csv(f"datasets_csv/{filename_prefix}.csv", index=False)
    print(f"Saved {filename_prefix} dataset ✅")

# Download and Save
save_gluonts_dataset("electricity", "gluonts_electricity")
save_gluonts_dataset("traffic", "gluonts_traffic")
save_gluonts_dataset("exchange_rate", "gluonts_exchange_rate")


  'timestamp': pd.date_range(start=entry['start'].to_timestamp(), periods=len(ts), freq='H')


Saved gluonts_electricity dataset ✅


  'timestamp': pd.date_range(start=entry['start'].to_timestamp(), periods=len(ts), freq='H')


Saved gluonts_traffic dataset ✅


  time_index = pd.period_range(
  time_index = pd.period_range(
  training_end + i * ds_info.prediction_length
  prediction_start_date + ds_info.prediction_length
  return pd.Period(val, freq)
  'timestamp': pd.date_range(start=entry['start'].to_timestamp(), periods=len(ts), freq='H')
  'timestamp': pd.date_range(start=entry['start'].to_timestamp(), periods=len(ts), freq='H')


Saved gluonts_exchange_rate dataset ✅


In [10]:
# =========================
# ETTm2 Dataset
# =========================

# Import
import pandas as pd

# Download the ETTm2 dataset
!wget -q https://raw.githubusercontent.com/zhouhaoyi/ETDataset/main/ETT-small/ETTm2.csv -P datasets_csv/

# Load
ett_df = pd.read_csv("datasets_csv/ETTm2.csv")
ett_df['date'] = pd.to_datetime(ett_df['date'])

# Resample to Monthly mean
ett_monthly = ett_df.set_index('date').resample('M').mean().reset_index()

# Take first 100 timestamps for simplicity
ett_monthly_sampled = ett_monthly.head(100)

# Save
ett_monthly_sampled.to_csv("datasets_csv/ettm2_monthly.csv", index=False)
print("Saved ETTm2 monthly dataset ✅")


Saved ETTm2 monthly dataset ✅


  ett_monthly = ett_df.set_index('date').resample('M').mean().reset_index()


In [11]:
# =========================
# BIGDATA22 (Simulated)
# =========================

# Import
import numpy as np
import pandas as pd

# Simulate stock price data
dates = pd.date_range(start='2010-01-01', end='2020-12-31', freq='B')  # Business days
np.random.seed(42)

stocks = [f'Stock{i}' for i in range(5)]  # Simulate 5 stock series
data = {stock: 100 + np.cumsum(np.random.randn(len(dates))) for stock in stocks}
bigdata_df = pd.DataFrame(data, index=dates)

# Resample to Monthly
bigdata_monthly = bigdata_df.resample('M').last().reset_index()

# Melt into long format
bigdata_monthly_melted = bigdata_monthly.melt(id_vars=['index'], var_name='id', value_name='value')
bigdata_monthly_melted.rename(columns={'index': 'timestamp'}, inplace=True)

# Take 100 rows
bigdata_monthly_sampled = bigdata_monthly_melted.head(100)

# Save
bigdata_monthly_sampled.to_csv("datasets_csv/bigdata22_monthly.csv", index=False)
print("Saved BIGDATA22 monthly dataset ✅")


Saved BIGDATA22 monthly dataset ✅


  bigdata_monthly = bigdata_df.resample('M').last().reset_index()


In [13]:
# =========================
# Weather Dataset
# =========================

# Import
import pandas as pd

# Download sample weather data
!wget -q https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-min-temperatures.csv -P datasets_csv/

# Load
weather_df = pd.read_csv("datasets_csv/daily-min-temperatures.csv")
weather_df['Date'] = pd.to_datetime(weather_df['Date'])

# Resample to Monthly mean
weather_monthly = weather_df.set_index('Date').resample('M').mean().reset_index()

# Take first 100 rows
weather_monthly_sampled = weather_monthly.head(100)

# Save
weather_monthly_sampled.to_csv("datasets_csv/weather_monthly.csv", index=False)
print("Saved Weather monthly dataset ✅")


Saved Weather monthly dataset ✅


  weather_monthly = weather_df.set_index('Date').resample('M').mean().reset_index()


In [14]:
# First zip the folder
!zip -r datasets_csv.zip datasets_csv

# Then download
from google.colab import files
files.download('datasets_csv.zip')


  adding: datasets_csv/ (stored 0%)
  adding: datasets_csv/m4_monthly.csv (deflated 76%)
  adding: datasets_csv/daily-min-temperatures.csv (deflated 79%)
  adding: datasets_csv/ETTm2.csv (deflated 84%)
  adding: datasets_csv/gluonts_electricity.csv (deflated 86%)
  adding: datasets_csv/ettm2_monthly.csv (deflated 48%)
  adding: datasets_csv/weather_monthly.csv (deflated 74%)
  adding: datasets_csv/gluonts_traffic.csv (deflated 85%)
  adding: datasets_csv/bigdata22_monthly.csv (deflated 64%)
  adding: datasets_csv/gluonts_exchange_rate.csv (deflated 83%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>