# MTSFM Dataset Processing

This notebook processes macroeconomic datasets from Hugging Face and prepares them for use with the MacroTS model.

## Setup

First, let's install the required packages if they're not already installed.

In [None]:
# Install required packages
!pip install datasets pandas python-dotenv

## Configuration

Load environment variables and set up dataset paths.

In [None]:
import os
import pandas as pd
import glob
import re
from collections import defaultdict
from typing import Any, Generator, List
from datasets import load_dataset, Features, Sequence, Value, Dataset
from dotenv import load_dotenv

# Pull settings from a local .env file (if any) into the process environment
load_dotenv()

# Base locations: taken from the environment, falling back to ./dataset
MACRO_ECON_PATH = os.getenv("MACRO_ECON_PATH", "./dataset")
CUSTOM_DATA_PATH = os.getenv("CUSTOM_DATA_PATH", "./dataset")

# Raw per-country CSV files live under MACRO_ECON_PATH,
# the generated Hugging Face datasets under CUSTOM_DATA_PATH
csv_dataset_path = os.path.join(MACRO_ECON_PATH, "csv_dataset")
train_val_path = os.path.join(CUSTOM_DATA_PATH, "train_val_split")

# Make sure both working directories exist before anything writes to them
for required_dir in (csv_dataset_path, train_val_path):
    os.makedirs(required_dir, exist_ok=True)

# Echo the effective configuration
print(f"Using MACRO_ECON_PATH: {MACRO_ECON_PATH}")
print(f"Using CUSTOM_DATA_PATH: {CUSTOM_DATA_PATH}")
print(f"CSV dataset directory: {csv_dataset_path}")
print(f"Training/validation split directory: {train_val_path}")

## Download Country Data

First, we'll download data for each country from the MacroEcon Hugging Face dataset.

In [None]:
# Read the "features" configuration of the MacroEcon dataset and collect the
# values of its 'Country Code' column. Best effort: on any failure the error is
# printed and the list is left empty, so the download loop simply does nothing.
try:
    print("Loading dataset info...")
    dataset_info = load_dataset("bkoyuncu/MacroEcon", "features")
    country_codes = dataset_info['train'][:]['Country Code']
except Exception as e:
    print(f"Error loading dataset info: {e}")
    country_codes = []
else:
    print(f"Found {len(country_codes)} country codes")

    # Show a small sample rather than the full list
    print(f"Sample country codes: {country_codes[:5]}")

In [None]:
# output_path

In [None]:
# Download each country's table once and cache it as
# <csv_dataset_path>/<country>_features.csv.
for country in country_codes:
    config_name = f"{country}_data"
    output_path = os.path.join(csv_dataset_path, f"{country}_features.csv")
    # The CSV is first written to a staging file and only renamed to its final
    # name once complete. Otherwise an interrupted write would leave a truncated
    # CSV behind that the "already exists" check below would skip forever.
    partial_path = f"{output_path}.part"

    # Skip if file already exists
    if os.path.exists(output_path):
        print(f"File already exists for {country}, skipping download")
        continue

    try:
        print(f"Processing country: {country}")
        # Load the dataset configuration for this country
        country_dataset = load_dataset("bkoyuncu/MacroEcon", config_name)

        # Convert the 'train' split to a DataFrame and save it as CSV
        country_df = pd.DataFrame(country_dataset['train'])
        country_df.to_csv(partial_path, index=False)
        os.replace(partial_path, output_path)
        print(f"Saved {country} data to {output_path}")
    except Exception as e:
        # Best effort: report the failure and move on to the next country
        print(f"Error loading dataset for {country}: {e}")
        # Do not leave a half-written staging file behind
        if os.path.exists(partial_path):
            os.remove(partial_path)

## Helper Functions

Define functions to process the data and create datasets.

In [None]:
def flatten_list_of_lists(list_of_lists):
    """
    Concatenate the elements of every inner iterable into one flat list,
    preserving order. Only one level of nesting is removed.
    """
    flattened = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened

def get_file_name(file_path):
    """
    Return the last path component of file_path with its final extension removed.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return stem

def filter_dataframe_by_time_span(df, start_time, end_time):
    """
    Return the rows of df whose index falls within [start_time, end_time].

    Both bounds are inclusive. The index of df is parsed with pd.to_datetime
    for the comparison, and the returned frame carries that datetime index.
    The input frame itself is left untouched (its index is not reassigned).

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose index holds datetimes or values pd.to_datetime can parse.
    start_time, end_time :
        Bounds that can be compared against a DatetimeIndex, e.g. date strings
        such as '1984-01-01'.

    Returns
    -------
    pd.DataFrame
        A new frame with the selected rows and a datetime index.
    """
    # Parse the index locally instead of assigning it back onto the caller's df
    datetime_index = pd.to_datetime(df.index)

    # Boolean mask of the rows inside the inclusive time span
    in_span = (datetime_index >= start_time) & (datetime_index <= end_time)

    filtered_df = df.loc[in_span]
    filtered_df.index = datetime_index[in_span]
    return filtered_df

def extract_distinct_features(columns):
    """
    Collapse column names into a sorted list of distinct "<base>_row_<n>" names.

    The base feature of a column is its first two underscore-separated parts.
    A column contributes only if it has at least two such parts and contains a
    "row_<digits>" fragment; the first such fragment gives the row number.
    The result lists, per base feature in sorted order, one entry for each
    distinct row number in ascending order.
    """
    row_pattern = re.compile(r'row_(\d+)')

    # base feature -> set of row numbers seen for it
    rows_by_feature = defaultdict(set)

    for column in columns:
        tokens = column.split('_')
        if len(tokens) < 2:
            # Not enough parts to form a base feature
            continue

        found = row_pattern.search(column)
        if found is None:
            # No row number in this column name
            continue

        rows_by_feature[f"{tokens[0]}_{tokens[1]}"].add(int(found.group(1)))

    # One list of names per base feature, then flattened into a single list
    grouped_names = [
        [f"{feature}_row_{row}" for row in sorted(rows)]
        for feature, rows in sorted(rows_by_feature.items())
    ]
    return flatten_list_of_lists(grouped_names)

In [None]:
def multivar_example_gen_func(df: pd.DataFrame, feature_names, country_name, df_cov=None, feature_names_cov=None) -> Generator[dict[str, Any], None, None]:
    """
    Yield exactly one example describing the whole frame as a multivariate series.

    The example holds the transposed values of df (one row per column of df),
    the first index entry as start, the frequency inferred from the index,
    the fixed item id "item_0", and the supplied names.
    """
    # df_cov is accepted and defaulted, but its contents are not part of the example
    df_cov = pd.DataFrame() if df_cov is None else df_cov
    feature_names_cov = [] if feature_names_cov is None else feature_names_cov

    example = {
        "target": df.to_numpy().T,  # (var, time): one row per column of df
        "start": df.index[0],
        "freq": pd.infer_freq(df.index),
        "item_id": "item_0",
        "country_name": country_name,
        "column_names": df.columns.to_list(),
        "feature_names": feature_names,
        "feature_names_cov": feature_names_cov,
    }
    yield example

def create_hf_dataset_from_df(df: pd.DataFrame, output_dir: str, feature_names: List[str], country_name: str, df_cov=None, feature_names_cov=None):
    """
    Build a Hugging Face dataset from df and write it to output_dir.

    The dataset is produced by multivar_example_gen_func and stored with an
    explicit schema; a confirmation line is printed after saving.
    """
    df_cov = pd.DataFrame() if df_cov is None else df_cov
    feature_names_cov = [] if feature_names_cov is None else feature_names_cov

    # Multivariate series are saved as (var, time): the outer sequence has one
    # entry per column of df, each inner sequence holds float32 values.
    target_type = Sequence(Sequence(Value("float32")), length=len(df.columns))

    schema = Features(
        {
            "target": target_type,
            "start": Value("timestamp[s]"),
            "freq": Value("string"),
            "item_id": Value("string"),
            "country_name": Value("string"),
            "column_names": Sequence(Value("string")),
            "feature_names": Sequence(Value("string")),
            "feature_names_cov": Sequence(Value("string")),
        }
    )

    # Zero-argument callable that yields the example(s) for this frame
    example_source = lambda: multivar_example_gen_func(
        df=df,
        feature_names=feature_names,
        country_name=country_name,
        df_cov=df_cov,
        feature_names_cov=feature_names_cov,
    )

    hf_dataset = Dataset.from_generator(example_source, features=schema)
    hf_dataset.save_to_disk(output_dir)
    print(f"Dataset saved to {output_dir}")

## Create Training Splits

Let's define our time-based training splits and process each country's data.

In [None]:
# Training splits keyed by a 1-based index; each entry holds the first and
# last date of the split as 'start_time' / 'end_time'
train_splits = {
    split_idx: {'start_time': start_time, 'end_time': end_time}
    for split_idx, (start_time, end_time) in enumerate(
        [
            ('1984-01-01', '1994-12-31'),  # 80s-early 90s
            ('1996-01-01', '2004-12-31'),  # late 90s-early 2000s
            ('2006-01-01', '2014-12-31'),  # pre-financial crisis to mid-2010s
            ('2016-01-01', '2022-12-31'),  # recent years
        ],
        start=1,
    )
}

# Echo the configured ranges
print("Defined training splits:")
for split_idx, dates in train_splits.items():
    print(f"Split {split_idx}: {dates['start_time']} to {dates['end_time']}")

In [None]:
# Collect every CSV file in the download directory. glob returns matches in
# arbitrary order, so the list is sorted to make the sample below and the
# processing order of the next cell the same on every run.
all_csv_files = sorted(glob.glob(os.path.join(csv_dataset_path, "*.csv")))
print(f"Found {len(all_csv_files)} CSV files to process")

# Display first few files
if all_csv_files:
    print(f"Sample files: {[os.path.basename(f) for f in all_csv_files[:3]]}")

In [None]:
# For every downloaded CSV: load it, cut it into the configured time spans and
# save each non-empty span as its own dataset named <country>_split_<index>
for csv_file in all_csv_files:
    print(f"\nProcessing file: {os.path.basename(csv_file)}")

    # First column becomes the index and is parsed as dates
    df = pd.read_csv(csv_file, index_col=0, parse_dates=True)

    # Country name is the part of the file name before the first underscore
    country_name, _, _ = get_file_name(csv_file).partition('_')
    print(f"Country: {country_name}, Data shape: {df.shape}")

    for split_idx, dates in train_splits.items():
        print(f"\nCreating split {split_idx} for {country_name} with date range: {dates}")

        # Rows whose date lies inside this split's span
        df_split = filter_dataframe_by_time_span(df, **dates)

        if df_split.empty:
            # Nothing to save for this span
            print(f"No data available for {country_name} in time range {dates}")
            continue

        print(f"Split data shape: {df_split.shape}")

        # Distinct feature names derived from the column names
        features = extract_distinct_features(df_split.columns)
        print(f"Extracted {len(features)} features")

        # Target directory for this country/split combination
        output_dir = os.path.join(train_val_path, f"{country_name}_split_{split_idx}")

        # Build the dataset and write it to disk (no covariates)
        create_hf_dataset_from_df(
            df=df_split,
            output_dir=output_dir,
            feature_names=features,
            country_name=country_name,
            feature_names_cov=[],
        )

## Verify Dataset Creation

Let's check that the datasets were created correctly by loading and examining one.

In [None]:
# Load the dataset saved as "us_split_1" as a spot check and print a summary
us_split_1_path = os.path.join(train_val_path, "us_split_1")

if os.path.exists(us_split_1_path):
    try:
        ds_multi = Dataset.load_from_disk(us_split_1_path).with_format("numpy")
        print(f"Dataset loaded successfully with {len(ds_multi)} examples")

        # Read the first example once; every ds_multi[0] access would
        # materialise the whole row (including the target array) again
        first_example = ds_multi[0]
        print(f"Available keys: {list(first_example.keys())}")
        print(f"Country: {first_example['country_name']}")
        print(f"Target shape: {first_example['target'].shape}")
        print(f"Number of features: {len(first_example['feature_names'])}")
        print(f"Start date: {first_example['start']}")
        print(f"Frequency: {first_example['freq']}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
else:
    print(f"Dataset not found at {us_split_1_path}")
    # Fall back to listing whatever datasets were created
    datasets = glob.glob(os.path.join(train_val_path, "*_split_*"))
    if datasets:
        print(f"Other datasets available: {[os.path.basename(d) for d in datasets[:3]]}")
    else:
        print("No datasets were created")

## Sample Data Visualization

Let's visualize some of the macroeconomic time series data.

In [None]:
# Plot up to four series from one of the saved datasets
import matplotlib.pyplot as plt
import numpy as np

try:
    # Use the first saved dataset in sorted order (glob order is arbitrary,
    # sorting keeps the choice stable between runs)
    dataset_paths = sorted(glob.glob(os.path.join(train_val_path, "*_split_*")))
    if not dataset_paths:
        print("No datasets available for visualization")
    else:
        dataset_path = dataset_paths[0]
        country = os.path.basename(dataset_path).split('_split_')[0]
        print(f"Visualizing data for {country} from {dataset_path}")

        # Load the dataset and read its first example once
        ds = Dataset.load_from_disk(dataset_path).with_format("numpy")
        example = ds[0]

        target = example['target']  # Shape: (variables, time)
        # Row i of target is the i-th entry of 'column_names' (both are taken
        # from the same DataFrame when the dataset is built). 'feature_names'
        # is a deduplicated, sorted list and does not line up with the rows,
        # so it must not be used to index or label the series.
        series_names = [str(name) for name in example['column_names']]

        # Create time index
        start_date = pd.to_datetime(example['start'])
        freq = example['freq']
        if freq is None:
            # No frequency was stored (pd.infer_freq could not determine one),
            # so a date axis cannot be rebuilt; fall back to positions.
            # NOTE(review): assumes a missing freq is loaded back as None - confirm.
            time_index = np.arange(target.shape[1])
            x_label = 'Time step'
        else:
            time_index = pd.date_range(start=start_date, periods=target.shape[1], freq=freq)
            x_label = 'Date'

        # Look for GDP, inflation, unemployment and interest-rate series
        key_terms = ['gdp', 'inflation', 'unemploy', 'interest']
        key_indicators = [
            (idx, name)
            for idx, name in enumerate(series_names)
            if any(term in name.lower() for term in key_terms)
        ]

        # Plot up to 4 key indicators, otherwise the first 4 variables
        if key_indicators:
            series_to_plot = key_indicators[:4]
        else:
            series_to_plot = list(enumerate(series_names))[:4]

        plt.figure(figsize=(14, 10))
        for position, (idx, name) in enumerate(series_to_plot, start=1):
            plt.subplot(2, 2, position)
            plt.plot(time_index, target[idx], label=name)
            plt.title(name)
            plt.xlabel(x_label)
            plt.grid(True)

        plt.tight_layout()
        plt.show()

except Exception as e:
    print(f"Error visualizing data: {e}")

## Summary

The dataset processing is now complete. Here's what we accomplished:

1. Downloaded country-specific macroeconomic data from Hugging Face
2. Saved the raw data as CSV files in `MACRO_ECON_PATH/csv_dataset`
3. Split the data into time-based training partitions
4. Created Hugging Face datasets in `CUSTOM_DATA_PATH/train_val_split`
5. Verified the datasets were created correctly

These datasets are now ready to be used for training and evaluating the MacroTS model.