In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import random
import math
%matplotlib inline

# Seed both the stdlib and the NumPy global generators so that every run of
# the notebook produces the same synthetic dataset.
SEED = 1011
random.seed(SEED)
np.random.seed(SEED)

In [None]:
def quick_plot(x, y, time_from, time_to, var_from, var_to):
    """Plot a window of rows and a range of columns of ``y`` against ``x``.

    Rows ``time_from:time_to`` are taken from both ``x`` and ``y``; columns
    ``var_from:var_to`` are taken from ``y`` only. Ordinary slice semantics
    apply (end-exclusive, negative indices allowed). Nothing is returned.
    """
    rows = slice(time_from, time_to)
    cols = slice(var_from, var_to)
    plt.plot(x[rows], y[rows, cols])

In [None]:
# Warning: at these sizes the notebook writes a ~1.6GB CSV file and uses quite a bit of RAM...
LENGTH = 10_000_000  # number of timesteps (rows)
NUM_VARS = 20  # number of variables (columns)

# Column vector of integer timestep indices, shape (LENGTH, 1).
idxs = np.arange(LENGTH).reshape(-1, 1)

## Dependent Sine Waves - Spacetimeformer Toy Dataset
The first step is to create variable patterns with a clear relationship to other variables. We do this by making a collection of sine waves with varying frequencies and then adding the average of all other variables. This is essentially the "Toy Dataset" from the Spacetimeformer paper but with much lower frequencies. Originally inspired by [(Shih, Sun, and Lee 2019)](https://arxiv.org/abs/1809.04206).

In [None]:
def sin_features(t, i, period=10 ** 4):
    """Evaluate ``sin(2 * pi * i * t / period)`` elementwise as float16.

    Args:
        t: timestep index, or array of indices.
        i: frequency multiplier, or array of multipliers; broadcast against ``t``.
        period: number of timesteps in one full cycle when ``i == 1``.
            Defaults to ``10 ** 4``, the value that used to be hard-coded here.

    Returns:
        ``np.float16`` values with the broadcast shape of ``t`` and ``i``.
    """
    phase = (2 * np.pi * i * t) / period
    return np.sin(phase).astype(np.float16)

In [None]:
# Per-column frequency multipliers 1..NUM_VARS, presented with shape (LENGTH, NUM_VARS).
# np.broadcast_to returns a read-only view over a single row instead of materialising
# LENGTH copies of it (about 1.6GB of int64 at the default sizes).
# NOTE: the name shadows the built-in `vars`; it is kept because other cells refer to it.
vars = np.broadcast_to(np.arange(1, NUM_VARS + 1), (LENGTH, NUM_VARS))

In [None]:
# One sine wave per column; column j uses the frequency multiplier held in vars[:, j].
raw_sine_waves = sin_features(t=idxs, i=vars)
raw_sine_waves.shape

In [None]:
# Weight each wave by (NUM_VARS - 1) / NUM_VARS, then add the per-timestep mean over all
# variables so that every column depends on the others.
# The parentheses matter: `NUM_VARS - 1 / NUM_VARS` parses as `NUM_VARS - (1 / NUM_VARS)`
# (19.95 for 20 variables), which scales the waves up ~20x instead of applying a fraction.
summed_sine_features = ((NUM_VARS - 1) / NUM_VARS) * raw_sine_waves + raw_sine_waves.mean(-1, keepdims=True)

The result is a `[-1, 1]` bounded dataset of semi-periodic data where the smaller variable indices have lower frequencies and larger indices have higher frequencies. There are also times where global behavior is very unstable and difficult to predict.

In [None]:
# Overlay the first five variables with columns -4..-2 (three of the last ones)
# over the same window of timesteps.
quick_plot(idxs, summed_sine_features, 2000, 5000, 0, 5)
quick_plot(idxs, summed_sine_features, 2000, 5000, -4, -1);

In [None]:
# Variables 1..14 over timesteps 4000..7999.
quick_plot(
    idxs,
    summed_sine_features,
    time_from=4_000,
    time_to=8_000,
    var_from=1,
    var_to=15,
)

At this point, assigning arbitrary `datetimes` to the indices would recreate the Spacetimeformer toy dataset. However, we need a very large dataset to test long-sequence models, and this pattern does not seem interesting enough over millions of timesteps.

## Long-Sequence Dataset with Multiple Pattern Resolutions

We create a net positive or negative trend for every variable to make sure the dataset is non-stationary.

In [None]:
# Linear trend per variable: a random slope in [-0.5, 0.5) / LENGTH multiplied by the
# timestep index, so each column drifts by at most about half a unit over the whole series.
global_trends = (
    (np.random.random(size=(1, NUM_VARS)) - 0.5) / LENGTH * idxs
).astype(np.float16)

In [None]:
# Trend lines of the first ten variables across the full series.
quick_plot(
    idxs,
    global_trends,
    time_from=0,
    time_to=LENGTH,
    var_from=0,
    var_to=10,
)

Next we add "seasonal" patterns with periods >> the dependent sine waves. The seasonal effect has a random lag or offset for each variable.

In [None]:
# Per-variable phase shift, in timesteps (upper bound exclusive).
offset = np.random.randint(
    low=(-LENGTH) // 4,
    high=LENGTH // 4,
    size=(1, NUM_VARS),
)
# Per-variable integer in [4, 8). Where the seasonal sine is built this value is
# multiplied by pi / LENGTH, so it counts half-cycles over the whole series.
period = np.random.randint(
    low=4,
    high=8,
    size=(1, NUM_VARS),
)

In [None]:
# Slow "seasonal" sine per variable: shift the timestep by that variable's offset and
# scale it by its angular step (period * pi / LENGTH) before taking the sine.
semiglobal_trends = np.sin(
    (idxs - offset) * (period * math.pi / LENGTH)
).astype(np.float16)

We generate random noise where each variable has a slightly different distribution.

In [None]:
# One uniform draw in [0, 1) per variable; used as that variable's noise scale.
random_vars = np.random.random(size=NUM_VARS)

In [None]:
# Zero-mean Gaussian noise; column j uses random_vars[j] as its standard deviation.
random_noise = np.random.normal(
    loc=np.zeros_like(random_vars),
    scale=random_vars,
    size=(LENGTH, NUM_VARS),
).astype(np.float16)
# Disabled alternative (cumulative random walk instead of white noise):
#random_walks = np.cumsum(random_noise, axis=0)

Now we assign indices to calendar dates. We choose minute intervals so that a 10M length dataset spans about 19 years.

In [None]:
# One timestamp per row at 1-minute spacing, built with vectorised datetime64
# arithmetic instead of appending LENGTH `datetime` objects in a Python loop.
# The first stamp is 2000-01-01 00:01 (offsets start at 1 minute), and the result is
# cast to microsecond resolution, matching what the loop-based version produced.
times = (
    np.datetime64("2000-01-01T00:00", "m")
    + np.arange(1, LENGTH + 1).astype("timedelta64[m]")
).astype("datetime64[us]")

Create the final dataset by summing four patterns of increasing resolution:
1. non-stationary trends (spanning entire dataset)
2. periodic "seasonal" trends (spanning multiple years)
3. dependent sine wave patterns (spanning thousands of minutes)
4. pure random noise

In [None]:
# Trend plus seasonal component for variables 5..9 across the full series.
quick_plot(
    times,
    global_trends + .1 * semiglobal_trends,
    time_from=0,
    time_to=LENGTH,
    var_from=5,
    var_to=10,
)

In [None]:
# Add the dependent sine waves and zoom in on the first 1% of the series (variables 5 and 6).
quick_plot(
    times,
    global_trends + .1 * semiglobal_trends + .01 * summed_sine_features,
    time_from=0,
    time_to=LENGTH // 100,
    var_from=5,
    var_to=7,
)

In [None]:
# All four components over the first 10,000 timesteps (variables 12..14).
quick_plot(
    times,
    global_trends + .1 * semiglobal_trends + 0.01 * summed_sine_features + .005 * random_noise,
    time_from=0,
    time_to=10_000,
    var_from=12,
    var_to=15,
)

In [None]:
# Final series: weighted sum of the four components, rescaled by 5 overall.
ts = 5. * (
    global_trends
    + .1 * semiglobal_trends
    + .01 * summed_sine_features
    + 5e-3 * random_noise
)

In [None]:
# One "y<i>" column per variable, followed by the timestamp column.
df_dict = {f"y{i}": ts[:, i] for i in range(NUM_VARS)}
df_dict["Datetime"] = times

In [None]:
# Assemble the frame and display it as the cell output.
df = pd.DataFrame(data=df_dict)
df

In [None]:
# Write the dataset to disk without the integer row index.
df.to_csv(path_or_buf="synthetic_lr_dset.csv", index=False)