In [89]:
from pathlib import Path

import numpy as np
import tensorflow as tf

In [90]:
# Toy series: the integers 1 through 30 (the stop value is exclusive).
my_series = np.arange(start=1, stop=31)

In [91]:
# Slide a 5-step window over the series; the target of each window is the
# element that immediately follows it (hence the series offset by 5).
# Windows are grouped three to a batch.
dataset = tf.keras.utils.timeseries_dataset_from_array(
    data=my_series,
    targets=my_series[5:],
    sequence_length=5,
    batch_size=3,
)

In [92]:
# Inspect the first three (windows, targets) batches.
for batch in dataset.take(count=3):
    print(batch)

(<tf.Tensor: shape=(3, 5), dtype=int64, numpy=
array([[1, 2, 3, 4, 5],
       [2, 3, 4, 5, 6],
       [3, 4, 5, 6, 7]])>, <tf.Tensor: shape=(3,), dtype=int64, numpy=array([6, 7, 8])>)
(<tf.Tensor: shape=(3, 5), dtype=int64, numpy=
array([[ 4,  5,  6,  7,  8],
       [ 5,  6,  7,  8,  9],
       [ 6,  7,  8,  9, 10]])>, <tf.Tensor: shape=(3,), dtype=int64, numpy=array([ 9, 10, 11])>)
(<tf.Tensor: shape=(3, 5), dtype=int64, numpy=
array([[ 7,  8,  9, 10, 11],
       [ 8,  9, 10, 11, 12],
       [ 9, 10, 11, 12, 13]])>, <tf.Tensor: shape=(3,), dtype=int64, numpy=array([12, 13, 14])>)


2024-12-25 19:11:07.181382: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# Another example of the Keras time series utility function

In [93]:
# Longer toy series: the integers 1 through 100 (the stop value is exclusive).
my_series = np.arange(start=1, stop=101)

In [94]:
# Same construction with 10-step windows: each window's target is the value
# right after it (series offset by 10), and windows are batched five at a time.
dataset = tf.keras.utils.timeseries_dataset_from_array(
    data=my_series,
    targets=my_series[10:],
    sequence_length=10,
    batch_size=5,
)

In [96]:
# Inspect the first two (windows, targets) batches.
for batch in dataset.take(count=2):
    print(batch)

(<tf.Tensor: shape=(5, 10), dtype=int64, numpy=
array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10],
       [ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
       [ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
       [ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13],
       [ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14]])>, <tf.Tensor: shape=(5,), dtype=int64, numpy=array([11, 12, 13, 14, 15])>)
(<tf.Tensor: shape=(5, 10), dtype=int64, numpy=
array([[ 6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
       [ 7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
       [ 8,  9, 10, 11, 12, 13, 14, 15, 16, 17],
       [ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]])>, <tf.Tensor: shape=(5,), dtype=int64, numpy=array([16, 17, 18, 19, 20])>)


2024-12-25 19:12:22.933623: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# Window function for more control

In [110]:
# Integers 1..50 grouped into overlapping windows of 5 that advance one step
# at a time; trailing windows shorter than 5 are dropped.
dataset = tf.data.Dataset.range(1, 51).window(
    size=5,
    shift=1,
    drop_remainder=True,
)

In [111]:
# Collapse every window into a single 5-element tensor by batching it whole.
dataset = dataset.flat_map(lambda window_ds: window_ds.batch(batch_size=5))

In [113]:
# Inspect the first five flattened windows.
for window in dataset.take(count=5):
    print(window)

tf.Tensor([1 2 3 4 5], shape=(5,), dtype=int64)
tf.Tensor([2 3 4 5 6], shape=(5,), dtype=int64)
tf.Tensor([3 4 5 6 7], shape=(5,), dtype=int64)
tf.Tensor([4 5 6 7 8], shape=(5,), dtype=int64)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int64)


2024-12-25 19:23:02.970400: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# A dataset to train our RNN

In [114]:
import pandas as pd

In [125]:
# Load the CTA daily boarding totals export.
# The location is built from the current user's home directory rather than a
# hard-coded absolute path, so the notebook is not tied to one account/machine
# (the file is still expected in ~/Downloads).
csv_path = Path.home() / 'Downloads' / 'CTA_-_Ridership_-_Daily_Boarding_Totals_20241114.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,service_date,day_type,bus,rail_boardings,total_rides
0,01/01/2001,U,297192,126455,423647
1,01/02/2001,W,780827,501952,1282779
2,01/03/2001,W,824923,536432,1361355
3,01/04/2001,W,870021,550011,1420032
4,01/05/2001,W,890426,557917,1448343


In [127]:
# Convert the date strings to timestamps. The raw values are month/day/year
# (e.g. 01/02/2001 is January 2nd), so the format is stated explicitly rather
# than left to inference: no month/day ambiguity, and malformed rows raise
# instead of being guessed at.
df['service_date'] = pd.to_datetime(df['service_date'], format='%m/%d/%Y')

In [128]:
# Make the service date the row index (modifies df in place).
df.set_index(keys='service_date', inplace=True)

In [129]:
# Discard the columns that are not used further on (modifies df in place).
df.drop(columns=['day_type', 'bus'], inplace=True)

In [130]:
# Put the rows in chronological order (modifies df in place).
df.sort_index(axis=0, ascending=True, inplace=True)

In [132]:
# Preview the first five rows of the re-indexed frame.
df.head(n=5)

Unnamed: 0_level_0,rail_boardings,total_rides
service_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2001-01-01,126455,423647
2001-01-02,501952,1282779
2001-01-03,536432,1361355
2001-01-04,550011,1420032
2001-01-05,557917,1448343


In [133]:
# Shorten the rail boardings column name to 'rail' (modifies df in place).
df.rename({'rail_boardings': 'rail'}, axis='columns', inplace=True)

In [135]:
# Summarise the index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8706 entries, 2001-01-01 to 2024-08-31
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   rail         8706 non-null   int64
 1   total_rides  8706 non-null   int64
dtypes: int64(2)
memory usage: 204.0 KB


In [136]:
# Split the data into training, validation, and test sets

In [138]:
# Slice the rail column by calendar period and rescale it to millions of
# boardings: 2016-2018 for training, Jan-May 2019 for validation and
# Jun-Dec 2019 for testing.
rail_train = df.loc['2016-01':'2018-12', 'rail'] / 1e6
rail_valid = df.loc['2019-01':'2019-05', 'rail'] / 1e6
rail_test = df.loc['2019-06':'2019-12', 'rail'] / 1e6

In [141]:
# Training windows: 56 consecutive daily values (8 weeks) as input, with the
# value that follows each window as its target. Windows are shuffled with a
# fixed seed and grouped into batches of 32.
train_ds = tf.keras.utils.timeseries_dataset_from_array(
    data=rail_train.to_numpy(),
    targets=rail_train[56:],
    sequence_length=56,
    batch_size=32,
    shuffle=True,
    seed=42,
)

In [142]:
# Validation windows built the same way as the training ones (56 inputs and
# the next value as target, batches of 32); no shuffle argument is passed.
valid_ds = tf.keras.utils.timeseries_dataset_from_array(
    data=rail_valid.to_numpy(),
    targets=rail_valid[56:],
    sequence_length=56,
    batch_size=32,
)

# Forecasting using a linear model

In [143]:
# Seed every random source the training run touches. tf.random.set_seed alone
# only seeds TensorFlow's generator; with Keras 3 the default weight-initializer
# seeds are drawn from Python's `random` module, so the model would still start
# from different weights on each run. set_random_seed seeds Python, NumPy and
# TensorFlow together (it calls tf.random.set_seed internally).
tf.keras.utils.set_random_seed(42)

In [144]:
# Linear forecaster: one Dense unit applied to the 56 values of a window.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=[56]),
    tf.keras.layers.Dense(units=1),
])

In [145]:
# Train with the Huber loss and the Adam optimizer, reporting MAE as a metric.
model.compile(
    optimizer='adam',
    loss='huber',
    metrics=['mae'],
)

In [146]:
# Fit for 20 epochs, evaluating on the validation windows after each epoch;
# the returned History object is kept for later inspection.
history = model.fit(
    x=train_ds,
    epochs=20,
    validation_data=valid_ds,
)

Epoch 1/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.9194 - mae: 1.4156 - val_loss: 0.1546 - val_mae: 0.5201
Epoch 2/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0829 - mae: 0.3522 - val_loss: 0.0189 - val_mae: 0.1603
Epoch 3/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0189 - mae: 0.1612 - val_loss: 0.0180 - val_mae: 0.1565
Epoch 4/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0168 - mae: 0.1516 - val_loss: 0.0173 - val_mae: 0.1537
Epoch 5/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0159 - mae: 0.1463 - val_loss: 0.0162 - val_mae: 0.1499
Epoch 6/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0150 - mae: 0.1434 - val_loss: 0.0153 - val_mae: 0.1463
Epoch 7/20
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0139 -