In [None]:
# standard-library and third-party imports
import os
import datetime
import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

In [None]:
#user imports
import windowGenarator

# Baseline ChatGPT Code

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd

#########################################
# Configuration
#########################################

# --- Input layout -----------------------------------------------------------
# The model input's feature axis is expected to hold
# num_stations * features_per_station columns, one contiguous slice per station.
num_stations = 5
features_per_station = 12  # Example value; set to the real per-station feature count
target_station_idx = 0     # Index of the station whose label is predicted

# --- Windowing (forwarded to the window generator) --------------------------
input_width, label_width, shift = 24, 1, 1  # Example window: 24 steps in, 1 step out, 1 step ahead
label_columns = ["TD (degC)"]

# --- Model hyperparameters --------------------------------------------------
cnn_out_dim = 32
d_model = 64
num_heads = 4
num_station_transformer_layers = num_time_transformer_layers = 2

#########################################
# Load Data and Coordinates
#########################################
# Assume these DataFrames and station_coords are already loaded and preprocessed
# train_df, val_df, test_df = ...
# station_coords_df = pd.read_csv('stations_coords.csv')
# Sort by station_id and extract coords in order
# station_coords = station_coords_df[['x', 'y']].values.astype(np.float32)
# Normalize coords if needed

# Placeholder station positions: one (x, y) row per station, in the same order
# as the per-station feature slices of the model input.
# NOTE(review): these values are fed to the model unscaled unless the optional
# normalisation below is enabled — confirm that is intended for real data.
station_coords = np.asarray(
    [
        (350000, 3200000),
        (351000, 3201000),
        (352000, 3202000),
        (353000, 3203000),
        (354000, 3204000),
    ],
    dtype=np.float32,
)

# Optional: normalize coordinates
# x_min, x_max = station_coords[:,0].min(), station_coords[:,0].max()
# y_min, y_max = station_coords[:,1].min(), station_coords[:,1].max()
# station_coords[:,0] = (station_coords[:,0] - x_min)/(x_max - x_min)
# station_coords[:,1] = (station_coords[:,1] - y_min)/(y_max - y_min)

#########################################
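# WindowGenerator (user-supplied; presumably defined in the local `windowGenarator` module)
# NOTE: the notebook only does `import windowGenarator`, so the class must be referenced as
# `windowGenarator.WindowGenerator` (or imported by name) before uncommenting the lines below.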
#########################################
# window = WindowGenerator(input_width=input_width,
#                          label_width=label_width,
#                          shift=shift,
#                          train_df=train_df,
#                          val_df=val_df,
#                          test_df=test_df,
#                          label_columns=label_columns)

#########################################
# Model Components
#########################################

def make_station_cnn(features_per_station, cnn_out_dim, input_width=None):
    """
    Build the CNN that processes a single station's time series.

    The network is two causal Conv1D layers (32 filters, kernel size 3, ReLU)
    followed by a Dense projection applied to every time step. Causal padding
    and the absence of pooling keep the time dimension unchanged.

    Args:
        features_per_station: Number of input features per time step.
        cnn_out_dim: Size of the per-time-step output embedding.
        input_width: Optional fixed number of time steps. The default ``None``
            leaves the time axis unspecified, so the builder no longer depends
            on a notebook-level ``input_width`` global and the same CNN can be
            applied to windows of any length.

    Returns:
        A ``keras.Sequential`` mapping
        (batch, input_width, features_per_station) -> (batch, input_width, cnn_out_dim).
    """
    return keras.Sequential([
        # Explicit Input instead of the `input_shape=` layer kwarg.
        keras.Input(shape=(input_width, features_per_station)),
        layers.Conv1D(filters=32, kernel_size=3, padding='causal', activation='relu'),
        layers.Conv1D(filters=32, kernel_size=3, padding='causal', activation='relu'),
        # No pooling here, we keep the time dimension
        layers.Dense(cnn_out_dim)  # map to cnn_out_dim
    ])

class TransformerEncoderLayer(layers.Layer):
    """
    One post-norm Transformer encoder block.

    Self-attention and a two-layer feed-forward network, each followed by
    dropout, a residual connection and layer normalisation.

    Args:
        d_model: Width of the token embeddings (input and output last dimension).
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        dropout: Dropout rate applied to each sub-block's output.
    """

    def __init__(self, d_model, num_heads, dff=128, dropout=0.1):
        super().__init__()
        # NOTE(review): in Keras `key_dim` is the size of *each* head, so every
        # head here is d_model wide rather than d_model // num_heads — confirm
        # this is intended.
        self.mha = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model,
        )
        # Position-wise feed-forward network: expand to dff, project back to d_model.
        self.ffn = keras.Sequential(
            [
                layers.Dense(dff, activation='relu'),
                layers.Dense(d_model),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout)
        self.dropout2 = layers.Dropout(dropout)

    def call(self, x, training=False):
        """Apply the block to x of shape (batch, seq_len, d_model); the shape is preserved."""
        # Attention sub-block: query and value are both x (self-attention).
        attended = self.dropout1(self.mha(x, x, training=training), training=training)
        normed_attention = self.layernorm1(x + attended)

        # Feed-forward sub-block with its own residual connection.
        fed_forward = self.dropout2(self.ffn(normed_attention), training=training)
        return self.layernorm2(normed_attention + fed_forward)

class TransformerEncoder(layers.Layer):
    """
    A stack of ``num_layers`` TransformerEncoderLayer blocks applied in order.

    Args:
        num_layers: How many encoder blocks to stack.
        d_model, num_heads, dff, dropout: Forwarded unchanged to every block.
    """

    def __init__(self, num_layers, d_model, num_heads, dff=128, dropout=0.1):
        super().__init__()
        self.num_layers = num_layers
        # Build the whole stack first, then assign it once as an attribute.
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerEncoderLayer(d_model, num_heads, dff, dropout))
        self.enc_layers = blocks

    def call(self, x, training=False):
        """Feed x through every encoder block in turn and return the final output."""
        hidden = x
        for block in self.enc_layers:
            hidden = block(hidden, training=training)
        return hidden

class MultiStationModel(keras.Model):
    """
    Forecast a target station's label from the joint history of several stations.

    Pipeline (see ``call``):
      1. Slice the input's feature axis into one chunk per station and run each
         chunk through a single, weight-shared CNN.
      2. Append the station's (x, y) coordinates to every time step and project
         the result to ``d_model``.
      3. Run a Transformer across the station axis at every time step.
      4. Keep the target station's embedding and run a Transformer across time.
      5. Map the last time step's embedding to ``label_width`` outputs.

    Args:
        num_stations: Number of stations packed along the input feature axis.
        features_per_station: Number of consecutive feature columns per station.
        station_coords: Array-like of shape (num_stations, 2), ordered like the
            station slices. Values are concatenated to the features as given
            (no scaling is applied here).
        cnn_out_dim: Per-time-step output width of the station CNN.
        d_model: Embedding width used by both Transformers.
        num_station_transformer_layers: Depth of the station-axis Transformer.
        num_time_transformer_layers: Depth of the time-axis Transformer.
        num_heads: Number of attention heads in every encoder block.
        target_station_idx: Index of the station whose embedding is forecast.
        label_width: Number of output values produced per example.

    Raises:
        ValueError: If ``station_coords`` is not shaped (num_stations, 2) or
            ``target_station_idx`` is not a valid station index.
    """

    def __init__(self, 
                 num_stations, 
                 features_per_station,
                 station_coords, 
                 cnn_out_dim,
                 d_model,
                 num_station_transformer_layers,
                 num_time_transformer_layers,
                 num_heads,
                 target_station_idx=0,
                 label_width=1):
        super().__init__()
        self.num_stations = num_stations
        self.features_per_station = features_per_station
        self.target_station_idx = target_station_idx
        self.label_width = label_width
        # Keep the embedding width on the instance so `call` does not have to
        # rely on a notebook-level `d_model` global.
        self.d_model = d_model
        self.station_coords = tf.constant(station_coords, dtype=tf.float32) # (num_stations, 2)

        # Fail early with a clear message instead of deep inside `call`.
        if self.station_coords.shape.as_list() != [num_stations, 2]:
            raise ValueError(
                "station_coords must have shape (num_stations, 2) = "
                f"({num_stations}, 2), got {tuple(self.station_coords.shape.as_list())}"
            )
        if not -num_stations <= target_station_idx < num_stations:
            raise ValueError(
                f"target_station_idx={target_station_idx} is out of range for "
                f"{num_stations} stations"
            )

        # One CNN whose weights are shared by all stations.
        # If you need distinct CNNs per station, create a list of CNNs.
        self.station_cnn = make_station_cnn(features_per_station, cnn_out_dim)

        # Project CNN output + coords to d_model
        self.project = layers.Dense(d_model)

        # Transformer for station dimension
        self.station_transformer = TransformerEncoder(num_station_transformer_layers, d_model, num_heads)

        # Transformer for time dimension (long-term patterns)
        self.time_transformer = TransformerEncoder(num_time_transformer_layers, d_model, num_heads)

        # Final output layer
        self.output_layer = layers.Dense(self.label_width)  # Predict 1 step (or label_width steps)

    def call(self, inputs, training=False):
        """
        Run the forward pass.

        Args:
            inputs: Tensor of shape (batch, input_width, total_features) with
                total_features = num_stations * features_per_station.
            training: Forwarded to the sub-layers (controls dropout).

        Returns:
            Tensor of shape (batch, label_width, 1).
        """
        dynamic_shape = tf.shape(inputs)
        batch_size = dynamic_shape[0]
        # Take the window length from the input itself (static when known) so the
        # model is not tied to a notebook-level `input_width` global.
        time_steps = inputs.shape[1] if inputs.shape[1] is not None else dynamic_shape[1]

        # Split the input for each station
        # Shape: (batch, input_width, features_per_station) per station
        station_embeddings_list = []
        for i in range(self.num_stations):
            start = i * self.features_per_station
            end = start + self.features_per_station
            station_data = inputs[:, :, start:end]  # (batch, input_width, features_per_station)

            # CNN to extract local patterns
            station_cnn_out = self.station_cnn(station_data, training=training) # (batch, input_width, cnn_out_dim)

            # Add coordinates (x, y) to each time step
            # coords shape: (1, 1, 2) expanded to (batch, input_width, 2)
            coords = tf.reshape(self.station_coords[i], (1, 1, 2))
            coords = tf.tile(coords, [batch_size, tf.shape(station_cnn_out)[1], 1]) # (batch, input_width, 2)

            # Concatenate coords
            combined = tf.concat([station_cnn_out, coords], axis=-1) # (batch, input_width, cnn_out_dim+2)

            # Project to d_model
            projected = self.project(combined, training=training) # (batch, input_width, d_model)
            station_embeddings_list.append(projected)

        # Stack the per-station embeddings: (batch, input_width, num_stations, d_model)
        station_embeddings = tf.stack(station_embeddings_list, axis=2)

        # First Transformer: station dimension.
        # Attention should relate stations *within* one time step, so batch and
        # time are folded together and the station axis becomes the sequence axis.
        station_embeddings_reshaped = tf.reshape(
            station_embeddings,
            (batch_size * time_steps, self.num_stations, self.d_model),
        )  # (B*T, S, d_model)

        station_transformed = self.station_transformer(station_embeddings_reshaped, training=training) # (B*T, S, d_model)

        # Keep only the target station's embedding: (B*T, d_model)
        target_embeddings = station_transformed[:, self.target_station_idx, :]

        # Unfold batch and time again: (B, T, d_model)
        target_time_series = tf.reshape(target_embeddings, (batch_size, time_steps, self.d_model))

        # Second Transformer: time dimension of the target station's embedding sequence.
        time_transformed = self.time_transformer(target_time_series, training=training) # (batch, input_width, d_model)

        # The forecast is read from the embedding of the last time step.
        final_embedding = time_transformed[:, -1, :] # (batch, d_model)

        predictions = self.output_layer(final_embedding) # (batch, label_width)
        # Add a trailing feature axis: (batch, label_width, 1)
        predictions = tf.expand_dims(predictions, -1)

        return predictions

#########################################
# Training Example
#########################################

# Example usage (uncomment and adjust once you have train/val/test data):
# model = MultiStationModel(num_stations=num_stations,
#                           features_per_station=features_per_station,
#                           station_coords=station_coords,
#                           cnn_out_dim=cnn_out_dim,
#                           d_model=d_model,
#                           num_station_transformer_layers=num_station_transformer_layers,
#                           num_time_transformer_layers=num_time_transformer_layers,
#                           num_heads=num_heads,
#                           target_station_idx=target_station_idx,
#                           label_width=label_width)

# model.compile(loss='mse', optimizer='adam', metrics=['mae'])
# history = model.fit(window.train, epochs=10, validation_data=window.val)
# model.evaluate(window.test)

# Get predictions:
# for inputs, labels in window.test.take(1):
#     preds = model(inputs)
#     print("Predictions shape:", preds.shape)  # (batch, label_width, 1)
#     print("Labels shape:", labels.shape)


In [None]:
#user imports
import windowGenarator

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd

In [None]:
#########################################
# Configuration
#########################################

# --- Input layout -----------------------------------------------------------
# The model input's feature axis is expected to hold
# num_stations * features_per_station columns, one contiguous slice per station.
num_stations = 5
features_per_station = 12  # Example value; set to the real per-station feature count
target_station_idx = 0     # Index of the station whose label is predicted

# --- Windowing (forwarded to the window generator) --------------------------
input_width, label_width, shift = 24, 1, 1  # Example window: 24 steps in, 1 step out, 1 step ahead
label_columns = ["TD (degC)"]

# --- Model hyperparameters --------------------------------------------------
cnn_out_dim = 32
d_model = 64
num_heads = 4
num_station_transformer_layers = num_time_transformer_layers = 2

In [None]:
#########################################
# Load Data and Coordinates
#########################################
# Assume these DataFrames and station_coords are already loaded and preprocessed
# train_df, val_df, test_df = ...
# station_coords_df = pd.read_csv('stations_coords.csv')
# Sort by station_id and extract coords in order
# station_coords = station_coords_df[['x', 'y']].values.astype(np.float32)
# Normalize coords if needed

# Placeholder station positions: one (x, y) row per station, in the same order
# as the per-station feature slices of the model input.
# NOTE(review): these values are fed to the model unscaled unless the optional
# normalisation below is enabled — confirm that is intended for real data.
station_coords = np.asarray(
    [
        (350000, 3200000),
        (351000, 3201000),
        (352000, 3202000),
        (353000, 3203000),
        (354000, 3204000),
    ],
    dtype=np.float32,
)

# Optional: normalize coordinates
# x_min, x_max = station_coords[:,0].min(), station_coords[:,0].max()
# y_min, y_max = station_coords[:,1].min(), station_coords[:,1].max()
# station_coords[:,0] = (station_coords[:,0] - x_min)/(x_max - x_min)
# station_coords[:,1] = (station_coords[:,1] - y_min)/(y_max - y_min)

In [None]:
#########################################
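# WindowGenerator (user-supplied; presumably defined in the local `windowGenarator` module)
# NOTE: the notebook only does `import windowGenarator`, so the class must be referenced as
# `windowGenarator.WindowGenerator` (or imported by name) before uncommenting the lines below.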
#########################################
# window = WindowGenerator(input_width=input_width,
#                          label_width=label_width,
#                          shift=shift,
#                          train_df=train_df,
#                          val_df=val_df,
#                          test_df=test_df,
#                          label_columns=label_columns)

In [None]:
#########################################
# Model Components
#########################################

def make_station_cnn(features_per_station, cnn_out_dim, input_width=None):
    """
    Build the CNN that processes a single station's time series.

    The network is two causal Conv1D layers (32 filters, kernel size 3, ReLU)
    followed by a Dense projection applied to every time step. Causal padding
    and the absence of pooling keep the time dimension unchanged.

    Args:
        features_per_station: Number of input features per time step.
        cnn_out_dim: Size of the per-time-step output embedding.
        input_width: Optional fixed number of time steps. The default ``None``
            leaves the time axis unspecified, so the builder no longer depends
            on a notebook-level ``input_width`` global and the same CNN can be
            applied to windows of any length.

    Returns:
        A ``keras.Sequential`` mapping
        (batch, input_width, features_per_station) -> (batch, input_width, cnn_out_dim).
    """
    return keras.Sequential([
        # Explicit Input instead of the `input_shape=` layer kwarg.
        keras.Input(shape=(input_width, features_per_station)),
        layers.Conv1D(filters=32, kernel_size=3, padding='causal', activation='relu'),
        layers.Conv1D(filters=32, kernel_size=3, padding='causal', activation='relu'),
        # No pooling here, we keep the time dimension
        layers.Dense(cnn_out_dim)  # map to cnn_out_dim
    ])

class TransformerEncoderLayer(layers.Layer):
    """
    One post-norm Transformer encoder block.

    Self-attention and a two-layer feed-forward network, each followed by
    dropout, a residual connection and layer normalisation.

    Args:
        d_model: Width of the token embeddings (input and output last dimension).
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        dropout: Dropout rate applied to each sub-block's output.
    """

    def __init__(self, d_model, num_heads, dff=128, dropout=0.1):
        super().__init__()
        # NOTE(review): in Keras `key_dim` is the size of *each* head, so every
        # head here is d_model wide rather than d_model // num_heads — confirm
        # this is intended.
        self.mha = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model,
        )
        # Position-wise feed-forward network: expand to dff, project back to d_model.
        self.ffn = keras.Sequential(
            [
                layers.Dense(dff, activation='relu'),
                layers.Dense(d_model),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout)
        self.dropout2 = layers.Dropout(dropout)

    def call(self, x, training=False):
        """Apply the block to x of shape (batch, seq_len, d_model); the shape is preserved."""
        # Attention sub-block: query and value are both x (self-attention).
        attended = self.dropout1(self.mha(x, x, training=training), training=training)
        normed_attention = self.layernorm1(x + attended)

        # Feed-forward sub-block with its own residual connection.
        fed_forward = self.dropout2(self.ffn(normed_attention), training=training)
        return self.layernorm2(normed_attention + fed_forward)

class TransformerEncoder(layers.Layer):
    """
    A stack of ``num_layers`` TransformerEncoderLayer blocks applied in order.

    Args:
        num_layers: How many encoder blocks to stack.
        d_model, num_heads, dff, dropout: Forwarded unchanged to every block.
    """

    def __init__(self, num_layers, d_model, num_heads, dff=128, dropout=0.1):
        super().__init__()
        self.num_layers = num_layers
        # Build the whole stack first, then assign it once as an attribute.
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerEncoderLayer(d_model, num_heads, dff, dropout))
        self.enc_layers = blocks

    def call(self, x, training=False):
        """Feed x through every encoder block in turn and return the final output."""
        hidden = x
        for block in self.enc_layers:
            hidden = block(hidden, training=training)
        return hidden

class MultiStationModel(keras.Model):
    """
    Forecast a target station's label from the joint history of several stations.

    Pipeline (see ``call``):
      1. Slice the input's feature axis into one chunk per station and run each
         chunk through a single, weight-shared CNN.
      2. Append the station's (x, y) coordinates to every time step and project
         the result to ``d_model``.
      3. Run a Transformer across the station axis at every time step.
      4. Keep the target station's embedding and run a Transformer across time.
      5. Map the last time step's embedding to ``label_width`` outputs.

    Args:
        num_stations: Number of stations packed along the input feature axis.
        features_per_station: Number of consecutive feature columns per station.
        station_coords: Array-like of shape (num_stations, 2), ordered like the
            station slices. Values are concatenated to the features as given
            (no scaling is applied here).
        cnn_out_dim: Per-time-step output width of the station CNN.
        d_model: Embedding width used by both Transformers.
        num_station_transformer_layers: Depth of the station-axis Transformer.
        num_time_transformer_layers: Depth of the time-axis Transformer.
        num_heads: Number of attention heads in every encoder block.
        target_station_idx: Index of the station whose embedding is forecast.
        label_width: Number of output values produced per example.

    Raises:
        ValueError: If ``station_coords`` is not shaped (num_stations, 2) or
            ``target_station_idx`` is not a valid station index.
    """

    def __init__(self, 
                 num_stations, 
                 features_per_station,
                 station_coords, 
                 cnn_out_dim,
                 d_model,
                 num_station_transformer_layers,
                 num_time_transformer_layers,
                 num_heads,
                 target_station_idx=0,
                 label_width=1):
        super().__init__()
        self.num_stations = num_stations
        self.features_per_station = features_per_station
        self.target_station_idx = target_station_idx
        self.label_width = label_width
        # Keep the embedding width on the instance so `call` does not have to
        # rely on a notebook-level `d_model` global.
        self.d_model = d_model
        self.station_coords = tf.constant(station_coords, dtype=tf.float32) # (num_stations, 2)

        # Fail early with a clear message instead of deep inside `call`.
        if self.station_coords.shape.as_list() != [num_stations, 2]:
            raise ValueError(
                "station_coords must have shape (num_stations, 2) = "
                f"({num_stations}, 2), got {tuple(self.station_coords.shape.as_list())}"
            )
        if not -num_stations <= target_station_idx < num_stations:
            raise ValueError(
                f"target_station_idx={target_station_idx} is out of range for "
                f"{num_stations} stations"
            )

        # One CNN whose weights are shared by all stations.
        # If you need distinct CNNs per station, create a list of CNNs.
        self.station_cnn = make_station_cnn(features_per_station, cnn_out_dim)

        # Project CNN output + coords to d_model
        self.project = layers.Dense(d_model)

        # Transformer for station dimension
        self.station_transformer = TransformerEncoder(num_station_transformer_layers, d_model, num_heads)

        # Transformer for time dimension (long-term patterns)
        self.time_transformer = TransformerEncoder(num_time_transformer_layers, d_model, num_heads)

        # Final output layer
        self.output_layer = layers.Dense(self.label_width)  # Predict 1 step (or label_width steps)

    def call(self, inputs, training=False):
        """
        Run the forward pass.

        Args:
            inputs: Tensor of shape (batch, input_width, total_features) with
                total_features = num_stations * features_per_station.
            training: Forwarded to the sub-layers (controls dropout).

        Returns:
            Tensor of shape (batch, label_width, 1).
        """
        dynamic_shape = tf.shape(inputs)
        batch_size = dynamic_shape[0]
        # Take the window length from the input itself (static when known) so the
        # model is not tied to a notebook-level `input_width` global.
        time_steps = inputs.shape[1] if inputs.shape[1] is not None else dynamic_shape[1]

        # Split the input for each station
        # Shape: (batch, input_width, features_per_station) per station
        station_embeddings_list = []
        for i in range(self.num_stations):
            start = i * self.features_per_station
            end = start + self.features_per_station
            station_data = inputs[:, :, start:end]  # (batch, input_width, features_per_station)

            # CNN to extract local patterns
            station_cnn_out = self.station_cnn(station_data, training=training) # (batch, input_width, cnn_out_dim)

            # Add coordinates (x, y) to each time step
            # coords shape: (1, 1, 2) expanded to (batch, input_width, 2)
            coords = tf.reshape(self.station_coords[i], (1, 1, 2))
            coords = tf.tile(coords, [batch_size, tf.shape(station_cnn_out)[1], 1]) # (batch, input_width, 2)

            # Concatenate coords
            combined = tf.concat([station_cnn_out, coords], axis=-1) # (batch, input_width, cnn_out_dim+2)

            # Project to d_model
            projected = self.project(combined, training=training) # (batch, input_width, d_model)
            station_embeddings_list.append(projected)

        # Stack the per-station embeddings: (batch, input_width, num_stations, d_model)
        station_embeddings = tf.stack(station_embeddings_list, axis=2)

        # First Transformer: station dimension.
        # Attention should relate stations *within* one time step, so batch and
        # time are folded together and the station axis becomes the sequence axis.
        station_embeddings_reshaped = tf.reshape(
            station_embeddings,
            (batch_size * time_steps, self.num_stations, self.d_model),
        )  # (B*T, S, d_model)

        station_transformed = self.station_transformer(station_embeddings_reshaped, training=training) # (B*T, S, d_model)

        # Keep only the target station's embedding: (B*T, d_model)
        target_embeddings = station_transformed[:, self.target_station_idx, :]

        # Unfold batch and time again: (B, T, d_model)
        target_time_series = tf.reshape(target_embeddings, (batch_size, time_steps, self.d_model))

        # Second Transformer: time dimension of the target station's embedding sequence.
        time_transformed = self.time_transformer(target_time_series, training=training) # (batch, input_width, d_model)

        # The forecast is read from the embedding of the last time step.
        final_embedding = time_transformed[:, -1, :] # (batch, d_model)

        predictions = self.output_layer(final_embedding) # (batch, label_width)
        # Add a trailing feature axis: (batch, label_width, 1)
        predictions = tf.expand_dims(predictions, -1)

        return predictions

In [None]:
#########################################
# Training Example
#########################################

# Example usage (uncomment and adjust once you have train/val/test data):
# model = MultiStationModel(num_stations=num_stations,
#                           features_per_station=features_per_station,
#                           station_coords=station_coords,
#                           cnn_out_dim=cnn_out_dim,
#                           d_model=d_model,
#                           num_station_transformer_layers=num_station_transformer_layers,
#                           num_time_transformer_layers=num_time_transformer_layers,
#                           num_heads=num_heads,
#                           target_station_idx=target_station_idx,
#                           label_width=label_width)

# model.compile(loss='mse', optimizer='adam', metrics=['mae'])
# history = model.fit(window.train, epochs=10, validation_data=window.val)
# model.evaluate(window.test)

# Get predictions:
# for inputs, labels in window.test.take(1):
#     preds = model(inputs)
#     print("Predictions shape:", preds.shape)  # (batch, label_width, 1)
#     print("Labels shape:", labels.shape)