### Data Processing & Feature Engineering

In [1]:
import pandas as pd

# Read the raw climate CSV; the path is relative to the notebook's working directory.
DATA_PATH = "jena_climate_2009_2016.csv"
ten_minute_df = pd.read_csv(DATA_PATH)

In [20]:
# Keep every sixth row, starting at position 5, then index the result by the
# parsed 'Date Time' column (day.month.year hour:minute:second).
timestamp_format = '%d.%m.%Y %H:%M:%S'
hourly_df = ten_minute_df.iloc[5::6]
hourly_df.index = pd.to_datetime(hourly_df['Date Time'], format=timestamp_format)


In [10]:
# Show the column names of the hourly frame (the next cell keeps only a subset of them).
hourly_df.columns

Index(['Date Time', 'p (mbar)', 'T (degC)', 'Tpot (K)', 'Tdew (degC)',
       'rh (%)', 'VPmax (mbar)', 'VPact (mbar)', 'VPdef (mbar)', 'sh (g/kg)',
       'H2OC (mmol/mol)', 'rho (g/m**3)', 'wv (m/s)', 'max. wv (m/s)',
       'wd (deg)'],
      dtype='object')

In [21]:
# Raw column name -> short name. The dict order is also the column order of the
# reduced frame, with the untouched 'Date Time' column kept in front.
short_names = {
    'p (mbar)': 'p',
    'T (degC)': 'temp',
    'Tdew (degC)': 'tdew',
    'rh (%)': 'rh',
    'VPmax (mbar)': 'vpm',
}
hourly_df = hourly_df[['Date Time', *short_names]].rename(columns=short_names)

In [35]:
# NOTE(review): this preview is not the last expression of the cell, so a notebook
# will not render it; move it to its own cell if the preview is wanted.
hourly_df.head()

# Total number of missing entries across every column of the hourly frame.
nan_count = hourly_df.isna().to_numpy().sum()
print(nan_count)

0


In [26]:
# Chronological split by row position: the first 60000 rows train the model,
# the next 5000 validate it, and everything after that is held out for testing.
train_end, valid_end = 60000, 65000
train_df = hourly_df.iloc[:train_end]
valid_df = hourly_df.iloc[train_end:valid_end]
test_df = hourly_df.iloc[valid_end:]

In [39]:
def create_XY(df):
    """
    Split a frame into model inputs and the prediction target.

    :param df: DataFrame containing the columns 'p', 'tdew', 'rh', 'vpm' and 'temp'.

    :return: X, Y - X is a DataFrame with the columns 'p', 'tdew', 'rh', 'vpm', 'temp'
        (in that order); Y is a single-column DataFrame holding 'temp'. Note that
        'temp' appears in both: it is an input feature as well as the target.
    """
    feature_columns = ['p', 'tdew', 'rh', 'vpm', 'temp']
    target_columns = ['temp']
    return df[feature_columns], df[target_columns]

In [42]:
# Build the (features, target) pair for each split, in train / valid / test order.
(X_train, Y_train), (X_valid, Y_valid), (X_test, Y_test) = map(
    create_XY, (train_df, valid_df, test_df)
)

In [43]:
from sklearn.preprocessing import MinMaxScaler

# Separate scalers for inputs and target so predictions can later be mapped back
# with y_scaler alone.
x_scaler, y_scaler = MinMaxScaler(), MinMaxScaler()

# Fit on the training split only; validation and test reuse the fitted scalers.
X_train, Y_train = x_scaler.fit_transform(X_train), y_scaler.fit_transform(Y_train)

X_valid, Y_valid = x_scaler.transform(X_valid), y_scaler.transform(Y_valid)
X_test, Y_test = x_scaler.transform(X_test), y_scaler.transform(Y_test)

In [44]:
import numpy as np
import torch

def build_feature_target_tensors(X, Y, input_window, pred_window, stride=24, steps_per_day=24):
    """
    Create input and output sequences for model training and return them as PyTorch tensors.

    Each sample pairs ``input_window`` days of consecutive rows of X with the
    ``pred_window`` days of rows of Y that immediately follow them. Successive
    samples start ``stride`` rows apart.

    :param X: 2D numpy array where rows are time steps and columns are features.
    :param Y: numpy array of target values with one row per row of X. It may be 1D
        ``(n_steps,)`` or 2D ``(n_steps, n_targets)``; trailing dimensions are kept as-is.
    :param input_window: Number of days to use as input sequences.
    :param pred_window: Number of days to predict.
    :param stride: Number of rows between the starts of consecutive samples
        (default 24, i.e. one day of hourly rows).
    :param steps_per_day: Number of rows that make up one day (default 24, i.e. hourly rows).

    :return: X_tensor, Y_tensor - float32 tensors of shape
        ``(n_samples, input_window * steps_per_day, n_features)`` and
        ``(n_samples, pred_window * steps_per_day, ...)``. If there are too few rows
        for a single sample, n_samples is 0 but the remaining dimensions are kept.
    :raises AssertionError: if X and Y have a different number of rows.
    :raises ValueError: if a window is negative, or stride / steps_per_day is not positive.
    """
    assert X.shape[0] == Y.shape[0], "X and Y must have the same number of rows."

    if input_window < 0 or pred_window < 0:
        raise ValueError("input_window and pred_window must be non-negative.")
    if stride < 1 or steps_per_day < 1:
        raise ValueError("stride and steps_per_day must be positive.")

    input_steps = input_window * steps_per_day
    prediction_steps = pred_window * steps_per_day

    # Every start index leaves room for a full input window followed by a full
    # target window, so no sample is ever truncated.
    starts = range(0, X.shape[0] - input_steps - prediction_steps + 1, stride)

    if len(starts) == 0:
        # np.array([]) would collapse to shape (0,); keep the window and feature
        # axes so shape-dependent code downstream still sees the expected rank.
        X_array = np.empty((0, input_steps) + tuple(X.shape[1:]))
        Y_array = np.empty((0, prediction_steps) + tuple(Y.shape[1:]))
    else:
        X_array = np.array([X[i:i + input_steps] for i in starts])
        Y_array = np.array(
            [Y[i + input_steps:i + input_steps + prediction_steps] for i in starts]
        )

    # Stack into numpy first, then convert once to float32 tensors.
    X_tensor = torch.tensor(X_array, dtype=torch.float)
    Y_tensor = torch.tensor(Y_array, dtype=torch.float)

    return X_tensor, Y_tensor


In [45]:
# Forecasting setup: the model sees 30 days of feature data and predicts the
# hourly temperature for the following 7 days. Both values are in days.
input_window, pred_window = 30, 7

In [46]:
# Window each scaled split into (input, target) tensors, in train / valid / test order.
(
    (X_train_tensor, Y_train_tensor),
    (X_valid_tensor, Y_valid_tensor),
    (X_test_tensor, Y_test_tensor),
) = (
    build_feature_target_tensors(X_split, Y_split, input_window, pred_window)
    for X_split, Y_split in ((X_train, Y_train), (X_valid, Y_valid), (X_test, Y_test))
)

### Model Architecture & Training