In [36]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

data = pd.read_csv('../data/NYC_Weather_2016_2022.csv')

data['time'] = pd.to_datetime(data['time'])

data.head()

Unnamed: 0,time,temperature_2m (°C),precipitation (mm),rain (mm),cloudcover (%),cloudcover_low (%),cloudcover_mid (%),cloudcover_high (%),windspeed_10m (km/h),winddirection_10m (°)
0,2016-01-01 00:00:00,7.6,0.0,0.0,69.0,53.0,0.0,72.0,10.0,296.0
1,2016-01-01 01:00:00,7.5,0.0,0.0,20.0,4.0,0.0,56.0,9.8,287.0
2,2016-01-01 02:00:00,7.1,0.0,0.0,32.0,3.0,0.0,99.0,9.7,285.0
3,2016-01-01 03:00:00,6.6,0.0,0.0,35.0,5.0,0.0,100.0,9.2,281.0
4,2016-01-01 04:00:00,6.3,0.0,0.0,34.0,4.0,0.0,100.0,9.1,279.0


In [37]:

data['day'] = data['time'].dt.day
data['year'] = data['time'].dt.year
# data['day_of_year'] = data['time'].dt.dayofyear  # 1-365

data['hour_sin'] = np.sin(2 * np.pi * data['time'].dt.hour / 24)
data['hour_cos'] = np.cos(2 * np.pi * data['time'].dt.hour / 24)
data['month_sin'] = np.sin(2 * np.pi * (data['time'].dt.month - 1) / 12)
data['month_cos'] = np.cos(2 * np.pi * (data['time'].dt.month - 1) / 12)

# Create lagged features for temperature
LAGS_NUM = 24
for i in range(1, LAGS_NUM + 1):
    data[f'temp_lag_{i}'] = data['temperature_2m (°C)'].shift(i)

data['target'] = data['temperature_2m (°C)'].shift(-1)

data.dropna(inplace=True)

In [38]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

numerical_features = ['precipitation (mm)', 'rain (mm)', 'cloudcover (%)', 
                      'cloudcover_low (%)', 'cloudcover_mid (%)', 'cloudcover_high (%)', 
                      'windspeed_10m (km/h)', 'winddirection_10m (°)'] + [f'temp_lag_{i}' for i in range(1, LAGS_NUM + 1)]

data[numerical_features] = scaler.fit_transform(data[numerical_features])

data.head()

Unnamed: 0,time,temperature_2m (°C),precipitation (mm),rain (mm),cloudcover (%),cloudcover_low (%),cloudcover_mid (%),cloudcover_high (%),windspeed_10m (km/h),winddirection_10m (°),...,temp_lag_16,temp_lag_17,temp_lag_18,temp_lag_19,temp_lag_20,temp_lag_21,temp_lag_22,temp_lag_23,temp_lag_24,target
24,2016-01-02 00:00:00,3.6,-0.246422,-0.234733,-0.847534,-0.661924,-0.274548,-0.946749,0.709761,0.65493,...,-0.759576,-0.749379,-0.739179,-0.728976,-0.708579,-0.677982,-0.62699,-0.586199,-0.575998,3.1
25,2016-01-02 01:00:00,3.1,-0.246422,-0.234733,-1.187218,-0.688829,-0.817338,-0.946749,0.778698,0.695034,...,-0.759576,-0.759576,-0.749376,-0.739173,-0.728972,-0.708572,-0.677973,-0.626985,-0.586194,2.9
26,2016-01-02 02:00:00,2.9,-0.246422,-0.234733,-1.187218,-0.688829,-0.817338,-0.946749,0.640824,0.65493,...,-0.871739,-0.759576,-0.759572,-0.74937,-0.739169,-0.728965,-0.708562,-0.677967,-0.626979,2.3
27,2016-01-02 03:00:00,2.3,-0.246422,-0.234733,-1.187218,-0.688829,-0.817338,-0.946749,0.709761,0.634878,...,-0.871739,-0.871739,-0.759572,-0.759566,-0.749365,-0.739161,-0.728955,-0.708556,-0.677961,2.0
28,2016-01-02 04:00:00,2.0,-0.246422,-0.234733,-1.187218,-0.688829,-0.817338,-0.946749,0.709761,0.65493,...,-0.861543,-0.871739,-0.871736,-0.759566,-0.759562,-0.749358,-0.739151,-0.728949,-0.70855,1.9


In [39]:
features = numerical_features + ['temperature_2m (°C)', 'day', 'year', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']

train_data = data[(data['year'] >= 2016) & (data['year'] <= 2020)]
val_data = data[data['year'] == 2021]
test_data = data[data['year'] == 2022]

def df_to_tensor(df, feature_cols, target_col):
    X = torch.tensor(df[feature_cols].values, dtype=torch.float32)
    y = torch.tensor(df[target_col].values, dtype=torch.float32).unsqueeze(1)
    return X, y

X_train, y_train = df_to_tensor(train_data, features, 'target')
X_val, y_val = df_to_tensor(val_data, features, 'target')
X_test, y_test = df_to_tensor(test_data, features, 'target')

### Single-Layer Linear Regression

In [40]:
class LinearRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(LinearRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.linear(x)

linear_model = LinearRegressionModel(input_dim=X_train.shape[1])

loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(linear_model.parameters(), lr=0.01)

EPOCHS = 10000

for epoch in range(EPOCHS):
    linear_model.train()
    y_pred = linear_model(X_train)
    loss = loss_fn(y_pred, y_train)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        linear_model.eval()
        val_pred = linear_model(X_val)
        val_loss = loss_fn(val_pred, y_val)
        print(f"Epoch {epoch}: Train Loss = {loss.item():.4f}, Val Loss = {val_loss.item():.4f}")

linear_model.eval()
test_pred = linear_model(X_test)
test_loss = loss_fn(test_pred, y_test)
print(f"Test Loss: {test_loss.item():.4f}")

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Convert predictions and targets to numpy arrays
y_true = y_test.detach().numpy()
y_pred_np = test_pred.detach().numpy().round(1)

mae = mean_absolute_error(y_true, y_pred_np)
rmse = np.sqrt(mean_squared_error(y_true, y_pred_np))
r2 = r2_score(y_true, y_pred_np)
exact_matches = np.mean(np.abs(y_true - y_pred_np) == 0) * 100
within_01 = np.mean(np.abs(y_true - y_pred_np) <= 0.1) * 100
within_05 = np.mean(np.abs(y_true - y_pred_np) <= 0.5) * 100
within_1 = np.mean(np.abs(y_true - y_pred_np) <= 1) * 100
within_2 = np.mean(np.abs(y_true - y_pred_np) <= 2) * 100

print()
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")
print("\nPrediction Accuracy:")
print(f"- Exact matches: {exact_matches:.1f}%")
print(f"- within 0.1: {within_01:.1f}%")
print(f"- within 0.5: {within_05:.1f}%")
print(f"- Within ±1°C: {within_1:.1f}%")
print(f"- Within ±2°C: {within_2:.1f}%")

Epoch 0: Train Loss = 13523.5791, Val Loss = 9260.5928
Epoch 10: Train Loss = 1694.2975, Val Loss = 1969.8350
Epoch 20: Train Loss = 14.1291, Val Loss = 47.9071
Epoch 30: Train Loss = 206.9712, Val Loss = 118.9634
Epoch 40: Train Loss = 127.8408, Val Loss = 113.0841
Epoch 50: Train Loss = 32.6555, Val Loss = 39.0648
Epoch 60: Train Loss = 7.7501, Val Loss = 10.6714
Epoch 70: Train Loss = 4.9746, Val Loss = 5.1119
Epoch 80: Train Loss = 4.8677, Val Loss = 4.4105
Epoch 90: Train Loss = 4.6765, Val Loss = 4.2267
Epoch 100: Train Loss = 4.4305, Val Loss = 4.0463
Epoch 110: Train Loss = 4.2203, Val Loss = 3.8685
Epoch 120: Train Loss = 4.0484, Val Loss = 3.7115
Epoch 130: Train Loss = 3.8989, Val Loss = 3.5677
Epoch 140: Train Loss = 3.7627, Val Loss = 3.4347
Epoch 150: Train Loss = 3.6360, Val Loss = 3.3113
Epoch 160: Train Loss = 3.5171, Val Loss = 3.1956
Epoch 170: Train Loss = 3.4050, Val Loss = 3.0880
Epoch 180: Train Loss = 3.2993, Val Loss = 2.9873
Epoch 190: Train Loss = 3.1995, Val

### 3-Layer Neural Network

In [41]:
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [42]:
class TemperaturePredictor(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        return self.fc3(x)

model = TemperaturePredictor(input_size=len(features))
print(model)

TemperaturePredictor(
  (fc1): Linear(in_features=39, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
)


In [43]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)

In [44]:
def train_model(model, train_loader, val_loader, epochs=50):
    train_losses, val_losses = [], []
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                outputs = model(X_batch)
                val_loss += criterion(outputs, y_batch).item()
        
        # Store losses
        train_losses.append(train_loss / len(train_loader))
        val_losses.append(val_loss / len(val_loader))
        scheduler.step(val_losses[-1])
        
        # Print progress
        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_losses[-1]:.4f} | Val Loss: {val_losses[-1]:.4f}")
    
    return train_losses, val_losses

train_losses, val_losses = train_model(model, train_loader, val_loader, epochs=50)

Epoch 5/50 | Train Loss: 5.7790 | Val Loss: 1.5994
Epoch 10/50 | Train Loss: 3.2931 | Val Loss: 2.1640
Epoch 15/50 | Train Loss: 3.0056 | Val Loss: 2.4554
Epoch 20/50 | Train Loss: 3.0116 | Val Loss: 2.6317
Epoch 25/50 | Train Loss: 3.0013 | Val Loss: 2.6678
Epoch 30/50 | Train Loss: 2.9836 | Val Loss: 2.6628
Epoch 35/50 | Train Loss: 2.9942 | Val Loss: 2.6632
Epoch 40/50 | Train Loss: 2.9732 | Val Loss: 2.6635
Epoch 45/50 | Train Loss: 2.9743 | Val Loss: 2.6637
Epoch 50/50 | Train Loss: 2.9977 | Val Loss: 2.6687


In [47]:
model.eval()
test_loss = 0
predictions, actuals = [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        test_loss += criterion(outputs, y_batch).item()
        predictions.extend(outputs.numpy().flatten())
        actuals.extend(y_batch.numpy().flatten())

test_loss /= len(test_loader)
print(f"Test Loss (MSE): {test_loss:.4f}")

predictions = np.array(predictions)
actuals = np.array(actuals)

print(f"MAE: {mean_absolute_error(actuals, predictions)}°C")
print(f"RMSE: {np.sqrt(mean_squared_error(actuals, predictions))}°C")
print(f"R²: {r2_score(actuals, predictions)}")

Test Loss (MSE): 3.0566
MAE: 1.516028642654419°C
RMSE: 1.749852651114774°C
R²: 0.9720190167427063
