In [5]:
from preprocessing import preprocessing, IMG_FOLDER
import os
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader

In [6]:
class Net(nn.Module):
    """Small fully connected regressor: two hidden ReLU layers and one scalar output.

    Args:
        in_features: Number of input features per sample. Defaults to 13,
            the width that used to be hard-coded.
        hidden: Width of both hidden layers. Defaults to 64.
        dropout: Dropout probability applied after the first hidden
            activation. Defaults to 0.1.
    """

    def __init__(self, in_features=13, hidden=64, dropout=0.1):
        super().__init__()
        # Layer order (and therefore state-dict keys net.0 / net.3 / net.5)
        # is kept identical to the original fixed-size definition.
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result.

        The last dimension of ``x`` must equal ``in_features``; the last
        dimension of the result is 1.
        """
        return self.net(x)


In [7]:
def nn_train(data, Y, epochs=300, k_folds=5, batch_size=64, tolerance=150, seed=None):
    """Cross-validate ``Net`` with K-fold splits and plot the averaged MSE curves.

    For every fold a fresh ``Net()`` is trained with Adam on mini-batches,
    and train/test MSE is recorded after each epoch. The per-epoch curves are
    averaged over folds, plotted, and saved as ``nn-test-train-mse.png`` in
    ``IMG_FOLDER``. The mean R² and the mean share of test predictions
    within ``tolerance`` of the target are printed.

    Args:
        data: Feature table exposing ``.values`` as a 2-D numeric array. Its
            column count must match the input width of ``Net()``.
        Y: Target exposing ``.values``; it is reshaped to a single column.
        epochs: Training epochs per fold.
        k_folds: Number of contiguous (unshuffled) K-fold splits.
        batch_size: Mini-batch size used for training.
        tolerance: Absolute-error threshold for the "accuracy" figure.
        seed: Optional integer. When given, seeds torch's global RNG (weight
            init, dropout) and the batch-shuffling generator so runs repeat.

    Returns:
        The constant ``1`` (kept unchanged for existing callers).
    """
    X_raw = data.values
    y = torch.tensor(Y.values.reshape(-1, 1), dtype=torch.float32)

    # A dedicated generator is only created when a seed is requested; an
    # unseeded torch.Generator() would silently make shuffling deterministic.
    shuffle_rng = None
    if seed is not None:
        torch.manual_seed(seed)
        shuffle_rng = torch.Generator().manual_seed(seed)

    kf = KFold(n_splits=k_folds)

    # Running sums of the per-epoch errors; divided by k_folds after the loop.
    train_mse_all = np.zeros(epochs)
    test_mse_all = np.zeros(epochs)
    acc_all = []
    r2_all = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X_raw), 1):
        print(f"Fold {fold}")

        # Fit the scaler on the training rows only. Fitting on the full data
        # set (as before) leaks test-fold mean/variance into training and
        # biases the cross-validated scores.
        scaler_X = StandardScaler()
        X_train = torch.tensor(scaler_X.fit_transform(X_raw[train_idx]), dtype=torch.float32)
        X_test = torch.tensor(scaler_X.transform(X_raw[test_idx]), dtype=torch.float32)
        y_train, y_test = y[train_idx], y[test_idx]

        # Reshuffle every epoch so mini-batches are not the same fixed,
        # consecutive slices of the data on each pass.
        train_loader = DataLoader(
            TensorDataset(X_train, y_train),
            batch_size=batch_size,
            shuffle=True,
            generator=shuffle_rng,
        )

        model = Net()
        optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.01)
        criterion = nn.MSELoss()

        train_mse_fold = []
        test_mse_fold = []

        for epoch in range(epochs):
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                loss = criterion(model(batch_X), batch_y)
                loss.backward()
                optimizer.step()

            # Per-epoch evaluation with dropout disabled and no autograd graph.
            model.eval()
            with torch.no_grad():
                y_train_pred = model(X_train).numpy()
                y_test_pred = model(X_test).numpy()

            train_mse_fold.append(mean_squared_error(y_train.numpy(), y_train_pred))
            test_mse_fold.append(mean_squared_error(y_test.numpy(), y_test_pred))

        train_mse_all += np.array(train_mse_fold)
        test_mse_all += np.array(test_mse_fold)

        # Final fold metrics. Eval mode is set explicitly instead of relying
        # on the last epoch having left the model in that state.
        model.eval()
        with torch.no_grad():
            final_y_pred = model(X_test).numpy()
        final_y_true = y_test.numpy()

        acc_all.append(np.mean(np.abs(final_y_true - final_y_pred) <= tolerance))
        r2_all.append(r2_score(final_y_true, final_y_pred))

    # Average epoch-wise MSE across all folds
    train_mse_avg = train_mse_all / k_folds
    test_mse_avg = test_mse_all / k_folds

    # Plot the averaged learning curves
    plt.figure(figsize=(10, 6))
    plt.plot(train_mse_avg, label='Avg Training MSE')
    plt.plot(test_mse_avg, label='Avg Test MSE')
    plt.xlabel('Epoch')
    plt.ylabel('Mean Squared Error')
    plt.title('K-Fold CV: Average Training vs Test MSE over Epochs')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target exists.
    if IMG_FOLDER:
        os.makedirs(IMG_FOLDER, exist_ok=True)
    plt.savefig(os.path.join(IMG_FOLDER, 'nn-test-train-mse.png'))
    plt.show()

    # Final averaged scores
    print("\n=== Final Cross-Validated Performance ===")
    print(f"Average R² Score        : {np.mean(r2_all):.4f}")
    print(f"Average Accuracy ±{tolerance}   : {np.mean(acc_all) * 100:.2f}%")
    return 1


In [8]:
# Build the modelling frame, then split it into the 'PAX' target and the
# remaining predictor columns before running the cross-validated training.
df = preprocessing()
Y = df['PAX']
X = df.drop(columns='PAX')
print(X.columns, X.shape[1])
nn_train(X, Y)

Total rows: 16625
Min of seats: 10
Max of seats: 200
Num of rows where pax will be adjusted to seats 812
Unique status values: [1036 1896  288 1694 2024  963]
Number of unique statuses: 6
Unique route values: 96
Number of unique airports: 41
Der er naturligvis høj correlation mellem airport og routes:  98
     Route  Airport  count
39  163565     7543    764
55  163876     8212    711
12  113314     1957    701


invalid command name "129502150344384process_stream_events"
    while executing
"129502150344384process_stream_events"
    ("after" script)


the current columns are:  Index(['Status', 'Airport', 'Route', 'Seats', 'PAX', 'real_date', 'Year',
       'Month', 'Day', 'Week', 'DayOfWeek', 'DayName', 'PCT_occupied',
       'Avg_PCT_occupied_weekly'],
      dtype='object')
          Status  Airport   Route  Seats  PAX  real_date  Year  Month  Day  \
Date                                                                         
20230101    1036     4255  163560    197  172 2023-01-01  2023      1    1   
20230101    1036     4334  147990    189  171 2023-01-01  2023      1    1   
20230101    1896     2106  110466    197  177 2023-01-01  2023      1    1   
20230101    1036      123  100338    197  192 2023-01-01  2023      1    1   
20230101    1896     7543  163565    189  184 2023-01-01  2023      1    1   
...          ...      ...     ...    ...  ...        ...   ...    ...  ...   
20250331    1036      685  160042    197  172 2025-03-31  2025      3   31   
20250331    1036     7457  170013    189  179 2025-03-31  2025      3 

1