Test for models.

Linear Model test

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

In [4]:
# Make the project's local model implementations importable.
# The path is relative, so it resolves against the current working directory.
sys.path.append('../Models')
from Linear import LinearRegression

In [5]:
# Load the train / test splits; the "Date" column is parsed as dates on read.
Data_train = pd.read_csv("../Data/train.csv", parse_dates=["Date"])
Data_test = pd.read_csv("../Data/test.csv", parse_dates=["Date"])

Two cross-validation techniques

In [6]:
def cross_validation_rolling(X, Y, model, n_splits=5):
    """
    Cross-validation on a rolling (expanding-window) basis.

    The rows are cut into ``n_splits + 1`` consecutive folds of equal size.
    At step ``i`` the model is fitted on the first ``i + 1`` folds and scored
    on the fold that immediately follows them:

    [--**      ]
    [----**    ]
    [------**  ]
    [--------**]

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Features, one row per observation, in chronological order.
    Y : pandas.Series or numpy.ndarray
        Targets aligned row-by-row with ``X``.
    model : object
        Anything exposing ``fit(X, Y)`` and ``predict(X)``. The same object
        is refitted on every split.
    n_splits : int, default 5
        Number of train/validation splits.

    Returns
    -------
    float
        Mean absolute error, averaged over the splits.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1 or there are fewer than
        ``n_splits + 1`` rows (every fold would be empty).
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # n_splits + 1 folds are needed: the first one is train-only and each of
    # the remaining n_splits folds serves once as the validation window.
    # Dividing by n_splits instead leaves (almost) nothing for the last split.
    fold_size = len(X) // (n_splits + 1)
    if fold_size == 0:
        raise ValueError(
            f"Need at least {n_splits + 1} rows for {n_splits} rolling splits, got {len(X)}"
        )

    # Positional row slicing that works for both pandas objects and arrays.
    X_rows = X.iloc if hasattr(X, "iloc") else X
    Y_rows = Y.iloc if hasattr(Y, "iloc") else Y

    errors = []
    for i in range(n_splits):
        train_end = fold_size * (i + 1)
        val_end = train_end + fold_size

        model.fit(X_rows[:train_end], Y_rows[:train_end])
        Y_pred = model.predict(X_rows[train_end:val_end])

        # Compare raw values so a prediction carrying its own pandas index
        # cannot be misaligned with the validation targets.
        Y_val = np.asarray(Y_rows[train_end:val_end])
        errors.append(np.mean(np.abs(Y_val - np.asarray(Y_pred))))

    return np.mean(errors)

def cross_validation_blocked(X, Y, model, n_splits=5, train_ratio=0.7):
    """
    Blocked cross-validation.

    The rows are cut into ``n_splits`` consecutive, non-overlapping blocks.
    Inside each block the first ``train_ratio`` share of the rows is used for
    fitting and the rest of the block for validation:

    [---**      ]
    [  ---**    ]
    [    ---**  ]
    [      ---**]

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Features, one row per observation, in chronological order.
    Y : pandas.Series or numpy.ndarray
        Targets aligned row-by-row with ``X``.
    model : object
        Anything exposing ``fit(X, Y)`` and ``predict(X)``. The same object
        is refitted on every block.
    n_splits : int, default 5
        Number of blocks.
    train_ratio : float, default 0.7
        Share of each block used for training; must lie strictly in (0, 1).

    Returns
    -------
    float
        Mean absolute error, averaged over the blocks.

    Raises
    ------
    AssertionError
        If ``train_ratio`` is not strictly between 0 and 1.
    ValueError
        If ``n_splits`` is smaller than 1 or the blocks are too small to
        contain at least one training row.
    """
    assert 0 < train_ratio < 1.0, "train_ratio must be between 0 and 1"
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # Exactly n_splits blocks are consumed, so the data is divided by n_splits;
    # dividing by n_splits + 1 would leave the most recent rows unused.
    block_size = len(X) // n_splits
    train_size = int(train_ratio * block_size)
    if train_size < 1:
        raise ValueError(
            f"Blocks of {block_size} rows are too small for train_ratio={train_ratio}"
        )

    # Positional row slicing that works for both pandas objects and arrays.
    X_rows = X.iloc if hasattr(X, "iloc") else X
    Y_rows = Y.iloc if hasattr(Y, "iloc") else Y

    errors = []
    for i in range(n_splits):
        start_block = i * block_size
        split_point = start_block + train_size
        end_block = start_block + block_size

        model.fit(X_rows[start_block:split_point], Y_rows[start_block:split_point])
        Y_pred = model.predict(X_rows[split_point:end_block])

        # Compare raw values so a prediction carrying its own pandas index
        # cannot be misaligned with the validation targets.
        Y_val = np.asarray(Y_rows[split_point:end_block])
        errors.append(np.mean(np.abs(Y_val - np.asarray(Y_pred))))

    return np.mean(errors)



In [15]:
def normalize(X, mean=None, std=None):
    """Standardize features to zero mean and unit scale.

    DataFrame input (feature-aware)
        Only non-binary columns are standardized; boolean columns and columns
        whose non-null values are a subset of {0, 1} are passed through
        unchanged. When both ``mean`` and ``std`` are given they are expected
        to be pandas Series (typically computed on the training frame); the
        columns listed in ``mean.index`` are scaled with those statistics.
        If either one is missing, both are recomputed from ``X``.

    Array input
        Every column is standardized along axis 0. ``mean`` and ``std`` are
        computed from ``X`` individually when not supplied.

    A standard deviation of exactly 0 is replaced by 1 so that constant
    columns do not cause a division by zero. Caller-supplied ``mean`` / ``std``
    objects are never modified.

    Note: the DataFrame path relies on ``DataFrame.std`` (sample standard
    deviation by pandas' default) while the array path relies on
    ``ndarray.std`` (population standard deviation by numpy's default), so
    the two paths are not numerically interchangeable.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Data to normalize; it is not modified.
    mean, std : optional
        Statistics to reuse (e.g. train statistics applied to test data).

    Returns
    -------
    tuple
        ``(X_norm, mean, std)`` where ``std`` has its zeros replaced by 1.
    """
    # DataFrame path: feature-aware
    if isinstance(X, pd.DataFrame):
        X_norm = X.copy()

        # If stats provided, reuse them (train -> test)
        if mean is not None and std is not None:
            scale_cols = list(mean.index)
        else:
            # Auto-detect binary / one-hot columns: values subset of {0, 1}
            scale_cols = []
            for col in X_norm.columns:
                s = X_norm[col]

                # Treat bool as binary passthrough
                if s.dtype == bool:
                    continue

                vals = pd.unique(s.dropna())
                if len(vals) <= 2 and set(vals).issubset({0, 1}):
                    continue

                # Otherwise: continuous -> scale
                scale_cols.append(col)

            mean = X_norm[scale_cols].mean(axis=0)
            std = X_norm[scale_cols].std(axis=0)

        # Avoid division by zero (Series.replace returns a new Series)
        std = std.replace(0, 1.0)

        # Scale only continuous columns
        if len(scale_cols) > 0:
            X_norm[scale_cols] = (X_norm[scale_cols] - mean) / std

        return X_norm, mean, std

    # Array path: standardize every column
    if mean is None:
        mean = X.mean(axis=0)
    if std is None:
        std = X.std(axis=0)

    # Avoid X/0 without writing into the caller's array; np.where also copes
    # with the scalar std produced by 1-D input.
    std = np.where(np.asarray(std) == 0, 1.0, std)
    X_norm = (X - mean) / std
    return X_norm, mean, std

In [16]:
def submit_predictions(ids, predictions, filename = None, output_dir = "../Results"):
    """
    Write a submission CSV with the columns ``Id`` and ``Net_demand``.

    The file is created if it does not exist and overwritten otherwise; the
    output directory is created on demand.

    Parameters
    ----------
    ids : array-like
        Values for the ``Id`` column.
    predictions : array-like
        Values for the ``Net_demand`` column, one per id.
    filename : str, optional
        File name with or without the ``.csv`` extension (the extension is
        appended only when missing). Defaults to ``"submission"``.
    output_dir : str, default "../Results"
        Directory the file is written to.
    """
    submission_df = pd.DataFrame({
        "Id": ids,
        "Net_demand": predictions
    })
    if filename is None:
        filename = "submission"
    # Do not produce "name.csv.csv" when the caller already gave the extension.
    if not filename.endswith(".csv"):
        filename = filename + ".csv"

    filepath = os.path.join(output_dir, filename)

    if os.path.exists(filepath):
        print(f"File {filename} already exists. Overwriting...")
    else:
        print(f"Creating submission file {filename}...")

    # to_csv does not create missing directories, so make sure it exists.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    submission_df.to_csv(filepath, index=False)
    print(f"Submission file {filename} created.")

Columns in train but not in test: **{'Solar_power', 'Wind_power', 'Load', 'Net_demand'}**
Columns in test but not in train: **{'Usage', 'Id'}**

In [17]:
# Drop the target, the raw date and the columns that train and test do not share.
X_train = Data_train.drop(columns=["Net_demand", "Date", "Solar_power", "Wind_power", "Load"])
y_train = Data_train["Net_demand"]
X_test = Data_test.drop(columns=["Date", "Usage", "Id"])

# One-hot encode WeekDays (drops Monday=0 as reference category).
# The category levels are fixed from the training data first: calling
# get_dummies(drop_first=True) independently on each frame drops whatever
# level happens to be the smallest one *present in that frame*, so a test set
# lacking the training reference level would be encoded against a different
# baseline and its rows silently mis-encoded.
weekday_levels = sorted(X_train["WeekDays"].dropna().unique())
X_train = X_train.assign(WeekDays=pd.Categorical(X_train["WeekDays"], categories=weekday_levels))
X_test = X_test.assign(WeekDays=pd.Categorical(X_test["WeekDays"], categories=weekday_levels))

X_train = pd.get_dummies(X_train, columns=['WeekDays'], prefix='WeekDays', drop_first=True, dtype=float)
X_test = pd.get_dummies(X_test, columns=['WeekDays'], prefix='WeekDays', drop_first=True, dtype=float)

# Ensure test has same columns (and column order) as train
X_test = X_test.reindex(columns=X_train.columns, fill_value=0.0)

y_train_np = y_train.values

In [18]:
# Inspect the encoded feature frame (before normalization).
X_train

Unnamed: 0,Load.1,Load.7,Temp,Temp_s95,Temp_s99,Temp_s95_min,Temp_s95_max,Temp_s99_min,Temp_s99_max,Wind,...,Wind_power.1,Wind_power.7,Net_demand.1,Net_demand.7,WeekDays_1,WeekDays_2,WeekDays_3,WeekDays_4,WeekDays_5,WeekDays_6
0,76353.208333,78166.125000,276.243539,276.528356,275.983263,276.143112,276.914295,275.875080,276.116254,3.591094,...,1051.125000,1051.125000,68453.000000,68453.000000,0.0,0.0,0.0,0.0,1.0,0.0
1,69902.979167,75368.020833,276.945418,276.480771,276.092675,275.053924,278.136641,275.755785,276.505025,3.061055,...,1051.125000,1248.062500,68453.000000,63047.895833,0.0,0.0,0.0,0.0,0.0,1.0
2,64929.250000,80191.604167,280.044604,278.478491,276.868222,275.767389,281.543954,276.169372,278.000776,3.973550,...,1248.062500,2807.708333,63047.895833,65816.041667,0.0,0.0,0.0,0.0,0.0,0.0
3,69275.437500,79539.187500,282.979150,282.078125,278.864944,280.533954,283.842078,278.025304,279.995542,5.167031,...,2807.708333,2991.416667,65816.041667,63442.854167,1.0,0.0,0.0,0.0,0.0,0.0
4,66720.000000,78255.416667,283.428551,283.405606,280.646612,282.617220,284.141240,280.019945,281.344361,3.575167,...,2991.416667,1454.041667,63442.854167,62736.583333,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3466,39762.375000,36820.104167,294.952582,294.582449,294.960018,292.424136,296.712824,294.530955,295.348607,3.565889,...,2362.083333,1395.229167,34342.291667,32833.187500,0.0,0.0,0.0,0.0,0.0,1.0
3467,37743.625000,42415.895833,295.949739,295.587284,295.177006,293.374627,297.720791,294.680368,295.693139,3.564721,...,3594.791667,1797.166667,30838.208333,38011.687500,0.0,0.0,0.0,0.0,0.0,0.0
3468,44106.250000,44445.312500,295.280122,295.672056,295.429284,294.182596,296.959805,295.104115,295.715848,3.084633,...,3826.958333,1201.541667,37584.791667,40434.895833,1.0,0.0,0.0,0.0,0.0,0.0
3469,45844.604167,45281.604167,294.679722,294.850625,295.212269,293.267581,296.218931,294.902646,295.509730,3.424414,...,2671.770833,1250.208333,40491.166667,40670.250000,0.0,1.0,0.0,0.0,0.0,0.0


In [19]:
# Feature-aware normalization (train stats reused for test).
# Note: X_train / X_test are rebound to their normalized versions here, so any
# later cell sees the scaled features rather than the raw ones.
X_train, X_mean, X_std = normalize(X_train)
X_test, _, _ = normalize(X_test, mean=X_mean, std=X_std)

# Convert to numpy for the model (training logic unchanged)
X_train_np = X_train.values
X_test_np = X_test.values

In [20]:
# Inspect the normalized feature matrix that is passed to the model.
X_train_np

array([[ 2.15389225,  2.30960997, -1.61167678, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.54913674,  2.04823576, -1.49751429, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.08281377,  2.49881236, -0.99342359, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.86949265, -0.84029084,  1.48467298, ...,  0.        ,
         0.        ,  0.        ],
       [-0.70650941, -0.76217185,  1.38701631, ...,  0.        ,
         0.        ,  0.        ],
       [-0.74207856, -0.77268645,  1.43636182, ...,  0.        ,
         0.        ,  0.        ]], shape=(3471, 39))

In [21]:
# Fit the project's LinearRegression on the full normalized training set, then
# predict the normalized test set.
# NOTE(review): learning_rate=1e-2 and maxIter=5000 are hard-coded here;
# presumably hand-tuned optimizer settings — confirm against Models/Linear.
model = LinearRegression(learning_rate=1e-2, maxIter=5000)
model.fit(X_train_np, y_train_np)
y_test_pred = model.predict(X_test_np)

In [22]:
# Save the predictions as ../Results/all_params_linear_rmse_submission.csv
submit_predictions(Data_test["Id"], y_test_pred, filename="all_params_linear_rmse_submission")


Creating submission file all_params_linear_rmse_submission.csv...
Submission file all_params_linear_rmse_submission.csv created.
