Test for models.

Linear Model test

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

In [6]:
# Code paths 
sys.path.append('../Models')
from Linear import LinearRegression

In [7]:
Data_train = pd.read_csv(
    "../Data/train.csv",
    parse_dates=["Date"]
)

Data_test = pd.read_csv(
    "../Data/test.csv",
    parse_dates=["Date"]
)

2 Cross Validation technics

In [8]:
def cross_validation_rolling(X, Y, model, n_splits=5):
    """
    Cross-validation on a rolling basis
    [--**      ]
    [----**    ]
    [------**  ]
    [--------**]
    """
    fold_size = len(X) // n_splits
    errors = []

    for i in range(n_splits):
        X_train = X.iloc[:fold_size * (i + 1)]
        Y_train = Y.iloc[:fold_size * (i + 1)]
        X_val = X.iloc[fold_size * (i + 1):fold_size * (i + 2)]
        Y_val = Y.iloc[fold_size * (i + 1):fold_size * (i + 2)]

        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_val)

        error = np.mean(np.abs(Y_val - Y_pred))
        errors.append(error)

    final_error = np.mean(errors)
    return final_error

def cross_validation_blocked(X, Y, model, n_splits=5, train_ratio=0.7):
    """
    blocked cross-validation
    [---**      ]
    [  ---**    ]
    [    ---**  ]
    [      ---**]
    """
    assert train_ratio < 1.0 and train_ratio > 0, "train_ratio must be between 0 and 1"

    block_size = len(X) // (n_splits + 1)
    train_size = int(train_ratio * block_size)
    errors = []

    for i in range(n_splits):
        start_block = i * block_size
        end_block = start_block + block_size

        X_train = X.iloc[start_block: start_block + train_size]
        Y_train = Y.iloc[start_block: start_block + train_size]
        X_val = X.iloc[start_block + train_size:end_block]
        Y_val = Y.iloc[start_block + train_size:end_block]

        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_val)

        error = np.mean(np.abs(Y_val - Y_pred))
        errors.append(error)

    final_error = np.mean(errors)
    return final_error



In [9]:
def find_continuous_columns(X):
    """Find continuous columns in a DataFrame X.

    Continuous columns are defined as those that are not binary (0/1) or boolean.

    Returns a list of column names that are considered continuous.
    """
    continuous_cols = []
    for col in X.columns:
        s = X[col]

        # If bool then skip
        if s.dtype == bool:
            continue

        vals = pd.unique(s.dropna())
        if len(vals) <= 2 and set(vals).issubset({0, 1}):
            continue

        # Otherwise has to be "continuous"
        continuous_cols.append(col)

    return continuous_cols

In [10]:
def normalize(X, scale_cols=None):
    """Normalize continuous features only.

    - If X is a DataFrame: standardize non-binary columns (mean/std), leave 0/1 columns unchanged.
    - If scale_cols are provided, they are used for scaling.
    """
    if isinstance(X, pd.DataFrame):
        X_norm = X.copy()

        # If stats are provided then reuse them
        if scale_cols is None:
            # Detect binary / one-hot columns
            scale_cols = find_continuous_columns(X_norm)

        mean = X_norm[scale_cols].mean(axis=0)
        std = X_norm[scale_cols].std(axis=0)

        # Avoid division by zero
        std = std.replace(0, 1.)

        # We sclae only continuous columns
        if len(scale_cols) > 0:
            X_norm[scale_cols] = (X_norm[scale_cols] - mean) / std

        return X_norm, mean, std
    else:
        # Numpy array: scale all columns
        if mean is None:
            mean = np.mean(X, axis=0)
        if std is None:
            std = np.std(X, axis=0)
            std[std == 0] = 1.0  # Avoid division by zero

        X_norm = (X - mean) / std
        return X_norm, mean, std

In [11]:
def denormalize(X_norm, scale_cols, mean, std):
    """Denormalize continuous features only.

    - If X_norm is a DataFrame: denormalize only the columns in scale_cols.
    - If X_norm is a numpy array: denormalize all columns.
    """
    if isinstance(X_norm, pd.DataFrame):
        X_denorm = X_norm.copy()
        if len(scale_cols) > 0:
            X_denorm[scale_cols] = X_denorm[scale_cols] * std + mean
        return X_denorm
    else:
        # Numpy array: denormalize all columns
        X_denorm = X_norm * std + mean
        return X_denorm

In [12]:
def submit_predictions(ids, predictions, filename = None):
    """
    create a submission file if the file does not exist or just overwrite it
    """
    submission_df = pd.DataFrame({
        "Id": ids,
        "Net_demand": predictions
    })
    if filename is None:
        filename = "submission"
    filename = filename + ".csv"

    filepath = "../Results/" + filename

    if os.path.exists(filepath):
        print(f"File {filename} already exists. Overwriting...")
    else:
        print(f"Creating submission file {filename}...")

    submission_df.to_csv(filepath, index=False)
    print(f"Submission file {filename} created.")

Columns in train but not in test: **{'Solar_power', 'Wind_power', 'Load', 'Net_demand'}**
Columns in test but not in train: **{'Usage', 'Id'}**

In [13]:
X_train = Data_train.drop(columns=["Net_demand", "Date", "Solar_power", "Wind_power", "Load"])
y_train = Data_train["Net_demand"]
X_test = Data_test.drop(columns=["Date", "Usage", "Id"])

# One-hot encode WeekDays (drops Monday=0 as reference category)
X_train = pd.get_dummies(X_train, columns=['WeekDays'], prefix='WeekDays', drop_first=True, dtype=float)
X_test = pd.get_dummies(X_test, columns=['WeekDays'], prefix='WeekDays', drop_first=True, dtype=float)

# Ensure test has same columns as train
X_test = X_test.reindex(columns=X_train.columns, fill_value=0.0)

y_train_np = y_train.values

In [14]:
X_train

Unnamed: 0,Load.1,Load.7,Temp,Temp_s95,Temp_s99,Temp_s95_min,Temp_s95_max,Temp_s99_min,Temp_s99_max,Wind,...,Wind_power.1,Wind_power.7,Net_demand.1,Net_demand.7,WeekDays_1,WeekDays_2,WeekDays_3,WeekDays_4,WeekDays_5,WeekDays_6
0,76353.208333,78166.125000,276.243539,276.528356,275.983263,276.143112,276.914295,275.875080,276.116254,3.591094,...,1051.125000,1051.125000,68453.000000,68453.000000,0.0,0.0,0.0,0.0,1.0,0.0
1,69902.979167,75368.020833,276.945418,276.480771,276.092675,275.053924,278.136641,275.755785,276.505025,3.061055,...,1051.125000,1248.062500,68453.000000,63047.895833,0.0,0.0,0.0,0.0,0.0,1.0
2,64929.250000,80191.604167,280.044604,278.478491,276.868222,275.767389,281.543954,276.169372,278.000776,3.973550,...,1248.062500,2807.708333,63047.895833,65816.041667,0.0,0.0,0.0,0.0,0.0,0.0
3,69275.437500,79539.187500,282.979150,282.078125,278.864944,280.533954,283.842078,278.025304,279.995542,5.167031,...,2807.708333,2991.416667,65816.041667,63442.854167,1.0,0.0,0.0,0.0,0.0,0.0
4,66720.000000,78255.416667,283.428551,283.405606,280.646612,282.617220,284.141240,280.019945,281.344361,3.575167,...,2991.416667,1454.041667,63442.854167,62736.583333,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3466,39762.375000,36820.104167,294.952582,294.582449,294.960018,292.424136,296.712824,294.530955,295.348607,3.565889,...,2362.083333,1395.229167,34342.291667,32833.187500,0.0,0.0,0.0,0.0,0.0,1.0
3467,37743.625000,42415.895833,295.949739,295.587284,295.177006,293.374627,297.720791,294.680368,295.693139,3.564721,...,3594.791667,1797.166667,30838.208333,38011.687500,0.0,0.0,0.0,0.0,0.0,0.0
3468,44106.250000,44445.312500,295.280122,295.672056,295.429284,294.182596,296.959805,295.104115,295.715848,3.084633,...,3826.958333,1201.541667,37584.791667,40434.895833,1.0,0.0,0.0,0.0,0.0,0.0
3469,45844.604167,45281.604167,294.679722,294.850625,295.212269,293.267581,296.218931,294.902646,295.509730,3.424414,...,2671.770833,1250.208333,40491.166667,40670.250000,0.0,1.0,0.0,0.0,0.0,0.0


In [15]:
scale_cols = find_continuous_columns(X_train)
# Feature-aware normalization (train stats reused for test)
X_train, X_mean, X_std = normalize(X_train, scale_cols=scale_cols)
X_test, _, _ = normalize(X_test, scale_cols=scale_cols)

# Convert to numpy for the model (training logic is unchanged)
X_train_np = X_train.values
X_test_np = X_test.values

In [16]:
X_train_np

array([[ 2.15389225,  2.30960997, -1.61167678, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.54913674,  2.04823576, -1.49751429, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.08281377,  2.49881236, -0.99342359, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.86949265, -0.84029084,  1.48467298, ...,  0.        ,
         0.        ,  0.        ],
       [-0.70650941, -0.76217185,  1.38701631, ...,  0.        ,
         0.        ,  0.        ],
       [-0.74207856, -0.77268645,  1.43636182, ...,  0.        ,
         0.        ,  0.        ]], shape=(3471, 39))

In [11]:
model = LinearRegression(learning_rate=0.02, maxIter=10000)
model.fit(X_train_np, y_train_np, loss="rmse")
y_test_pred = model.predict(X_test_np)

In [12]:
y_test_pred

array([41274.37416549, 34882.40382455, 30973.47912552, 38211.30525795,
       40098.87002389, 41069.78471989, 39280.3482329 , 36232.26687203,
       32571.03374039, 31939.85329955, 37824.01202565, 38439.73514865,
       40091.480162  , 40902.08104234, 40067.67429255, 34742.67739648,
       31987.17107452, 39611.15147547, 41155.75811798, 41235.60983157,
       41818.64620212, 41411.6537338 , 35482.79599722, 33427.41100562,
       38180.15686962, 37294.36372798, 39134.55826496, 43297.69882267,
       41237.88649855, 31427.98972256, 30969.5864827 , 41833.73655594,
       43019.89894523, 38879.26362534, 41224.89897319, 41292.32231924,
       36540.56764161, 34233.07640857, 40421.84589501, 42278.68698917,
       42652.58787173, 42113.32167637, 39250.47266748, 32559.81706694,
       30365.0398756 , 38702.7348861 , 39161.20543173, 37877.72513111,
       37477.59076847, 36483.31999364, 32106.09509586, 27826.51366595,
       34034.74207975, 37854.84066134, 37603.56969538, 36536.43887227,
      

In [13]:
# save csv in ../Results/naive_linear_submission.csv
# submit_predictions(Data_test["Id"], y_test_pred, filename="all_params_linear_rmse_submission")


In [17]:
q = 0.8  # quantile for pinball loss
model = LinearRegression(learning_rate=0.02, maxIter=17000, tau=q)
model.fit(X_train_np, y_train_np, loss="pinball", verbose=True, log_every=500)
y_test_pred = model.predict(X_test_np)
y_test_pred

[pinball] iter=0 loss=39289.739753 mean(y_hat)=0.000 mean(y)=49112.175 frac(y_hat>=y)=0.000


[pinball] iter=500 loss=7344.901480 mean(y_hat)=43637.345 mean(y)=49112.175 frac(y_hat>=y)=0.366
[pinball] iter=1000 loss=2742.964749 mean(y_hat)=54027.563 mean(y)=49112.175 frac(y_hat>=y)=0.667
[pinball] iter=1500 loss=1976.038438 mean(y_hat)=54093.693 mean(y)=49112.175 frac(y_hat>=y)=0.743
[pinball] iter=2000 loss=1668.590723 mean(y_hat)=53430.326 mean(y)=49112.175 frac(y_hat>=y)=0.755
[pinball] iter=2500 loss=1528.029979 mean(y_hat)=53243.915 mean(y)=49112.175 frac(y_hat>=y)=0.769
[pinball] iter=3000 loss=1429.464313 mean(y_hat)=53053.664 mean(y)=49112.175 frac(y_hat>=y)=0.775
[pinball] iter=3500 loss=1340.279373 mean(y_hat)=52825.054 mean(y)=49112.175 frac(y_hat>=y)=0.777
[pinball] iter=4000 loss=1255.552495 mean(y_hat)=52547.971 mean(y)=49112.175 frac(y_hat>=y)=0.779
[pinball] iter=4500 loss=1174.659079 mean(y_hat)=52265.759 mean(y)=49112.175 frac(y_hat>=y)=0.780
[pinball] iter=5000 loss=1097.873576 mean(y_hat)=52026.493 mean(y)=49112.175 frac(y_hat>=y)=0.779
[pinball] iter=5500 l

array([41755.43229281, 35553.04679957, 31910.45632431, 38842.26523572,
       41012.4717809 , 41885.28581698, 40281.94746404, 37648.13420103,
       33763.66069721, 32992.12539391, 38915.62126876, 39697.89396187,
       41164.41369282, 41749.46059143, 41082.78987031, 36009.31076911,
       33254.71574426, 40796.76484921, 42901.01541462, 43087.58860354,
       43301.05737543, 42713.48052073, 36566.62233162, 34560.74714983,
       39309.97968594, 39057.8450932 , 40774.43383281, 44521.72048869,
       43114.58184002, 33603.49235872, 32675.36587726, 42792.77197708,
       44609.90215526, 41021.84326149, 42899.24880076, 42965.87824681,
       38112.0902175 , 35803.38902974, 41671.68170745, 43755.31221275,
       44184.47296121, 43491.33425973, 40867.37515697, 34339.0956495 ,
       31973.94788103, 39935.96437055, 40867.17168982, 39684.5529868 ,
       39287.88067044, 38313.31152878, 33461.650204  , 29304.58648229,
       35341.48675945, 39228.11626627, 39251.2947491 , 38167.81397827,
      

In [18]:
# Feature relevance (no retraining): rank coefficients by |value|

feature_names = list(X_train.columns)
coefs = np.asarray(model.weights).reshape(-1)
assert len(feature_names) == len(coefs), (len(feature_names), len(coefs))

# Binary / one-hot columns are exactly 0/1 after our feature-aware normalization
is_binary = []
for c in feature_names:
    vals = pd.unique(X_train[c].dropna())
    is_binary.append(len(vals) <= 2 and set(vals).issubset({0, 1}))

rank_df = pd.DataFrame({
    "feature": feature_names,
    "coef": coefs,
    "abs_coef": np.abs(coefs),
    "type": np.where(is_binary, "binary/one-hot", "continuous (normalized)")
}).sort_values("abs_coef", ascending=False)

rank_df.head(40)

Unnamed: 0,feature,coef,abs_coef,type
26,BH_Holiday,-6515.135477,6515.135477,continuous (normalized)
38,WeekDays_6,5857.14,5857.14,binary/one-hot
36,WeekDays_4,-4992.86,4992.86,binary/one-hot
35,WeekDays_3,-4430.22,4430.22,binary/one-hot
37,WeekDays_5,4349.78,4349.78,binary/one-hot
34,WeekDays_2,-4156.32,4156.32,binary/one-hot
0,Load.1,4147.721623,4147.721623,continuous (normalized)
31,Net_demand.1,3949.812541,3949.812541,continuous (normalized)
16,BH_after,3696.06,3696.06,binary/one-hot
15,BH,3696.06,3696.06,binary/one-hot


In [19]:
submit_predictions(Data_test["Id"], y_test_pred, filename="all_params_linear_pinball_submission")

File all_params_linear_pinball_submission.csv already exists. Overwriting...
Submission file all_params_linear_pinball_submission.csv created.


# Parameters removed or added

In [17]:
# submit_predictions(Data_test["Id"], y_test_pred, filename="month_dropped_linear_pinball_submission")

In [20]:
# Automatic remove and show stats
def retrain_after_dropping(
    X_train,
    X_test,
    y_train_np,
    drop_features,
    q,
    learning_rate=0.02,
    maxIter=17000,
    verbose=True
):
    # Drop features
    X_train_red = X_train.drop(columns=drop_features)
    X_test_red = X_test.drop(columns=drop_features)

    # Convert to numpy
    X_train_np = X_train_red.values
    X_test_np = X_test_red.values

    # Train model
    model = LinearRegression(
        learning_rate=learning_rate,
        maxIter=maxIter,
        tau=q
    )
    model.fit(
        X_train_np,
        y_train_np,
        loss="pinball",
        verbose=verbose,
        log_every=500
    )

    # Predictions (optional but useful)
    y_test_pred = model.predict(X_test_np)

    # Coefficient analysis
    feature_names = list(X_train_red.columns)
    coefs = np.asarray(model.weights).reshape(-1)

    is_binary = []
    for c in feature_names:
        vals = pd.unique(X_train_red[c].dropna())
        is_binary.append(len(vals) <= 2 and set(vals).issubset({0, 1}))

    rank_df = pd.DataFrame({
        "feature": feature_names,
        "coef": coefs,
        "abs_coef": np.abs(coefs),
        "type": np.where(is_binary, "binary/one-hot", "continuous (normalized)")
    }).sort_values("abs_coef", ascending=False)

    return model, rank_df, y_test_pred


In [22]:
model_no_wind7, rank_no_wind7, y_pred_no_wind7 = retrain_after_dropping(
    X_train, X_test, y_train_np,
    drop_features=["Wind_power.7"],
    q=q
)

print("\nTop coefficients (Wind_power.7):")
print(rank_no_wind7.head(40))


[pinball] iter=0 loss=39289.739753 mean(y_hat)=0.000 mean(y)=49112.175 frac(y_hat>=y)=0.000


[pinball] iter=500 loss=7344.186621 mean(y_hat)=43639.259 mean(y)=49112.175 frac(y_hat>=y)=0.366
[pinball] iter=1000 loss=2754.962171 mean(y_hat)=54032.606 mean(y)=49112.175 frac(y_hat>=y)=0.666
[pinball] iter=1500 loss=1990.154451 mean(y_hat)=54117.859 mean(y)=49112.175 frac(y_hat>=y)=0.746
[pinball] iter=2000 loss=1680.962196 mean(y_hat)=53484.629 mean(y)=49112.175 frac(y_hat>=y)=0.755
[pinball] iter=2500 loss=1539.834279 mean(y_hat)=53281.245 mean(y)=49112.175 frac(y_hat>=y)=0.768
[pinball] iter=3000 loss=1437.694175 mean(y_hat)=53067.610 mean(y)=49112.175 frac(y_hat>=y)=0.774
[pinball] iter=3500 loss=1345.181047 mean(y_hat)=52827.341 mean(y)=49112.175 frac(y_hat>=y)=0.778
[pinball] iter=4000 loss=1258.640438 mean(y_hat)=52569.326 mean(y)=49112.175 frac(y_hat>=y)=0.778
[pinball] iter=4500 loss=1175.791837 mean(y_hat)=52314.701 mean(y)=49112.175 frac(y_hat>=y)=0.779
[pinball] iter=5000 loss=1097.233780 mean(y_hat)=52058.819 mean(y)=49112.175 frac(y_hat>=y)=0.780
[pinball] iter=5500 l

In [23]:
model_no_holyb, rank_no_holyb, y_pred_no_holyb = retrain_after_dropping(
    X_train, X_test, y_train_np,
    drop_features=["Holiday_zone_b", "Wind_power.7"],
    q=q
)

print("\nTop coefficients (Holiday_zone_b + Wind_power.7 removed):")
print(rank_no_holyb.head(40))


[pinball] iter=0 loss=39289.739753 mean(y_hat)=0.000 mean(y)=49112.175 frac(y_hat>=y)=0.000


[pinball] iter=500 loss=6893.143426 mean(y_hat)=43634.680 mean(y)=49112.175 frac(y_hat>=y)=0.350
[pinball] iter=1000 loss=2621.047395 mean(y_hat)=54179.107 mean(y)=49112.175 frac(y_hat>=y)=0.681
[pinball] iter=1500 loss=2001.205059 mean(y_hat)=54358.013 mean(y)=49112.175 frac(y_hat>=y)=0.749
[pinball] iter=2000 loss=1700.803483 mean(y_hat)=53572.291 mean(y)=49112.175 frac(y_hat>=y)=0.759
[pinball] iter=2500 loss=1544.481590 mean(y_hat)=53287.778 mean(y)=49112.175 frac(y_hat>=y)=0.766
[pinball] iter=3000 loss=1437.376284 mean(y_hat)=53037.800 mean(y)=49112.175 frac(y_hat>=y)=0.773
[pinball] iter=3500 loss=1343.729413 mean(y_hat)=52810.846 mean(y)=49112.175 frac(y_hat>=y)=0.776
[pinball] iter=4000 loss=1257.111527 mean(y_hat)=52579.418 mean(y)=49112.175 frac(y_hat>=y)=0.778
[pinball] iter=4500 loss=1174.306009 mean(y_hat)=52309.549 mean(y)=49112.175 frac(y_hat>=y)=0.779
[pinball] iter=5000 loss=1095.650390 mean(y_hat)=52047.744 mean(y)=49112.175 frac(y_hat>=y)=0.780
[pinball] iter=5500 l

In [24]:
submit_predictions(Data_test["Id"], y_test_pred, filename="no_holyb_wind7_linear_pinball_submission")

Creating submission file no_holyb_wind7_linear_pinball_submission.csv...
Submission file no_holyb_wind7_linear_pinball_submission.csv created.


In [None]:
model_no_christ, rank_no_christ, y_pred_no_christ = retrain_after_dropping(
    X_train, X_test, y_train_np,
    drop_features=["Month", "toy", "Christmas_break"],
    q=q
)

print("\nTop coefficients (Month + toy + Christmas_break removed):")
print(rank_no_christ.head(40))

# DENORMALIZE AFTER PREDICTION