Test for models.

Linear Model test

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

In [3]:
# Code paths 
sys.path.append('../Models')
from Linear import LinearModel

In [4]:
Data_train = pd.read_csv(
    "../Data/train.csv",
    parse_dates=["Date"]
)

Data_test = pd.read_csv(
    "../Data/test.csv",
    parse_dates=["Date"]
)

2 Cross Validation technics

In [5]:
def cross_validation_rolling(X, Y, model, n_splits=5):
    """
    Cross-validation on a rolling basis
    [--**      ]
    [----**    ]
    [------**  ]
    [--------**]
    """
    fold_size = len(X) // n_splits
    errors = []

    for i in range(n_splits):
        X_train = X.iloc[:fold_size * (i + 1)]
        Y_train = Y.iloc[:fold_size * (i + 1)]
        X_val = X.iloc[fold_size * (i + 1):fold_size * (i + 2)]
        Y_val = Y.iloc[fold_size * (i + 1):fold_size * (i + 2)]

        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_val)

        error = np.mean(np.abs(Y_val - Y_pred))
        errors.append(error)

    final_error = np.mean(errors)
    return final_error

def cross_validation_blocked(X, Y, model, n_splits=5, train_ratio=0.7):
    """
    blocked cross-validation
    [---**      ]
    [  ---**    ]
    [    ---**  ]
    [      ---**]
    """
    assert train_ratio < 1.0 and train_ratio > 0, "train_ratio must be between 0 and 1"

    block_size = len(X) // (n_splits + 1)
    train_size = int(train_ratio * block_size)
    errors = []

    for i in range(n_splits):
        start_block = i * block_size
        end_block = start_block + block_size

        X_train = X.iloc[start_block: start_block + train_size]
        Y_train = Y.iloc[start_block: start_block + train_size]
        X_val = X.iloc[start_block + train_size:end_block]
        Y_val = Y.iloc[start_block + train_size:end_block]

        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_val)

        error = np.mean(np.abs(Y_val - Y_pred))
        errors.append(error)

    final_error = np.mean(errors)
    return final_error



In [6]:
def find_continuous_columns(X):
    """Find continuous columns in a DataFrame X.

    Continuous columns are defined as those that are not binary (0/1) or boolean.

    Returns a list of column names that are considered continuous.
    """
    continuous_cols = []
    for col in X.columns:
        s = X[col]

        # If bool then skip
        if s.dtype == bool:
            continue

        vals = pd.unique(s.dropna())
        if len(vals) <= 2 and set(vals).issubset({0, 1}):
            continue

        # Otherwise has to be "continuous"
        continuous_cols.append(col)

    return continuous_cols

In [7]:
def normalize(X, scale_cols=None):
    """Normalize continuous features only.

    - If X is a DataFrame: standardize non-binary columns (mean/std), leave 0/1 columns unchanged.
    - If scale_cols are provided, they are used for scaling.
    """
    if isinstance(X, pd.DataFrame):
        X_norm = X.copy()

        # If stats are provided then reuse them
        if scale_cols is None:
            # Detect binary / one-hot columns
            scale_cols = find_continuous_columns(X_norm)

        mean = X_norm[scale_cols].mean(axis=0)
        std = X_norm[scale_cols].std(axis=0)

        # Avoid division by zero
        std = std.replace(0, 1.)

        # We sclae only continuous columns
        if len(scale_cols) > 0:
            X_norm[scale_cols] = (X_norm[scale_cols] - mean) / std

        return X_norm, mean, std
    else:
        # Numpy array: scale all columns
        if mean is None:
            mean = np.mean(X, axis=0)
        if std is None:
            std = np.std(X, axis=0)
            std[std == 0] = 1.0  # Avoid division by zero

        X_norm = (X - mean) / std
        return X_norm, mean, std

In [8]:
def denormalize(X_norm, scale_cols, mean, std):
    """Denormalize continuous features only.

    - If X_norm is a DataFrame: denormalize only the columns in scale_cols.
    - If X_norm is a numpy array: denormalize all columns.
    """
    if isinstance(X_norm, pd.DataFrame):
        X_denorm = X_norm.copy()
        if len(scale_cols) > 0:
            X_denorm[scale_cols] = X_denorm[scale_cols] * std + mean
        return X_denorm
    else:
        # Numpy array: denormalize all columns
        X_denorm = X_norm * std + mean
        return X_denorm

In [9]:
def submit_predictions(ids, predictions, filename = None):
    """
    create a submission file if the file does not exist or just overwrite it
    """
    submission_df = pd.DataFrame({
        "Id": ids,
        "Net_demand": predictions
    })
    if filename is None:
        filename = "submission"
    filename = filename + ".csv"

    filepath = "../Results/" + filename

    if os.path.exists(filepath):
        print(f"File {filename} already exists. Overwriting...")
    else:
        print(f"Creating submission file {filename}...")

    submission_df.to_csv(filepath, index=False)
    print(f"Submission file {filename} created.")

Columns in train but not in test: **{'Solar_power', 'Wind_power', 'Load', 'Net_demand'}**
Columns in test but not in train: **{'Usage', 'Id'}**

In [None]:
import pandas as pd
import numpy as np

# Copie pour éviter les warnings
df = Data_train.copy()

# --- 1. Gestion des Dates et Covid ---
# On s'assure que la date est au format datetime
df['Date'] = pd.to_datetime(df['Date'])

# Variable indicatrice : 1 si on est après le début du 1er confinement (rupture structurelle)
# Date clé : 17 Mars 2020 (début confinement France)
df['Covid_Flag'] = (df['Date'] >= '2020-03-17').astype(int)


# --- 2. Variables Thermiques (Chauffage / Clim) ---
# Votre température est en Kelvin (ex: 276 K = 3°C).
# 0°C = 273.15 K
kelvin_0 = 273.15

# Seuils usuels en électricité (environ 15°C pour allumer le chauffage, 22°C pour la clim)
seuil_chauffage = kelvin_0 + 15  # 288.15 K
seuil_clim      = kelvin_0 + 22  # 295.15 K

# A. Indicateurs simples (0 ou 1)
df['Heating_Flag'] = (df['Temp'] < seuil_chauffage).astype(float)
df['Cooling_Flag'] = (df['Temp'] > seuil_clim).astype(float)

# B. (Optionnel mais recommandé) Intensité du besoin (Degrés-Jours)
# Plus précis qu'un simple 0/1 car cela capture "combien" il fait froid
df['Heating_Degree'] = df['Temp'].apply(lambda x: max(0, seuil_chauffage - x))
df['Cooling_Degree'] = df['Temp'].apply(lambda x: max(0, x - seuil_clim))


# --- 3. Nettoyage et Sélection (Drop) ---
# On ajoute "Nebulosity" à la liste des suppressions
cols_to_drop = [
    "Net_demand", "Date", "Solar_power", "Wind_power", "Load", 
    "Nebulosity", "Nebulosity_weighted", "Id"
]
# On ne garde que les colonnes qui existent vraiment dans le DF
cols_to_drop = [c for c in df.columns if c in cols_to_drop]

X_train = df.drop(columns=cols_to_drop)
y_train = df["Net_demand"]


X_test = Data_test.drop(columns=["Date", "Usage", "Id"])

# One-hot encode WeekDays (drops Monday=0 as reference category)
X_train = pd.get_dummies(X_train, columns=['WeekDays'], prefix='WeekDays', drop_first=True, dtype=float)
X_test = pd.get_dummies(X_test, columns=['WeekDays'], prefix='WeekDays', drop_first=True, dtype=float)

# Ensure test has same columns as train
X_test = X_test.reindex(columns=X_train.columns, fill_value=0.0)

y_train_np = y_train.values

In [11]:
X_train

Unnamed: 0,Load.1,Load.7,Temp,Temp_s95,Temp_s99,Temp_s95_min,Temp_s95_max,Temp_s99_min,Temp_s99_max,Wind,...,Heating_Flag,Cooling_Flag,Heating_Degree,Cooling_Degree,WeekDays_1,WeekDays_2,WeekDays_3,WeekDays_4,WeekDays_5,WeekDays_6
0,76353.208333,78166.125000,276.243539,276.528356,275.983263,276.143112,276.914295,275.875080,276.116254,3.591094,...,1.0,0.0,11.906461,0.000000,0.0,0.0,0.0,0.0,1.0,0.0
1,69902.979167,75368.020833,276.945418,276.480771,276.092675,275.053924,278.136641,275.755785,276.505025,3.061055,...,1.0,0.0,11.204582,0.000000,0.0,0.0,0.0,0.0,0.0,1.0
2,64929.250000,80191.604167,280.044604,278.478491,276.868222,275.767389,281.543954,276.169372,278.000776,3.973550,...,1.0,0.0,8.105396,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,69275.437500,79539.187500,282.979150,282.078125,278.864944,280.533954,283.842078,278.025304,279.995542,5.167031,...,1.0,0.0,5.170850,0.000000,1.0,0.0,0.0,0.0,0.0,0.0
4,66720.000000,78255.416667,283.428551,283.405606,280.646612,282.617220,284.141240,280.019945,281.344361,3.575167,...,1.0,0.0,4.721449,0.000000,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3466,39762.375000,36820.104167,294.952582,294.582449,294.960018,292.424136,296.712824,294.530955,295.348607,3.565889,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1.0
3467,37743.625000,42415.895833,295.949739,295.587284,295.177006,293.374627,297.720791,294.680368,295.693139,3.564721,...,0.0,1.0,0.000000,0.799739,0.0,0.0,0.0,0.0,0.0,0.0
3468,44106.250000,44445.312500,295.280122,295.672056,295.429284,294.182596,296.959805,295.104115,295.715848,3.084633,...,0.0,1.0,0.000000,0.130122,1.0,0.0,0.0,0.0,0.0,0.0
3469,45844.604167,45281.604167,294.679722,294.850625,295.212269,293.267581,296.218931,294.902646,295.509730,3.424414,...,0.0,0.0,0.000000,0.000000,0.0,1.0,0.0,0.0,0.0,0.0


In [12]:
scale_cols = find_continuous_columns(X_train)
# Feature-aware normalization (train stats reused for test)
X_train, X_mean, X_std = normalize(X_train, scale_cols=scale_cols)
X_test, _, _ = normalize(X_test, scale_cols=scale_cols)

# Convert to numpy for the model (training logic is unchanged)
X_train_np = X_train.values
X_test_np = X_test.values

In [13]:
X_train_np

array([[ 2.15389225,  2.30960997, -1.61167678, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.54913674,  2.04823576, -1.49751429, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.08281377,  2.49881236, -0.99342359, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.86949265, -0.84029084,  1.48467298, ...,  0.        ,
         0.        ,  0.        ],
       [-0.70650941, -0.76217185,  1.38701631, ...,  0.        ,
         0.        ,  0.        ],
       [-0.74207856, -0.77268645,  1.43636182, ...,  0.        ,
         0.        ,  0.        ]])

In [14]:
model = LinearModel(learning_rate=0.02, maxIter=10000)
model.fit(X_train_np, y_train_np, loss="rmse")
y_test_pred = model.predict(X_test_np)

In [15]:
y_test_pred

array([52585.28870829, 47006.50262459, 43514.28197521, 50565.63225336,
       52215.52603362, 52232.4257957 , 50756.41450815, 47883.04424673,
       44042.67578407, 43384.69704865, 50064.14884583, 52105.14580019,
       52900.34533601, 52569.81279398, 49650.99386705, 43339.50574933,
       40655.6303247 , 48622.9799646 , 50764.18204974, 51098.42854757,
       51700.12698046, 51794.25741128, 46905.93713864, 43996.97688544,
       48037.24480145, 47942.10605296, 50045.67913663, 52901.62580546,
       50881.01468931, 42951.47916306, 42903.53856339, 51478.94753166,
       52948.67523146, 50299.65198473, 51962.68889004, 51873.88473563,
       46935.53933176, 43949.92272557, 50777.86830732, 53106.32651085,
       53173.72290261, 53181.89838137, 51668.69358003, 45241.80227915,
       42644.02242651, 50870.17867137, 51708.50768023, 50269.28854682,
       50290.04892037, 49402.63199743, 44753.90715625, 40507.90014637,
       46116.08201441, 49269.83273691, 50400.74546882, 49537.46152558,
      

In [16]:
# save csv in ../Results/naive_linear_submission.csv
# submit_predictions(Data_test["Id"], y_test_pred, filename="all_params_linear_rmse_submission")


In [160]:
q = 0.8  # quantile for pinball loss
model = LinearModel(learning_rate=0.02, maxIter=17000, tau=q)
model.fit(X_train_np, y_train_np, loss="pinball", verbose=True, log_every=500)
y_test_pred = model.predict(X_test_np)
y_test_pred

[pinball] iter=0 loss=39289.7398 reg(l1)=0.0000 obj=39289.7398 frac(y_hat>=y)=0.000
[pinball] iter=500 loss=39276.6032 reg(l1)=0.0000 obj=39276.6032 frac(y_hat>=y)=0.000
[pinball] iter=1000 loss=39263.4667 reg(l1)=0.0000 obj=39263.4667 frac(y_hat>=y)=0.000
[pinball] iter=1500 loss=39250.3302 reg(l1)=0.0000 obj=39250.3302 frac(y_hat>=y)=0.000
[pinball] iter=2000 loss=39237.1937 reg(l1)=0.0000 obj=39237.1937 frac(y_hat>=y)=0.000
[pinball] iter=2500 loss=39224.0572 reg(l1)=0.0000 obj=39224.0572 frac(y_hat>=y)=0.000
[pinball] iter=3000 loss=39210.9207 reg(l1)=0.0000 obj=39210.9207 frac(y_hat>=y)=0.000
[pinball] iter=3500 loss=39197.7842 reg(l1)=0.0000 obj=39197.7842 frac(y_hat>=y)=0.000
[pinball] iter=4000 loss=39184.6477 reg(l1)=0.0000 obj=39184.6477 frac(y_hat>=y)=0.000
[pinball] iter=4500 loss=39171.5112 reg(l1)=0.0000 obj=39171.5112 frac(y_hat>=y)=0.000
[pinball] iter=5000 loss=39158.3747 reg(l1)=0.0000 obj=39158.3747 frac(y_hat>=y)=0.000
[pinball] iter=5500 loss=39145.2382 reg(l1)=0.0

array([473.551138  , 473.62950158, 473.62950158, 434.76116393,
       473.62950158, 473.62950158, 473.62950158, 473.551138  ,
       473.62950158, 473.62950158, 434.76116393, 473.62950158,
       473.62950158, 473.62950158, 473.551138  , 473.62950158,
       473.62950158, 434.76116393, 473.62950158, 473.62950158,
       473.62950158, 473.551138  , 473.62950158, 473.62950158,
       434.76116393, 473.62950158, 473.62950158, 473.62950158,
       473.551138  , 473.62950158, 473.62950158, 434.76116393,
       473.62950158, 473.62950158, 473.62950158, 473.551138  ,
       473.62950158, 473.62950158, 434.76116393, 473.62950158,
       473.62950158, 473.62950158, 473.551138  , 473.62950158,
       473.62950158, 434.76116393, 473.62950158, 473.62950158,
       473.62950158, 473.551138  , 871.48141746, 871.48141746,
       832.6130798 , 871.48141746, 871.48141746, 871.48141746,
       871.40305388, 871.48141746, 708.72025353, 678.15845578,
       725.33333333, 708.72025353, 708.72025353, 708.64

In [161]:
# Feature relevance (no retraining): rank coefficients by |value|

feature_names = list(X_train.columns)
coefs = np.asarray(model.weights).reshape(-1)
assert len(feature_names) == len(coefs), (len(feature_names), len(coefs))

# Binary / one-hot columns are exactly 0/1 after our feature-aware normalization
is_binary = []
for c in feature_names:
    vals = pd.unique(X_train[c].dropna())
    is_binary.append(len(vals) <= 2 and set(vals).issubset({0, 1}))

rank_df = pd.DataFrame({
    "feature": feature_names,
    "coef": coefs,
    "abs_coef": np.abs(coefs),
    "type": np.where(is_binary, "binary/one-hot", "continuous (normalized)")
}).sort_values("abs_coef", ascending=False)

rank_df.head(40)

Unnamed: 0,feature,coef,abs_coef,type
19,DLS,162.7612,162.7612,binary/one-hot
22,Holiday,115.5079,115.5079,binary/one-hot
25,Holiday_zone_c,94.66321,94.66321,binary/one-hot
23,Holiday_zone_a,94.11466,94.11466,binary/one-hot
24,Holiday_zone_b,93.56612,93.56612,binary/one-hot
20,Summer_break,45.52924,45.52924,binary/one-hot
37,WeekDays_5,38.86834,38.86834,binary/one-hot
35,WeekDays_3,38.86834,38.86834,binary/one-hot
34,WeekDays_2,38.86834,38.86834,binary/one-hot
33,WeekDays_1,38.86834,38.86834,binary/one-hot


In [162]:
submit_predictions(Data_test["Id"], y_test_pred, filename="all_params_linear_pinball_submission")

File all_params_linear_pinball_submission.csv already exists. Overwriting...
Submission file all_params_linear_pinball_submission.csv created.


# Parameters removed or added

In [140]:
# submit_predictions(Data_test["Id"], y_test_pred, filename="month_dropped_linear_pinball_submission")

In [141]:
# Automatic remove and show stats
def retrain_after_dropping(
    X_train,
    X_test,
    y_train_np,
    drop_features,
    q,
    learning_rate=0.02,
    maxIter=17000,
    verbose=True
):
    # Drop features
    X_train_red = X_train.drop(columns=drop_features)
    X_test_red = X_test.drop(columns=drop_features)

    # Convert to numpy
    X_train_np = X_train_red.values
    X_test_np = X_test_red.values

    # Train model
    model = LinearModel(
        learning_rate=learning_rate,
        maxIter=maxIter,
        tau=q
    )
    model.fit(
        X_train_np,
        y_train_np,
        loss="pinball",
        verbose=verbose,
        log_every=500
    )

    # Predictions (optional but useful)
    y_test_pred = model.predict(X_test_np)

    # Coefficient analysis
    feature_names = list(X_train_red.columns)
    coefs = np.asarray(model.weights).reshape(-1)

    is_binary = []
    for c in feature_names:
        vals = pd.unique(X_train_red[c].dropna())
        is_binary.append(len(vals) <= 2 and set(vals).issubset({0, 1}))

    rank_df = pd.DataFrame({
        "feature": feature_names,
        "coef": coefs,
        "abs_coef": np.abs(coefs),
        "type": np.where(is_binary, "binary/one-hot", "continuous (normalized)")
    }).sort_values("abs_coef", ascending=False)

    return model, rank_df, y_test_pred


In [142]:
model_no_wind7, rank_no_wind7, y_pred_no_wind7 = retrain_after_dropping(
    X_train, X_test, y_train_np,
    drop_features=["Wind_power.7"],
    q=q
)

print("\nTop coefficients (Wind_power.7):")
print(rank_no_wind7.head(40))


[pinball] iter=0 loss=39289.7398 reg(l1)=0.0000 obj=39289.7398 frac(y_hat>=y)=0.000
[pinball] iter=500 loss=39276.6032 reg(l1)=0.0000 obj=39276.6032 frac(y_hat>=y)=0.000
[pinball] iter=1000 loss=39263.4667 reg(l1)=0.0000 obj=39263.4667 frac(y_hat>=y)=0.000
[pinball] iter=1500 loss=39250.3302 reg(l1)=0.0000 obj=39250.3302 frac(y_hat>=y)=0.000
[pinball] iter=2000 loss=39237.1937 reg(l1)=0.0000 obj=39237.1937 frac(y_hat>=y)=0.000
[pinball] iter=2500 loss=39224.0572 reg(l1)=0.0000 obj=39224.0572 frac(y_hat>=y)=0.000
[pinball] iter=3000 loss=39210.9207 reg(l1)=0.0000 obj=39210.9207 frac(y_hat>=y)=0.000
[pinball] iter=3500 loss=39197.7842 reg(l1)=0.0000 obj=39197.7842 frac(y_hat>=y)=0.000
[pinball] iter=4000 loss=39184.6477 reg(l1)=0.0000 obj=39184.6477 frac(y_hat>=y)=0.000
[pinball] iter=4500 loss=39171.5112 reg(l1)=0.0000 obj=39171.5112 frac(y_hat>=y)=0.000
[pinball] iter=5000 loss=39158.3747 reg(l1)=0.0000 obj=39158.3747 frac(y_hat>=y)=0.000
[pinball] iter=5500 loss=39145.2382 reg(l1)=0.0

In [143]:
model_no_holyb, rank_no_holyb, y_pred_no_holyb = retrain_after_dropping(
    X_train, X_test, y_train_np,
    drop_features=["Holiday_zone_b", "Wind_power.7"],
    q=q
)

print("\nTop coefficients (Holiday_zone_b + Wind_power.7 removed):")
print(rank_no_holyb.head(40))


[pinball] iter=0 loss=39289.7398 reg(l1)=0.0000 obj=39289.7398 frac(y_hat>=y)=0.000
[pinball] iter=500 loss=39277.3606 reg(l1)=0.0000 obj=39277.3606 frac(y_hat>=y)=0.000
[pinball] iter=1000 loss=39264.9814 reg(l1)=0.0000 obj=39264.9814 frac(y_hat>=y)=0.000
[pinball] iter=1500 loss=39252.6022 reg(l1)=0.0000 obj=39252.6022 frac(y_hat>=y)=0.000
[pinball] iter=2000 loss=39240.2230 reg(l1)=0.0000 obj=39240.2230 frac(y_hat>=y)=0.000
[pinball] iter=2500 loss=39227.8438 reg(l1)=0.0000 obj=39227.8438 frac(y_hat>=y)=0.000
[pinball] iter=3000 loss=39215.4646 reg(l1)=0.0000 obj=39215.4646 frac(y_hat>=y)=0.000
[pinball] iter=3500 loss=39203.0855 reg(l1)=0.0000 obj=39203.0855 frac(y_hat>=y)=0.000
[pinball] iter=4000 loss=39190.7063 reg(l1)=0.0000 obj=39190.7063 frac(y_hat>=y)=0.000
[pinball] iter=4500 loss=39178.3271 reg(l1)=0.0000 obj=39178.3271 frac(y_hat>=y)=0.000
[pinball] iter=5000 loss=39165.9479 reg(l1)=0.0000 obj=39165.9479 frac(y_hat>=y)=0.000
[pinball] iter=5500 loss=39153.5687 reg(l1)=0.0

In [144]:
submit_predictions(Data_test["Id"], y_test_pred, filename="no_holyb_wind7_linear_pinball_submission")

File no_holyb_wind7_linear_pinball_submission.csv already exists. Overwriting...
Submission file no_holyb_wind7_linear_pinball_submission.csv created.


# DENORMALIZE AFTER PREDICTION