In [1]:
import sys
from pathlib import Path

# Repository root: the parent of the directory the notebook kernel runs in.
PROJECT_ROOT = Path.cwd().parent

# Put the root first on the import path so modules located there can be imported.
# The guard keeps the cell idempotent: re-running it does not stack duplicate
# copies of the same entry at the front of sys.path.
project_root_str = str(PROJECT_ROOT)
if sys.path[:1] != [project_root_str]:
    sys.path.insert(0, project_root_str)

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
from config import DATA_DIR

# Path of the processed feature table, assembled from the project root,
# the configured data directory and the fixed "processed" sub-folder.
ml_file = PROJECT_ROOT.joinpath(DATA_DIR, "processed", "features_for_ml.csv")

In [5]:
# Load the feature table; the "Date" column is parsed into datetimes at read time.
date_columns = ["Date"]
ml_df = pd.read_csv(ml_file, parse_dates=date_columns)

In [6]:
# Summary statistics of the loaded frame (count, mean, min, quartiles, max, std).
summary_stats = ml_df.describe()
summary_stats

Unnamed: 0,Date,rolling_14d_adjusted,price_available,temperature_2m_max,temperature_2m_min,precipitation_sum,rain_sum,wind_speed_10m_max,et0_fao_evapotranspiration,temperature_2m_max_30d_lag,...,et0_fao_evapotranspiration_120d_lag,et0_fao_evapotranspiration_180d_lag,heatwave_flag,coldwave_flag,heavy_rain_flag,temp_missing,USD_TL,trend,trends_missing,is_trend_zero
count,8769,8769.0,8769.0,8769.0,8769.0,8769.0,8769.0,8769.0,8769.0,8769.0,...,8769.0,8769.0,8769.0,8769.0,8769.0,8769.0,8769.0,4776.0,4776.0,4776.0
mean,2012-08-28 23:45:23.092712960,2.829628,1.0,17.114084,10.531144,3.428692,3.099704,13.969997,2.386388,17.280467,...,2.483747,2.440992,0.000114,0.057247,0.000798,0.0,6.79419,66.270472,0.0,0.175042
min,1999-12-26 00:00:00,0.730755,1.0,-2.8,-10.8,0.0,0.0,4.3,0.18,3.85,...,1.011083,1.246722,0.0,0.0,0.0,0.0,0.535502,0.0,0.0,0.0
25%,2005-12-26 00:00:00,2.19207,1.0,12.0,5.4,0.0,0.0,10.7,1.27,12.293333,...,1.618333,1.742167,0.0,0.0,0.0,0.0,1.424333,27.807069,0.0,0.0
50%,2012-04-06 00:00:00,2.764137,1.0,17.6,10.4,0.3,0.3,13.0,2.16,16.973333,...,2.464417,2.4565,0.0,0.0,0.0,0.0,1.791467,49.184661,0.0,0.0
75%,2019-11-29 00:00:00,3.36321,1.0,22.8,16.3,4.0,3.5,16.2,3.42,22.84,...,3.339083,3.109167,0.0,0.0,0.0,0.0,5.953,83.319809,0.0,0.0
max,2025-12-31 00:00:00,7.769791,1.0,35.8,25.5,78.7,78.7,53.7,6.74,29.78,...,4.290583,3.861833,1.0,1.0,1.0,0.0,42.9395,1087.689728,0.0,1.0
std,,1.207779,0.0,6.963901,6.844228,6.422489,6.017589,4.789735,1.336889,6.138736,...,0.893603,0.705018,0.010679,0.232327,0.028244,0.0,10.445633,68.40189,0.0,0.380043


In [7]:
# Column labels of the loaded frame, shown for reference when picking features.
column_labels = ml_df.columns
column_labels

Index(['Date', 'rolling_14d_adjusted', 'price_available', 'temperature_2m_max',
       'temperature_2m_min', 'precipitation_sum', 'rain_sum',
       'wind_speed_10m_max', 'et0_fao_evapotranspiration',
       'temperature_2m_max_30d_lag', 'temperature_2m_max_60d_lag',
       'temperature_2m_max_90d_lag', 'temperature_2m_max_120d_lag',
       'temperature_2m_max_180d_lag', 'temperature_2m_min_30d_lag',
       'temperature_2m_min_60d_lag', 'temperature_2m_min_90d_lag',
       'temperature_2m_min_120d_lag', 'temperature_2m_min_180d_lag',
       'precipitation_sum_30d_lag', 'precipitation_sum_60d_lag',
       'precipitation_sum_90d_lag', 'precipitation_sum_120d_lag',
       'precipitation_sum_180d_lag', 'rain_sum_30d_lag', 'rain_sum_60d_lag',
       'rain_sum_90d_lag', 'rain_sum_120d_lag', 'rain_sum_180d_lag',
       'wind_speed_10m_max_30d_lag', 'wind_speed_10m_max_60d_lag',
       'wind_speed_10m_max_90d_lag', 'wind_speed_10m_max_120d_lag',
       'wind_speed_10m_max_180d_lag', 'et0_fao_e

In [8]:
# Name of the regression target column.
target_col = 'rolling_14d_adjusted'

# Columns excluded from the feature matrix: the date, the target itself and
# the three trend-related columns.
# NOTE(review): describe() reports 4776 non-null values for the trend columns
# against 8769 rows overall - presumably why they are excluded; confirm.
trend_cols = ['trend', 'trends_missing', 'is_trend_zero']
drop_cols = ['Date', target_col, *trend_cols]

In [9]:
# Target vector: the single column named by target_col.
y = ml_df[target_col]

# Feature matrix: every remaining column once the drop_cols labels are removed.
X = ml_df.drop(labels=drop_cols, axis="columns")

In [10]:
# Chronological hold-out split: rows dated strictly before SPLIT_DATE are used
# for training, rows on or after it for testing.
SPLIT_DATE = pd.Timestamp('2019-01-01')

# A missing date compares False against the cutoff and would silently land in
# the test partition, so refuse to split if any date is missing.
assert ml_df['Date'].notna().all(), "ml_df['Date'] contains missing values"

train_mask = ml_df['Date'] < SPLIT_DATE

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[~train_mask], y.loc[~train_mask]

# Sanity checks: both partitions are populated and together cover every row.
assert len(X_train) > 0 and len(X_test) > 0, "train/test split produced an empty partition"
assert len(X_train) + len(X_test) == len(X), "train/test partitions do not cover all rows"

In [11]:
# Show the shapes of the four partitions in the order: X_train, X_test, y_train, y_test.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(6318, 46) (2451, 46) (6318,) (2451,)


In [15]:
# Two-step model: standardise the features, then fit a ridge regression.
# The step names are kept as 'scaler' and 'ridge' because later cells look the
# regressor up through named_steps['ridge'].
scaler_step = ('scaler', StandardScaler())
ridge_step = ('ridge', Ridge(alpha=1.0))
ridge_pipe = Pipeline(steps=[scaler_step, ridge_step])

In [16]:
# Fit both pipeline steps on the training partition only; the fitted pipeline
# is the cell's displayed value.
ridge_pipe.fit(X=X_train, y=y_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('scaler', ...), ('ridge', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True

0,1,2
,"alpha  alpha: {float, ndarray of shape (n_targets,)}, default=1.0 Constant that multiplies the L2 term, controlling regularization strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. When `alpha = 0`, the objective is equivalent to ordinary least squares, solved by the :class:`LinearRegression` object. For numerical reasons, using `alpha = 0` with the `Ridge` object is not advised. Instead, you should use the :class:`LinearRegression` object. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number.",1.0
,"fit_intercept  fit_intercept: bool, default=True Whether to fit the intercept for this model. If set to false, no intercept will be used in calculations (i.e. ``X`` and ``y`` are expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"max_iter  max_iter: int, default=None Maximum number of iterations for conjugate gradient solver. For 'sparse_cg' and 'lsqr' solvers, the default value is determined by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. For 'lbfgs' solver, the default value is 15000.",
,"tol  tol: float, default=1e-4 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for each solver: - 'svd': `tol` has no impact. - 'cholesky': `tol` has no impact. - 'sparse_cg': norm of residuals smaller than `tol`. - 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr,  which control the norm of the residual vector in terms of the norms of  matrix and coefficients. - 'sag' and 'saga': relative change of coef smaller than `tol`. - 'lbfgs': maximum of the absolute (projected) gradient=max|residuals|  smaller than `tol`. .. versionchanged:: 1.2  Default value changed from 1e-3 to 1e-4 for consistency with other linear  models.",0.0001
,"solver  solver: {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto' Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. - 'svd' uses a Singular Value Decomposition of X to compute the Ridge  coefficients. It is the most stable solver, in particular more stable  for singular matrices than 'cholesky' at the cost of being slower. - 'cholesky' uses the standard :func:`scipy.linalg.solve` function to  obtain a closed-form solution. - 'sparse_cg' uses the conjugate gradient solver as found in  :func:`scipy.sparse.linalg.cg`. As an iterative algorithm, this solver is  more appropriate than 'cholesky' for large-scale data  (possibility to set `tol` and `max_iter`). - 'lsqr' uses the dedicated regularized least-squares routine  :func:`scipy.sparse.linalg.lsqr`. It is the fastest and uses an iterative  procedure. - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses  its improved, unbiased version named SAGA. Both methods also use an  iterative procedure, and are often faster than other solvers when  both n_samples and n_features are large. Note that 'sag' and  'saga' fast convergence is only guaranteed on features with  approximately the same scale. You can preprocess the data with a  scaler from :mod:`sklearn.preprocessing`. - 'lbfgs' uses L-BFGS-B algorithm implemented in  :func:`scipy.optimize.minimize`. It can be used only when `positive`  is True. All solvers except 'svd' support both dense and sparse data. However, only 'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when `fit_intercept` is True. .. versionadded:: 0.17  Stochastic Average Gradient descent solver. .. versionadded:: 0.19  SAGA solver.",'auto'
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. Only 'lbfgs' solver is supported in this case.",False
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag' or 'saga' to shuffle the data. See :term:`Glossary ` for details. .. versionadded:: 0.17  `random_state` to support Stochastic Average Gradient.",


In [17]:
# Predictions of the fitted pipeline for the held-out test partition.
y_pred_ridge = ridge_pipe.predict(X=X_test)

In [18]:
# Hold-out error metrics for the ridge pipeline.
# NOTE(review): the recorded run reports R² of about -84 and RMSE of about 9.2,
# while describe() shows the target ranging roughly 0.73-7.77 over the whole
# data set. That is far worse than a constant predictor. Possibly the linear
# model extrapolates on features whose test-period values lie well outside the
# training range - TODO confirm before relying on this model.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_ridge))
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_ridge)
r2 = r2_score(y_true=y_test, y_pred=y_pred_ridge)

print("Ridge Performance")
for label, value in (("RMSE:", rmse), ("MAE:", mae), ("R²:", r2)):
    print(label, value)

Ridge Performance
RMSE: 9.210608078945292
MAE: 7.346587296557428
R²: -84.319683792233


In [19]:
# Ridge coefficients labelled by feature name and sorted ascending.
# They are fitted on the scaler's output, i.e. on standardised features.
ridge_model = ridge_pipe.named_steps['ridge']
coef = pd.Series(ridge_model.coef_, index=X.columns).sort_values()

coef

temperature_2m_min_180d_lag           -3.717747
temperature_2m_max_120d_lag           -1.349380
temperature_2m_max_60d_lag            -1.332526
temperature_2m_min_90d_lag            -1.296169
et0_fao_evapotranspiration_90d_lag    -0.548508
precipitation_sum_180d_lag            -0.519978
et0_fao_evapotranspiration_180d_lag   -0.404907
precipitation_sum_60d_lag             -0.374310
precipitation_sum_120d_lag            -0.279376
precipitation_sum_30d_lag             -0.244982
et0_fao_evapotranspiration_30d_lag    -0.184100
temperature_2m_max_30d_lag            -0.183395
temperature_2m_min                    -0.131322
season_development                    -0.065824
coldwave_flag                         -0.065150
precipitation_sum                     -0.038263
heavy_rain_flag                       -0.036389
season_harvest                        -0.024534
temperature_2m_max                    -0.021766
et0_fao_evapotranspiration            -0.011481
heatwave_flag                          0