In [46]:
from sklearn.linear_model import Lasso
import numpy as np
from pytorch_lightning import seed_everything
import torch
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [32]:
#pip install pytorch-lightning 

In [33]:
np.random.seed(0)

In [34]:
df = pd.read_csv('data/example_data.csv', delimiter = ',')

In [35]:
# Define categorical and numerical columns
categorical_cols = ['product_code', 'unit_of_measure', 'contract_type', 'buyer', 'supplier']
numerical_cols = ['quantity', 'total_value']
target_col = 'unit_price'  

# Use log_unit_price
df['log_unit_price'] = np.log(df[target_col])

# Preprocessing of columns
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])
X = preprocessor.fit_transform(df)
y = df['log_unit_price'].values

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [None]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Train/val split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=0)

# X_train: 70% of original
# X_val: 10% of original
# X_test: 20% of original

Lasso Regression

In [39]:
# Calculate lambda_max for Lasso regression
y_centered = y_train - y_train.mean()

if hasattr(X_train, 'T'):  # if sparse matrix
    lam_max = np.abs(X_train.T * y_centered).max()
else:  # if dense matrix
    lam_max = np.abs(X_train.T @ y_centered).max()

In [40]:
# Create a series of 50 values for the regularization parameter 
lam_val = lam_max * np.exp(np.linspace(np.log(1), np.log(1e-4), 50))

In [42]:
# Initialize Lasso regression
lasso = Lasso(alpha=lam_max,        
              warm_start=True,      
              fit_intercept=True,
              max_iter=1000)

In [43]:
coefs = []
intercepts = []

for l in lam_val:
    lasso.alpha = l
    lasso.fit(X_train, y_train)
    coefs.append(lasso.coef_.copy())
    intercepts.append(lasso.intercept_)

coefs = np.array(coefs)  
intercepts = np.array(intercepts) 

# Evaluate on validation set to find best lambda
val_errors = []
for i, l in enumerate(lam_val):
    lasso.alpha = l
    lasso.fit(X_train, y_train)
    val_pred = lasso.predict(X_val)
    val_mae = np.mean(np.abs(val_pred - y_val))
    val_errors.append(val_mae)

InvalidParameterError: The 'alpha' parameter of Lasso must be a float in the range [0.0, inf). Got np.float64(nan) instead.

In [None]:
# Best lambda
best_idx = np.argmin(val_errors)
best_lambda = lam_val[best_idx]
print(f"Best lambda: {best_lambda:.6f}")
print(f"Best validation MAE: {val_errors[best_idx]:.3f}")

In [None]:
# Train model with best lambda
lasso_final = Lasso(alpha=best_lambda, fit_intercept=True)
lasso_final.fit(X_train, y_train)

Evaluation

In [44]:
test_pred = lasso_final.predict(X_test)
test_mae = np.mean(np.abs(test_pred - y_test))
test_rmse = np.sqrt(np.mean((test_pred - y_test)**2))
test_r2 = 1 - np.sum((y_test - test_pred)**2) / np.sum((y_test - y_test.mean())**2)

print(f"Lasso Test MAE: {test_mae:.3f}")
print(f"Lasso Test RMSE: {test_rmse:.3f}")
print(f"Lasso Test R²: {test_r2:.3f}")

NameError: name 'lasso_final' is not defined

Find best alpha via Cross-validation

In [48]:
def lasso_cv_path(X, y, lam_val, k_folds=5):
    """
    Perform k-fold CV for each alpha in lam_val and return mean CV MAE.
    """
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=0)
    mae_vals, rmse_vals, r2_vals = [], [], []

    for alpha in lam_val:
        maes, rmses, r2s = [], [], []
        for train_idx, val_idx in kf.split(X):
            X_tr, X_val = X[train_idx], X[val_idx]
            y_tr, y_val = y[train_idx], y[val_idx]

            model = Lasso(alpha=alpha, fit_intercept=True, max_iter=10000)
            model.fit(X_tr, y_tr)
            y_pred = model.predict(X_val)

            maes.append(mean_absolute_error(y_val, y_pred))
            rmses.append(np.sqrt(mean_squared_error(y_val, y_pred)))
            r2s.append(r2_score(y_val, y_pred))
        
        mae_vals.append(np.mean(maes))
        rmse_vals.append(np.mean(rmses))
        r2_vals.append(np.mean(r2s))

    return np.array(mae_vals), np.array(rmse_vals), np.array(r2_vals)

In [None]:
mae_vals, rmse_vals, r2_vals = lasso_cv_metrics(X_train, y_train, lam_val, k_folds=5)

# Find best alpha
best_idx    = np.argmin(mae_vals)
best_alpha  = lam_val[best_idx]
best_mae    = cv_mae[best_idx]
print(f"Best alpha: {best_alpha:.6e}, CV MAE: {best_mae:.3f}")

# Retrain with this
X_full = np.vstack([X_train, X_val])
y_full = np.concatenate([y_train, y_val])

lasso_final = Lasso(alpha=best_alpha, fit_intercept=True, max_iter=10000)
lasso_final.fit(X_full, y_full)

# Evaluate on test set
y_test_pred = lasso_final.predict(X_test)
test_mae  = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(np.mean((y_test - y_test_pred)**2))
test_r2   = 1 - np.sum((y_test - y_test_pred)**2) / np.sum((y_test - y_test.mean())**2)

print(f"Test MAE:  {test_mae:.3f}")
print(f"Test RMSE: {test_rmse:.3f}")
print(f"Test R²:   {test_r2:.3f}")

Crossvalidation error