In [1]:
import os
from datetime import datetime


In [2]:
# Root of the ML workspace; every artifact directory below is derived from it.
BASE = "/content/ecopackai/ml"

EXPERIMENTS_DIR = f"{BASE}/experiments"
METADATA_DIR = f"{EXPERIMENTS_DIR}/metadata"
TRAINING_DIR = f"{BASE}/training"
MODELS_DIR = f"{BASE}/models"

# Create the full directory layout up front; exist_ok keeps this cell re-runnable.
for _artifact_dir in (EXPERIMENTS_DIR, METADATA_DIR, TRAINING_DIR, MODELS_DIR):
    os.makedirs(_artifact_dir, exist_ok=True)

# Echo the resolved paths as the cell output.
EXPERIMENTS_DIR, METADATA_DIR, TRAINING_DIR, MODELS_DIR


('/content/ecopackai/ml/experiments',
 '/content/ecopackai/ml/experiments/metadata',
 '/content/ecopackai/ml/training',
 '/content/ecopackai/ml/models')

In [4]:
def generate_experiment_id(model, target, dataset_version="v1"):
    """Build an experiment identifier from its parts and the current time.

    The result has the form ``<model>__<target>__<dataset_version>__<stamp>``
    where ``<stamp>`` is the local wall-clock time as ``YYYYMMDD_HHMMSS``.

    Args:
        model: Model name placed first in the identifier.
        target: Name of the target variable.
        dataset_version: Dataset version tag; defaults to ``"v1"``.

    Returns:
        str: The four parts joined by double underscores.
    """
    # Second resolution: two calls with the same arguments inside one second
    # produce the same identifier.
    stamp = format(datetime.now(), "%Y%m%d_%H%M%S")
    return "{}__{}__{}__{}".format(model, target, dataset_version, stamp)


In [5]:
import json

def create_metadata(exp_id, model_name, target, dataset_version,
                    feature_version, params, metrics):
    """Assemble the metadata record describing one experiment run.

    Args:
        exp_id: Experiment identifier stored under ``"experiment_id"``.
        model_name: Stored under ``"model_name"``.
        target: Stored under ``"target_variable"``.
        dataset_version: Stored under ``"dataset_version"``.
        feature_version: Stored under ``"feature_set_version"``.
        params: Model hyperparameters, stored as-is under ``"model_parameters"``.
        metrics: Evaluation results, stored as-is under ``"evaluation_metrics"``.

    Returns:
        dict: The record, including a ``"timestamp"`` taken from
        ``datetime.now().isoformat()`` at call time.
    """
    # Keys are inserted in the same order the record has always used, so a
    # JSON dump of the result keeps its field order.
    record = {"experiment_id": exp_id}
    record["timestamp"] = datetime.now().isoformat()
    record["dataset_version"] = dataset_version
    record["feature_set_version"] = feature_version
    record["model_name"] = model_name
    record["model_parameters"] = params
    record["target_variable"] = target
    record["evaluation_metrics"] = metrics
    return record


In [9]:
# Markdown file that train_and_log appends one line to for every saved model.
VERSION_LOG = "/content/ecopackai/docs/model_version_history.md"


In [12]:
# Make sure the folder holding the version log exists.
version_log_dir = os.path.dirname(VERSION_LOG)
os.makedirs(version_log_dir, exist_ok=True)

# Write the heading only when the log is first created, so re-running this
# cell never truncates the existing history.
log_already_exists = os.path.exists(VERSION_LOG)
if not log_already_exists:
    with open(VERSION_LOG, "w") as log_file:
        log_file.write("# Model Version History\n\n")

In [13]:
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np


In [14]:
def _json_default(value):
    """Fallback encoder for values ``json`` cannot serialise natively.

    NumPy scalars and arrays are unwrapped to their builtin equivalents;
    anything else (nested estimators, callables, ...) is stored as ``str(value)``.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


def train_and_log(model, model_name, X_train, y_train, X_test, y_test,
                  target, dataset_version="v1", feature_version="v1",
                  model_version=1):
    """Fit ``model``, score it on the test split, and persist the run.

    Side effects, in order:
      1. the fitted model is dumped with joblib to
         ``MODELS_DIR/<model_name>_<target>_v<model_version>.pkl``;
      2. a metadata JSON file named after the experiment id is written to
         ``METADATA_DIR``;
      3. one line is appended to ``VERSION_LOG``.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``get_params``.
        model_name: Name used in the experiment id, file name and metadata.
        X_train, y_train: Training features and targets passed to ``fit``.
        X_test, y_test: Held-out features and targets used for the metrics.
        target: Name of the target variable (used in names and metadata).
        dataset_version: Dataset version tag recorded with the run.
        feature_version: Feature-set version tag recorded with the run.
        model_version: Integer suffix of the model file name.

    Returns:
        tuple: ``(model_path, meta_path, metrics)`` where ``metrics`` is a dict
        with ``"MAE"``, ``"RMSE"`` and ``"R2"`` rounded to three decimals.
    """
    # Train
    model.fit(X_train, y_train)

    # Predict
    preds = model.predict(X_test)

    # Metrics
    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    # Cast to builtin floats: np.sqrt returns a NumPy scalar, whose repr
    # ("np.float64(0.0)") would otherwise end up in the version log and in
    # the returned dict.
    metrics = {
        "MAE": round(float(mae), 3),
        "RMSE": round(float(rmse), 3),
        "R2": round(float(r2), 3)
    }

    # Experiment ID
    exp_id = generate_experiment_id(model_name, target, dataset_version)

    # Save model. The file name does not include the experiment id, so a
    # later run with the same model_name/target/model_version writes to the
    # same path.
    model_filename = f"{model_name}_{target}_v{model_version}.pkl"
    model_path = f"{MODELS_DIR}/{model_filename}"
    joblib.dump(model, model_path)

    # Save metadata
    metadata = create_metadata(
        exp_id, model_name, target, dataset_version,
        feature_version, model.get_params(), metrics
    )

    meta_path = f"{METADATA_DIR}/{exp_id}.json"
    with open(meta_path, "w") as f:
        # get_params() may hold values json cannot encode (NumPy ints, nested
        # estimators, callables); the fallback keeps the run from failing
        # after the model artifact has already been written.
        json.dump(metadata, f, indent=2, default=_json_default)

    # Update version history
    with open(VERSION_LOG, "a") as f:
        f.write(f"- {model_filename} | target={target} | metrics={metrics}\n")

    return model_path, meta_path, metrics


In [15]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [18]:
# Load data. The raw CSVs sit in the models directory, so build the paths
# from MODELS_DIR instead of repeating the absolute location.
X = pd.read_csv(f"{MODELS_DIR}/X_raw.csv")
y = pd.read_csv(f"{MODELS_DIR}/y_raw.csv")

# Use the first column of y as the target (adjust if needed). In the recorded
# run this column is "recommended_material".
target_col = y.columns[0]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y[target_col], test_size=0.2, random_state=42
)


In [21]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Identify numerical and categorical columns in X.
# Splitting on "number" / not-"number" assigns every column to exactly one
# transformer. Matching only int64/float64/object would leave other dtypes
# (int32, float32, bool, category, ...) out of both lists, and
# ColumnTransformer drops unlisted columns by default.
numerical_features_X = X_train.select_dtypes(include="number").columns
categorical_features_X = X_train.select_dtypes(exclude="number").columns

# Create a pipeline for numerical features (imputation)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Create a column transformer for one-hot encoding categorical features and imputing numerical features
preprocessor_X = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features_X),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_X)
    ]
)

# Apply preprocessing to X_train and X_test (fit on the training split only)
X_train_processed = preprocessor_X.fit_transform(X_train)
X_test_processed = preprocessor_X.transform(X_test)

# For y_train and y_test (recommended_material is categorical string)
# LinearRegression is not typically suitable for categorical targets.
# However, to make the code runnable, we will LabelEncode y.
# If the goal is classification, a classification model should be used instead.
# NOTE(review): the integer codes impose an arbitrary ordering on the classes,
# so MAE/RMSE/R2 on them are not meaningful quality measures.
label_encoder_y = LabelEncoder()
# Learn the label vocabulary from both splits: fitting on y_train alone makes
# transform(y_test) raise ValueError when a class appears only in the test split.
label_encoder_y.fit(pd.concat([y_train, y_test]))
y_train_processed = label_encoder_y.transform(y_train)
y_test_processed = label_encoder_y.transform(y_test)

lr = LinearRegression()

# NOTE(review): the recorded run reports MAE 0.0 / R2 1.0. A perfect score
# usually points to target leakage in X or a single-class target; confirm
# against the raw CSVs before trusting this model.
model_path, meta_path, metrics = train_and_log(
    model=lr,
    model_name="LinearRegression",
    X_train=X_train_processed,
    y_train=y_train_processed,
    X_test=X_test_processed,
    y_test=y_test_processed,
    target=target_col,
    model_version=1
)

model_path, meta_path, metrics

('/content/ecopackai/ml/models/LinearRegression_recommended_material_v1.pkl',
 '/content/ecopackai/ml/experiments/metadata/LinearRegression__recommended_material__v1__20251223_102943.json',
 {'MAE': 0.0, 'RMSE': np.float64(0.0), 'R2': 1.0})

In [22]:
# List the metadata directory, where train_and_log writes one JSON file per run.
# os.listdir returns entries in arbitrary order.
os.listdir(METADATA_DIR)


['LinearRegression__recommended_material__v1__20251223_102943.json']

In [23]:
# List the models directory: the saved .pkl artifacts plus, in the recorded
# run, the raw X/y CSVs that are loaded from the same folder.
os.listdir(MODELS_DIR)


['LinearRegression_recommended_material_v1.pkl', 'y_raw.csv', 'X_raw.csv']

In [24]:
# Text of the workflow description that is saved next to the training code.
framework_doc = """
# Training & Experiment Framework

All experiments follow a standardized workflow:
1. Load dataset & features
2. Train model
3. Evaluate MAE, RMSE, R2
4. Save model artifact
5. Log metadata per run
6. Update model version history

This ensures reproducibility and comparability.
"""

# Write (or overwrite) the workflow document inside the training directory.
workflow_doc_path = f"{TRAINING_DIR}/training_workflow.md"
with open(workflow_doc_path, "w") as workflow_file:
    workflow_file.write(framework_doc)
