In [1]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor


In [7]:
# Project root. Defaults to the Colab location this notebook was written for;
# set the ECOPACKAI_BASE environment variable to run it from another checkout.
# `or` (rather than a .get default) also covers the variable being set but empty.
BASE = os.environ.get("ECOPACKAI_BASE") or "/content/ecopackai"

# Inputs: raw feature table, raw target table, and the split/CV metadata.
X_PATH = f"{BASE}/X_raw.csv"
Y_PATH = f"{BASE}/y_raw.csv"
META_PATH = f"{BASE}/split_metadata.json"

# Outputs: metric tables and generated markdown reports.
METRICS_DIR = f"{BASE}/ml/metrics"
DOCS_DIR = f"{BASE}/docs"

# Create the output folders up front so later cells can write without checks;
# exist_ok keeps this cell safe to re-run.
os.makedirs(METRICS_DIR, exist_ok=True)
os.makedirs(DOCS_DIR, exist_ok=True)


In [8]:
# Load the raw feature table (X) and the raw target table (y).
X = pd.read_csv(X_PATH)
y = pd.read_csv(Y_PATH)

# The two tables come from separate files and are only ever combined by row
# position, so a length mismatch means they are out of sync. Fail here with a
# clear message instead of deep inside scikit-learn.
assert len(X) == len(y), (
    f"X has {len(X)} rows but y has {len(y)}; the feature and target files are out of sync"
)

# JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
with open(META_PATH, encoding="utf-8") as f:
    split_meta = json.load(f)

# Preview both tables (displayed as a tuple of the two heads).
X.head(), y.head()


(     Material Type  Biodegradation Time (days) Recyclability Category
 0        Cardboard                       188.0                   High
 1  Paper/Bio-Based                        65.0                   High
 2            Steel                    180208.0                   High
 3  Paper/Bio-Based                        65.0                   High
 4  Paper/Bio-Based                       122.0                   High,
   recommended_material  sustainability_score cost_efficiency_category
 0            Cardboard                 78.48                High-cost
 1      Paper/Bio-Based                 85.23                High-cost
 2                Steel                 13.00                 Low-cost
 3      Paper/Bio-Based                 85.40                High-cost
 4      Paper/Bio-Based                 83.60                High-cost)

In [9]:
# Names of the y columns used as regression targets.
# NOTE(review): both constants currently name the same column, so "cost" and
# "CO2" are not yet distinct targets. Point CO2_TARGET at a dedicated column
# once a separate CO2 target exists.
COST_TARGET = CO2_TARGET = "sustainability_score"


In [10]:
# Baseline regressors, keyed by the label written to the metrics table.
# Both use scikit-learn defaults; only the tree's seed is pinned so repeated
# runs build the same tree.
models = dict(
    Linear_Regression=LinearRegression(),
    Decision_Tree=DecisionTreeRegressor(random_state=42),
)


In [11]:
# Fold count and shuffle seed are read from split_metadata.json rather than
# hard-coded here. KFold is imported in the notebook's top import cell.
N_FOLDS = split_meta["cross_validation"]["n_folds"]
RANDOM_SEED = split_meta["cross_validation"]["random_seed"]

# Shuffled K-fold splitter; the fixed seed makes every use of `cv` produce the
# same folds.
cv = KFold(
    n_splits=N_FOLDS,
    shuffle=True,
    random_state=RANDOM_SEED,
)


In [14]:
# Cross-validated baseline metrics for every (model, target) pair.

# One-hot encode the categorical features; drop_first removes one level per
# feature so the dummy columns are not perfectly collinear.
X_onehot = pd.get_dummies(
    X,
    columns=['Material Type', 'Recyclability Category'],
    drop_first=True,
)

# Globally mean-imputed frame, kept under its original name in case later cells
# read it. It is deliberately NOT used for cross-validation below: its column
# means are computed from all rows, including each fold's held-out rows.
X_encoded = X_onehot.fillna(X_onehot.mean())

# Regression targets come from the config constants. They may name the same
# column, so de-duplicate (order preserved) to avoid scoring a target twice.
regression_targets = list(dict.fromkeys([COST_TARGET, CO2_TARGET]))

# scikit-learn scorers are "higher is better", so error metrics are negated.
scoring = {
    "mae": "neg_mean_absolute_error",
    "mse": "neg_mean_squared_error",
    "r2": "r2",
}

results = []

for model_name, model in models.items():
    # Imputing inside the pipeline means each fold learns its column means from
    # its own training rows only, so nothing leaks from the held-out rows.
    estimator = make_pipeline(SimpleImputer(strategy="mean"), model)

    for target in regression_targets:
        y_target = y[target]

        # A single pass over the folds yields all three metrics, instead of
        # refitting every fold once per metric.
        fold_scores = cross_validate(
            estimator, X_onehot, y_target,
            scoring=scoring,
            cv=cv,
        )

        mae = -fold_scores["test_mae"].mean()
        # RMSE is the root of the fold-averaged MSE (not the mean of per-fold RMSEs).
        rmse = np.sqrt(-fold_scores["test_mse"].mean())
        r2 = fold_scores["test_r2"].mean()

        results.append({
            "model": model_name,
            "target": target,
            "MAE": round(mae, 3),
            "RMSE": round(rmse, 3),
            "R2": round(r2, 3)
        })

In [15]:
# Destination of the baseline metrics table.
metrics_path = f"{METRICS_DIR}/baseline_metrics.csv"

# One row per (model, target) record collected during cross-validation.
metrics_df = pd.DataFrame(results)

# Persist without the positional index so the CSV holds only the metric columns.
metrics_df.to_csv(metrics_path, index=False)

# Show the table as the cell output.
metrics_df


Unnamed: 0,model,target,MAE,RMSE,R2
0,Linear_Regression,sustainability_score,1.083,2.638,0.984
1,Decision_Tree,sustainability_score,0.903,2.626,0.984


In [16]:
# Static markdown describing the baseline models; written to the docs folder.
# NOTE(review): the "Purpose" section mentions cost and CO₂ impact, while both
# target constants currently name sustainability_score. Confirm the wording.
summary_md = """
# Baseline Model Summary

## Models Used
1. Linear Regression
   - Simple, interpretable baseline
   - Assumes linear relationships

2. Decision Tree Regressor
   - Captures non-linear interactions
   - No hyperparameter tuning applied

## Purpose
These models establish baseline performance
for cost and CO₂ impact prediction.
"""

# The text contains a non-ASCII character (the subscript two in "CO₂"), so the
# encoding is pinned to UTF-8. With the locale default this write can raise
# UnicodeEncodeError on platforms whose default codec lacks that character.
with open(f"{DOCS_DIR}/baseline_model_summary.md", "w", encoding="utf-8") as f:
    f.write(summary_md)


In [17]:
# Markdown evaluation report; the fold count and seed are interpolated from the
# cross-validation settings so the report always matches the run.
eval_md = f"""
# Baseline Evaluation Report

## Metrics Used
- Mean Absolute Error (MAE)
- Root Mean Squared Error (RMSE)
- R² Score

## Cross-Validation
- {N_FOLDS}-Fold K-Fold Cross-Validation
- Fixed random seed: {RANDOM_SEED}

## Observations
- Linear Regression provides a simple reference
- Decision Tree captures non-linearity but may overfit
- Results serve as benchmarks for future models
"""

# The text contains a non-ASCII character (the superscript two in "R²"), so the
# encoding is pinned to UTF-8. The locale default would otherwise decide the
# bytes on disk, and the file could render as mojibake when read as UTF-8.
with open(f"{DOCS_DIR}/baseline_evaluation_report.md", "w", encoding="utf-8") as f:
    f.write(eval_md)
