In [1]:
import json
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)


In [15]:
# Root of the project workspace; every path below is derived from it.
BASE = "/content/ecopackai"

# Input artifacts: raw X / y CSV files, the saved preprocessing pipeline,
# and the JSON file holding the train/test and cross-validation settings.
X_PATH = f"{BASE}/X_raw.csv"
Y_PATH = f"{BASE}/y_raw.csv"
PIPELINE_PATH = f"{BASE}/ml/models/preprocessing_pipeline.pkl"
META_PATH = f"{BASE}/ml/split_metadata.json"

# Output locations for the trained model, the metrics table and the summary.
MODELS_DIR = f"{BASE}/ml/models"
METRICS_DIR = f"{BASE}/ml/metrics"
DOCS_DIR = f"{BASE}/docs"

# Create the output directories up front; exist_ok makes re-running the cell safe.
for output_dir in (MODELS_DIR, METRICS_DIR, DOCS_DIR):
    os.makedirs(output_dir, exist_ok=True)


In [8]:
# Read the raw feature and target tables from their CSV files.
X_raw = pd.read_csv(X_PATH)
y = pd.read_csv(Y_PATH)

# The split metadata JSON is read later for the test ratio, the random seed
# and the number of cross-validation folds.
with open(META_PATH) as meta_file:
    split_meta = json.load(meta_file)

# Show both shapes as the cell output.
(X_raw.shape, y.shape)


((404, 3), (404, 3))

In [9]:
# Name of the column in `y` that this notebook models.
# TODO: replace with the cost target column if it is different.
# NOTE(review): the notebook's artifacts are named "cost", yet the selected
# column is "sustainability_score" - confirm this is the intended target.
COST_TARGET = "sustainability_score"

# Pull the single target column out of the target table.
y_cost = y.loc[:, COST_TARGET]


In [10]:
# Collect feature columns whose name contains "cost" (case-insensitive);
# these are treated as leakage candidates and dropped in the next cell.
leakage_cols = list(filter(lambda name: "cost" in name.lower(), X_raw.columns))
leakage_cols


[]

In [11]:
# Remove the flagged columns; errors="ignore" keeps this a no-op when the
# list is empty or a listed column is already gone (e.g. on a cell re-run).
X_raw = X_raw.drop(leakage_cols, axis="columns", errors="ignore")


In [16]:
# Load the previously saved preprocessing pipeline from disk.
# NOTE(review): joblib.load unpickles the file, so PIPELINE_PATH must point
# to a trusted artifact.
preprocessor = joblib.load(PIPELINE_PATH)

# Apply the loaded pipeline to the raw features (transform only; nothing is
# re-fitted in this notebook).
X_processed = preprocessor.transform(X_raw)
# Displayed as the cell output to confirm the processed row/column counts.
X_processed.shape


(404, 7)

In [17]:
# Re-create the train/test partition from the shared split metadata so this
# notebook uses the test ratio and seed recorded there.
# (`train_test_split` is imported in the notebook's top import cell.)
TEST_SIZE = split_meta["train_test_split"]["test_ratio"]
RANDOM_SEED = split_meta["train_test_split"]["random_seed"]

X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y_cost,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED
)

# Show the resulting partition sizes as the cell output.
X_train.shape, X_test.shape


((323, 7), (81, 7))

In [18]:
# Hyperparameters for the forest, gathered in one place so they are easy to
# read and tweak. random_state reuses the split seed; n_jobs=-1 asks
# scikit-learn to use all available cores.
rf_params = dict(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=RANDOM_SEED,
    n_jobs=-1,
)

# Unfitted estimator; training happens in the next cell.
rf_model = RandomForestRegressor(**rf_params)

rf_model


In [19]:
# Train the forest on the training partition only; the test partition is
# kept aside for the evaluation cell further down.
rf_model.fit(X_train, y_train)


In [20]:
# Number of folds comes from the shared split metadata.
# (`KFold` is imported in the notebook's top import cell.)
N_FOLDS = split_meta["cross_validation"]["n_folds"]

# Shuffled K-fold splitter seeded with RANDOM_SEED, so every use of `cv`
# yields the same fold assignment.
cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)


In [21]:
# Score all three metrics in a single cross-validation pass. Calling
# cross_val_score once per metric would refit the forest on every fold three
# times; cross_validate fits each fold once and evaluates every scorer on it.
# The folds (seeded `cv`) and the estimator seed are unchanged, so the
# per-fold scores match the one-metric-at-a-time approach.
cv_scoring = {
    "mae": "neg_mean_absolute_error",
    "mse": "neg_mean_squared_error",
    "r2": "r2",
}
cv_results = cross_validate(rf_model, X_train, y_train, scoring=cv_scoring, cv=cv)

# scikit-learn reports error metrics negated ("higher is better"), so flip
# the sign back before averaging.
mae_cv = -cv_results["test_mae"].mean()

# RMSE is the square root of the fold-averaged MSE (not the mean of per-fold RMSEs).
rmse_cv = np.sqrt(-cv_results["test_mse"].mean())

r2_cv = cv_results["test_r2"].mean()

mae_cv, rmse_cv, r2_cv


(np.float64(1.0043395534004915),
 np.float64(2.988839263100615),
 np.float64(0.9799764190147375))

In [22]:
# Predict on the held-out test partition with the forest fitted earlier.
y_pred = rf_model.predict(X_test)

# Hold-out error metrics; RMSE is taken as the square root of the MSE.
mae_test = mean_absolute_error(y_test, y_pred)
mse_test = mean_squared_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred)

(mae_test, rmse_test, r2_test)


(0.9443119723436523, np.float64(1.7373319406062362), 0.9916293789380108)

In [23]:
# Unrounded scores keyed by their column name in the report, in column order.
raw_scores = {
    "MAE_CV": mae_cv,
    "RMSE_CV": rmse_cv,
    "R2_CV": r2_cv,
    "MAE_Test": mae_test,
    "RMSE_Test": rmse_test,
    "R2_Test": r2_test,
}

# One-row record: identifying fields first, then every score rounded to 3 decimals.
metrics_row = {"model": "RandomForestRegressor", "target": COST_TARGET}
metrics_row.update({label: round(score, 3) for label, score in raw_scores.items()})

metrics_df = pd.DataFrame([metrics_row])

# Persist the table without the index column.
metrics_path = f"{METRICS_DIR}/rf_cost_metrics.csv"
metrics_df.to_csv(metrics_path, index=False)

metrics_df


Unnamed: 0,model,target,MAE_CV,RMSE_CV,R2_CV,MAE_Test,RMSE_Test,R2_Test
0,RandomForestRegressor,sustainability_score,1.004,2.989,0.98,0.944,1.737,0.992


In [24]:
# Serialize the fitted forest into the models directory with joblib.
model_path = f"{MODELS_DIR}/rf_cost.joblib"
joblib.dump(rf_model, model_path)

# Display the saved location as the cell output.
model_path


'/content/ecopackai/ml/models/rf_cost.joblib'

In [25]:
# Markdown training summary, filled in from the fitted model's
# hyperparameters and the metrics computed in the cells above.
# The em dash in the title and the superscript in "R²" replace mis-encoded
# byte sequences that would otherwise be written into the report.
# NOTE(review): the "(Cost per unit, INR)" label is hard-coded and does not
# follow COST_TARGET - confirm it matches the column actually being modelled.
summary_md = f"""
# Random Forest Cost Model — Training Summary

## Model
RandomForestRegressor

## Hyperparameters
- n_estimators: {rf_model.n_estimators}
- max_depth: {rf_model.max_depth}
- min_samples_split: {rf_model.min_samples_split}
- min_samples_leaf: {rf_model.min_samples_leaf}
- random_state: {RANDOM_SEED}

## Target
- {COST_TARGET} (Cost per unit, INR)

## Evaluation Metrics

### Cross-Validation ({N_FOLDS}-fold)
- MAE: {mae_cv:.3f}
- RMSE: {rmse_cv:.3f}
- R²: {r2_cv:.3f}

### Test Set
- MAE: {mae_test:.3f}
- RMSE: {rmse_test:.3f}
- R²: {r2_test:.3f}

## Notes
- Compared against baseline models from previous module.
- Observed improvements and limitations should guide future tuning.
"""

# The summary contains non-ASCII characters, so write it as UTF-8 explicitly
# instead of relying on the platform's default encoding.
with open(f"{DOCS_DIR}/rf_cost_training_summary.md", "w", encoding="utf-8") as f:
    f.write(summary_md)
