In [1]:
import pandas as pd
import numpy as np
import os
import json

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold


In [4]:
# Root directory that holds the raw inputs and receives the generated artefacts.
# Defaults to the original Colab location; set the ECOPACKAI_BASE environment
# variable to run the notebook elsewhere without editing this cell.
BASE = os.environ.get("ECOPACKAI_BASE", "/content/ecopackai")

# Raw feature / target tables read by the loading cell.
DATA_INPUT = f"{BASE}/X_raw.csv"
TARGET_INPUT = f"{BASE}/y_raw.csv"

# Output folders for the split metadata (JSON) and the generated Markdown docs.
META_DIR = f"{BASE}/ml/metadata"
DOCS_DIR = f"{BASE}/docs"

# Create the output folders up front; exist_ok keeps the cell safe to re-run.
os.makedirs(META_DIR, exist_ok=True)
os.makedirs(DOCS_DIR, exist_ok=True)


In [5]:
# Read the raw feature table and the raw target table from the configured paths.
X, y = (pd.read_csv(csv_path) for csv_path in (DATA_INPUT, TARGET_INPUT))

# Report (rows, columns) of each table as a quick sanity check.
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")


X shape: (404, 3)
y shape: (404, 3)


In [6]:
# Preview the first rows of the feature table to confirm the columns loaded as expected.
X.head()


Unnamed: 0,Material Type,Biodegradation Time (days),Recyclability Category
0,Cardboard,188.0,High
1,Paper/Bio-Based,65.0,High
2,Steel,180208.0,High
3,Paper/Bio-Based,65.0,High
4,Paper/Bio-Based,122.0,High


In [7]:
# Preview the first rows of the target table to confirm the columns loaded as expected.
y.head()


Unnamed: 0,recommended_material,sustainability_score,cost_efficiency_category
0,Cardboard,78.48,High-cost
1,Paper/Bio-Based,85.23,High-cost
2,Steel,13.0,Low-cost
3,Paper/Bio-Based,85.4,High-cost
4,Paper/Bio-Based,83.6,High-cost


In [8]:
# Seed shared by the train/test split and the cross-validation splitter.
RANDOM_SEED = 42

# Fraction of rows held out as the test set (passed as test_size to the split).
TEST_SIZE = 0.2


In [11]:
# Column whose class proportions are preserved by the train/test split and
# by the cross-validation folds.
STRATIFY_COL = "Recyclability Category"

# Fail fast with a specific exception type (still an Exception subclass) and
# list the columns that do exist, so a renamed or missing column is easy to spot.
if STRATIFY_COL not in X.columns:
    raise KeyError(
        f"Stratification column '{STRATIFY_COL}' not found; "
        f"available columns: {list(X.columns)}"
    )

In [12]:
# Hold out TEST_SIZE of the rows, keeping the class proportions of
# STRATIFY_COL the same in both partitions; the seed makes the split repeatable.
split_options = {
    "test_size": TEST_SIZE,
    "random_state": RANDOM_SEED,
    "stratify": X[STRATIFY_COL],
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [13]:
# Show the (rows, columns) shape of each feature partition.
print(f"Train size: {X_train.shape}")
print(f"Test size: {X_test.shape}")


Train size: (323, 3)
Test size: (81, 3)


In [14]:
# Split invariants: train and test share no rows, together they account for
# every input row, and features/targets remain row-aligned in each partition.
assert set(X_train.index).isdisjoint(set(X_test.index)), "train/test rows overlap"
assert len(X_train) + len(X_test) == len(X), "split dropped or duplicated rows"
assert X_train.index.equals(y_train.index), "X_train and y_train are misaligned"
assert X_test.index.equals(y_test.index), "X_test and y_test are misaligned"


In [15]:
# Summary statistics of the training targets. describe() covers numeric columns
# by default, so only sustainability_score appears in the output.
y_train.describe()


Unnamed: 0,sustainability_score
count,323.0
mean,58.474706
std,20.32305
min,0.0
25%,45.075
50%,47.68
75%,79.13
max,87.83


In [16]:
# Summary statistics of the test targets, for comparison with the training set.
# describe() covers numeric columns by default, so only sustainability_score appears.
y_test.describe()


Unnamed: 0,sustainability_score
count,81.0
mean,57.843951
std,20.174546
min,15.4
25%,44.87
50%,46.33
75%,78.46
max,85.23


In [17]:
# Class proportions of the stratification column in the training partition;
# compare with the test partition to confirm stratification held.
X_train[STRATIFY_COL].value_counts(normalize=True)


Unnamed: 0_level_0,proportion
Recyclability Category,Unnamed: 1_level_1
High,0.50774
Medium,0.49226


In [18]:
# Class proportions of the stratification column in the test partition;
# these should closely match the training proportions.
X_test[STRATIFY_COL].value_counts(normalize=True)


Unnamed: 0_level_0,proportion
Recyclability Category,Unnamed: 1_level_1
High,0.506173
Medium,0.493827


In [19]:
# Number of cross-validation folds.
N_FOLDS = 5

# Shuffled, stratified K-fold splitter; the fixed seed keeps fold membership repeatable.
cv_strategy = StratifiedKFold(N_FOLDS, shuffle=True, random_state=RANDOM_SEED)


In [20]:
# Walk the folds of the training partition (stratified on STRATIFY_COL) and
# report how many rows land in the fitting and validation parts of each fold.
fold_iter = cv_strategy.split(X_train, X_train[STRATIFY_COL])
for fold, (train_idx, val_idx) in enumerate(fold_iter):
    print(f"Fold {fold + 1}:")
    print(f"  Train: {len(train_idx)}")
    print(f"  Val: {len(val_idx)}")


Fold 1:
  Train: 258
  Val: 65
Fold 2:
  Train: 258
  Val: 65
Fold 3:
  Train: 258
  Val: 65
Fold 4:
  Train: 259
  Val: 64
Fold 5:
  Train: 259
  Val: 64


In [21]:
# Machine-readable record of how the data was split. The ratios are derived
# from TEST_SIZE (assumed to be a fraction in (0, 1)) rather than hard-coded,
# so the metadata cannot drift from the split that was actually performed.
# Rounding guards against float noise such as 0.7999999999999999.
split_metadata = {
    "dataset": "EcoPackAI Integrated Dataset",
    "dataset_version": "materials_engineered_v1",
    "train_test_split": {
        "train_ratio": round(1 - TEST_SIZE, 4),
        "test_ratio": round(TEST_SIZE, 4),
        "stratify_on": STRATIFY_COL,
        "random_seed": RANDOM_SEED
    },
    "cross_validation": {
        "strategy": "StratifiedKFold",
        "n_folds": N_FOLDS,
        "shuffle": True,
        "random_seed": RANDOM_SEED
    },
    # Column names of the loaded feature and target tables.
    "features_used": list(X.columns),
    "targets_used": list(y.columns)
}


In [22]:
# Persist the split configuration as pretty-printed JSON in the metadata folder.
metadata_path = f"{META_DIR}/split_metadata.json"
with open(metadata_path, "w") as f:
    f.write(json.dumps(split_metadata, indent=2))


In [23]:
# Markdown summary of the train/test split. The ratio line is computed from
# TEST_SIZE (assumed to be a fraction in (0, 1)) so the document always
# reflects the split that was actually performed.
summary_md = f"""
# Train/Test Split Summary

## Split Strategy
- Train/Test Ratio: {round((1 - TEST_SIZE) * 100)}/{round(TEST_SIZE * 100)}
- Stratification Column: {STRATIFY_COL}
- Random Seed: {RANDOM_SEED}

## Dataset Sizes
- Training Samples: {len(X_train)}
- Testing Samples: {len(X_test)}

## Target Variables
{list(y.columns)}

## Data Integrity
- No overlap between training and testing sets
- Category distribution preserved
"""


In [24]:
# Write the split summary; the explicit UTF-8 encoding keeps the file identical
# across platforms even if interpolated column names contain non-ASCII text.
with open(f"{DOCS_DIR}/train_test_split_summary.md", "w", encoding="utf-8") as f:
    f.write(summary_md)


In [25]:
# Markdown description of the cross-validation setup. The rationale names the
# actual stratification column instead of a hard-coded (and incorrect) label.
cv_md = f"""
# Cross-Validation Strategy

## Method
Stratified K-Fold Cross-Validation

## Configuration
- Number of folds: {N_FOLDS}
- Shuffle: Enabled
- Random seed: {RANDOM_SEED}

## Rationale
Stratifying on "{STRATIFY_COL}" keeps the class proportions of that column
balanced across validation folds, preventing biased evaluation.
"""


In [26]:
# Write the cross-validation notes; the explicit UTF-8 encoding keeps the file
# identical across platforms even if the interpolated column name is non-ASCII.
with open(f"{DOCS_DIR}/cross_validation_strategy.md", "w", encoding="utf-8") as f:
    f.write(cv_md)


In [27]:
# Markdown reproducibility notes. The stratification assumption names the
# column actually used, and the test size is recorded alongside the seed
# because both are needed to reproduce the split.
repro_md = f"""
# Experiment Reproducibility Notes

## Fixed Parameters
- Random Seed: {RANDOM_SEED}
- Test Size: {TEST_SIZE}
- Dataset Version: materials_engineered_v1

## Assumptions
- {STRATIFY_COL} is representative of data distribution
- Engineered features are stable and validated

## Reproduction Steps
1. Load X_raw.csv and y_raw.csv
2. Apply same random seed and test size
3. Use same stratification and CV configuration
"""


In [28]:
# Write the reproducibility notes; the explicit UTF-8 encoding keeps the file
# identical across platforms even if the interpolated column name is non-ASCII.
with open(f"{DOCS_DIR}/experiment_reproducibility.md", "w", encoding="utf-8") as f:
    f.write(repro_md)
