In [4]:
import pandas as pd
import numpy as np
import os

# Root folder under which this notebook writes its artefacts.
BASE = "/content/ecopackai"
# Input CSV; note that it sits directly in /content, not under BASE.
DATA_PATH = "/content/materials_engineered.csv"
# Folder that receives the X_raw.csv / y_raw.csv files written further down.
OUTPUT_PATH = BASE + "/data/model_input"

# Create the output folder (and any missing parents); a no-op if it already exists.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [5]:
# Load the materials table from DATA_PATH and take a first look at it.
df = pd.read_csv(DATA_PATH)

print(f"Shape: {df.shape}")
df.head()


Shape: (404, 31)


Unnamed: 0,Material ID,Packaging Type,Material Type,Suitable Product Categories,Recommended Packaging Use Cases,Supplier Region,Recyclability (%),Recyclability Category,Recycled Content (%),Reusability (%),...,Total Material Weight (tons),Supplier Sustainability Compliance (%),Weight per Unit (g),Cost per kg (USD),CO2_Impact_Index_raw,CO2_Impact_Index,Cost_Efficiency_Index_raw,Cost_Efficiency_Index,Material_Suitability_Score_raw,Material_Suitability_Score
0,MAT_0001,Cardboard Boxes,Cardboard,"E-commerce, Food & Beverage, Consumer Goods, A...",Last-mile delivery and primary e-commerce pack...,EMEA,98,High,79.0,49.0,...,790.0,85.0,7801.698598,0.287117,0.215183,21.52,0.704681,70.47,0.440278,44.03
1,MAT_0002,Protective Fillers (Paper/Biodegradable),Paper/Bio-Based,"Fragile Items, Cosmetics, Pharmaceuticals, Int...",Void-fill and cushioning for fragile products,APAC,100,High,93.0,31.0,...,545.0,88.0,7368.748394,0.246989,0.14769,14.77,0.714471,71.45,0.253472,25.35
2,MAT_0003,Steel Racks & Containers,Steel,"Heavy Industrial Components, High-Security Goods","Secure, high-load international shipping and l...",AMERICAS,87,High,78.0,100.0,...,4979.0,88.0,339886.681685,0.073554,0.87,87.0,0.174145,17.41,0.774306,77.43
3,MAT_0004,Protective Fillers (Paper/Biodegradable),Paper/Bio-Based,"Fragile Items, Cosmetics, Pharmaceuticals, Int...",Void-fill and cushioning for fragile products,ROW,100,High,89.0,31.0,...,656.0,81.0,8742.470281,0.277953,0.146007,14.6,0.700252,70.03,0.1625,16.25
4,MAT_0005,Protective Fillers (Paper/Biodegradable),Paper/Bio-Based,"Fragile Items, Cosmetics, Pharmaceuticals, Int...",Void-fill and cushioning for fragile products,LATAM,100,High,92.0,28.0,...,549.0,81.0,7932.494329,0.213048,0.164028,16.4,0.717501,71.75,0.225694,22.57


In [6]:
# Print each column's dtype and non-null count to spot missing values early.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404 entries, 0 to 403
Data columns (total 31 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Material ID                             404 non-null    object 
 1   Packaging Type                          404 non-null    object 
 2   Material Type                           404 non-null    object 
 3   Suitable Product Categories             404 non-null    object 
 4   Recommended Packaging Use Cases         404 non-null    object 
 5   Supplier Region                         404 non-null    object 
 6   Recyclability (%)                       404 non-null    int64  
 7   Recyclability Category                  404 non-null    object 
 8   Recycled Content (%)                    403 non-null    float64
 9   Reusability (%)                         403 non-null    float64
 10  Biodegradation Time (days)              403 non-null    float6

In [7]:
# Number of missing values in each column.
df.isna().sum(axis=0)


Unnamed: 0,0
Material ID,0
Packaging Type,0
Material Type,0
Suitable Product Categories,0
Recommended Packaging Use Cases,0
Supplier Region,0
Recyclability (%),0
Recyclability Category,0
Recycled Content (%),1
Reusability (%),1


In [8]:
# Expose the material type under the target name `recommended_material`.
# This is a copy, not a rename: the original "Material Type" column stays in df.
if "Material Type" not in df.columns:
    # KeyError (a subclass of Exception) states the failure precisely: a required
    # column is absent from the loaded dataset.
    raise KeyError("Material Type column missing")

df["recommended_material"] = df["Material Type"]


In [9]:
# Sustainability is the complement of the CO2 impact index: lower impact -> higher score.
co2_impact = df["CO2_Impact_Index"]
df["sustainability_score"] = 100 - co2_impact


In [10]:
def cost_bucket(x):
    """Map a cost-efficiency index value to a coarse cost label.

    Args:
        x: Numeric index value (the notebook passes ``Cost_Efficiency_Index``).

    Returns:
        ``"Low-cost"`` for ``x < 33``, ``"Medium-cost"`` for ``33 <= x < 66``,
        ``"High-cost"`` for ``x >= 66``, and ``NaN`` when ``x`` is missing.
    """
    # Every comparison with NaN is False, so without this guard a missing index
    # would fall through to the last branch and be silently labelled "High-cost".
    if pd.isna(x):
        return np.nan
    if x < 33:
        return "Low-cost"
    if x < 66:
        return "Medium-cost"
    # NOTE(review): large index values map to "High-cost". If a higher
    # Cost_Efficiency_Index means *more* cost-efficient, these labels are
    # inverted -- confirm against the index definition.
    return "High-cost"

# Label every row by bucketing its Cost_Efficiency_Index value.
df["cost_efficiency_category"] = [
    cost_bucket(index_value) for index_value in df["Cost_Efficiency_Index"]
]


In [11]:
# The three prediction targets created in the cells above.
target_cols = ["recommended_material", "sustainability_score", "cost_efficiency_category"]

# Preview the targets side by side.
df.loc[:, target_cols].head()


Unnamed: 0,recommended_material,sustainability_score,cost_efficiency_category
0,Cardboard,78.48,High-cost
1,Paper/Bio-Based,85.23,High-cost
2,Steel,13.0,Low-cost
3,Paper/Bio-Based,85.4,High-cost
4,Paper/Bio-Based,83.6,High-cost


In [12]:
# Candidate input columns, grouped the same way as in the data dictionary.
# NOTE(review): several names do not match the dataset header shown by df.head()
# (e.g. "Cost_per_kg" here vs "Cost per kg (USD)" there), so the existence
# filter applied later discards them -- confirm the intended column names.

# Material properties
material_features = [
    "Material Type", "Density", "Strength Score",
    "Moisture_resistance", "Thermal_resistance", "Load_handling",
    "CO2_per_kg", "Biodegradation Time (days)",
    "Recyclability Category", "Renewable_Content_Percentage",
]

# Product requirements
product_features = [
    "Product Category", "Fragility Score", "Weight Category",
    "Moisture Sensitivity", "Temperature Sensitivity",
    "Transportation Distance", "Expected Shelf Life (days)",
]

# Cost & operations
cost_features = [
    "Cost_per_kg", "Manufacturing Cost",
    "Supply Chain Availability", "Regional Restrictions",
]


In [13]:
# All declared candidates, in group order: material, product, cost.
candidate_cols = material_features + product_features + cost_features

# Keep only the candidates that actually exist in the loaded dataset.
feature_cols = [c for c in candidate_cols if c in df.columns]

# Report what the filter discarded instead of dropping it silently, so a naming
# mismatch between the declared features and the dataset is visible in the output.
missing_cols = [c for c in candidate_cols if c not in df.columns]
if missing_cols:
    print(f"Skipped {len(missing_cols)} declared feature(s) not found in the dataset:")
    for col in missing_cols:
        print(f"  - {col}")

# NOTE(review): "Material Type" is also the source of the `recommended_material`
# target, so a model predicting that target from these features sees its own
# label. Exclude it for that task if leakage is not intended.

print("Final Feature Count:", len(feature_cols))
feature_cols


Final Feature Count: 3


['Material Type', 'Biodegradation Time (days)', 'Recyclability Category']

In [14]:
# Columns removed from the modelling frame.
drop_cols = [
    # The dataset spells the identifier "Material ID" (see df.info()); the old
    # "MaterialID" entry matched nothing, so the ID column was never dropped.
    "Material ID",
    "MaterialID",  # kept in case another dataset version uses this spelling
    "Material Description",
    # Sources of the derived targets: sustainability_score comes from
    # CO2_Impact_Index and cost_efficiency_category from Cost_Efficiency_Index.
    "CO2_Impact_Index",
    "Cost_Efficiency_Index",
    "Material_Suitability_Score"
]
# NOTE(review): the "*_raw" variants of the three index columns appear in the
# dataset header and stay in df_model -- confirm whether they should go too.

# errors="ignore" skips any listed name that is absent from the frame.
df_model = df.drop(columns=drop_cols, errors="ignore")


In [15]:
# Split the modelling frame into inputs (X) and targets (y).
X = df_model.loc[:, feature_cols]
y = df_model.loc[:, target_cols]


In [16]:
# Confirm that inputs and targets have the same number of rows, then preview X.
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X.head()


X shape: (404, 3)
y shape: (404, 3)


Unnamed: 0,Material Type,Biodegradation Time (days),Recyclability Category
0,Cardboard,188.0,High
1,Paper/Bio-Based,65.0,High
2,Steel,180208.0,High
3,Paper/Bio-Based,65.0,High
4,Paper/Bio-Based,122.0,High


In [17]:
# Write the input features to the output folder without the row index.
x_raw_path = f"{OUTPUT_PATH}/X_raw.csv"
X.to_csv(x_raw_path, index=False)


In [18]:
# Write the target columns to the output folder without the row index.
y_raw_path = f"{OUTPUT_PATH}/y_raw.csv"
y.to_csv(y_raw_path, index=False)


In [21]:
import os

# Markdown data dictionary describing the targets and the declared feature groups.
# NOTE(review): the groups list every *declared* candidate feature, not only the
# ones that survive the column-existence filter -- confirm that is intended.
doc = """
## Features & Targets — Dataset Preparation (Week 4)

### Target Variables
- recommended_material (classification)
- sustainability_score (regression, 0–100)
- cost_efficiency_category (Low / Medium / High)

### Feature Groups
Material Properties:
- {}

Product Requirements:
- {}

Cost & Operations:
- {}
""".format(
    ", ".join(material_features),
    ", ".join(product_features),
    ", ".join(cost_features)
)

# Ensure the directory exists before writing the file
docs_dir = f"{BASE}/docs"
os.makedirs(docs_dir, exist_ok=True)

# The text contains non-ASCII dashes, so pin the encoding instead of relying on
# the platform default, which is not UTF-8 everywhere.
with open(f"{docs_dir}/data_dictionary_features_targets.md", "w", encoding="utf-8") as f:
    f.write(doc)