In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Build a class-stratified subsample of the image metadata and save it to CSV.
#
# Pipeline: load the metadata, drop rows without a label, drop classes too
# small to stratify on, draw a stratified sample, and write it to disk.

# Tunable parameters, gathered here so the sampling logic has no magic values.
INPUT_CSV = "filtered_metadata_with_images.csv"
OUTPUT_CSV = "sampled_metadata_stratified.csv"
LABEL_COL = "product_type"
SAMPLE_SIZE = 20000    # number of rows to draw
MIN_CLASS_COUNT = 2    # a stratified split needs at least 2 members per class
RANDOM_STATE = 42      # fixed seed so the sample is reproducible

# Step 1: Load full metadata CSV
df = pd.read_csv(INPUT_CSV)

# Step 2: Drop rows with a missing label (they cannot be stratified on)
df = df.dropna(subset=[LABEL_COL])

# Step 3: Remove rare classes (fewer than MIN_CLASS_COUNT entries)
class_counts = df[LABEL_COL].value_counts()
valid_classes = class_counts[class_counts >= MIN_CLASS_COUNT].index
df = df[df[LABEL_COL].isin(valid_classes)]

# Optional: Show how many were removed
print(f"⚠️ Removed {len(class_counts) - len(valid_classes)} rare {LABEL_COL} classes.")

# Step 4: Perform stratified sampling of SAMPLE_SIZE rows
if len(df) <= SAMPLE_SIZE:
    # train_test_split raises ValueError when test_size is not smaller than
    # the number of rows, so fall back to keeping every remaining row.
    print(f"⚠️ Only {len(df):,} rows available (requested {SAMPLE_SIZE:,}); keeping all of them.")
    sampled_df = df.copy()
else:
    # The "test" partition is the sample we keep. Index the result rather than
    # unpacking into `_`, which would shadow IPython's last-output variable and
    # keep the large remainder frame alive in the kernel.
    sampled_df = train_test_split(
        df,
        test_size=SAMPLE_SIZE,
        stratify=df[LABEL_COL],
        random_state=RANDOM_STATE,
    )[1]

# Step 5: Save the sampled data
sampled_df.to_csv(OUTPUT_CSV, index=False)
# Report the actual number of rows written rather than a hard-coded figure.
print(f"✅ Sampled {len(sampled_df):,} rows and saved to '{OUTPUT_CSV}'")


⚠️ Removed 59 rare product_type classes.
✅ Sampled 20,000 rows and saved to 'sampled_metadata_stratified.csv'
