# 2. Feature Extraction

This notebook processes the downloaded images to extract features for our two models:
1. **Feature A (K-Means):** Dominant colors (for XGBoost).
2. **Feature B (Histogram):** 3D Color Histogram (for MLP).

In [None]:
import cv2
import numpy as np
import pandas as pd
import os
import sys
from tqdm import tqdm

# Add src to path
sys.path.append("../src")
from features import extract_kmeans_features, extract_histogram_features

# Filesystem layout (all paths are relative to the notebook's working directory)
DATA_DIR = "../data"
RAW_IMG_DIR = os.path.join(DATA_DIR, "raw")              # input images
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")      # metadata in, feature tables out
METADATA_PATH = os.path.join(DATA_DIR, "processed", "pokemon_metadata.csv")

In [None]:
# Load Metadata
df = pd.read_csv(METADATA_PATH)

# Fail fast on a malformed metadata file: the extraction cell reads 'name' for
# every row and selects 'type1'/'type2' only after its loop has finished, so a
# missing label column would otherwise surface after all images were processed.
required_columns = {"name", "type1", "type2"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{METADATA_PATH} is missing required columns: {sorted(missing_columns)}"
    )

print(f"Loaded {len(df)} entries.")

## Run Extraction

In [None]:
# Accumulators for rows whose image produced BOTH feature vectors.
kmeans_features = []
hist_features = []
valid_indices = []

# Bookkeeping for dropped rows so data loss is visible instead of silent.
missing_images = []   # names with no PNG under RAW_IMG_DIR
failed_images = []    # names whose PNG exists but an extractor returned None

print("Extracting features...")
# Only the 'name' column is needed to locate an image, so iterate that column
# directly; df.iterrows() would build a full Series for every row.
for idx, name in tqdm(df["name"].items(), total=len(df)):
    img_path = os.path.join(RAW_IMG_DIR, f"{name}.png")

    if not os.path.exists(img_path):
        missing_images.append(name)
        continue

    feats_a = extract_kmeans_features(img_path)
    feats_b = extract_histogram_features(img_path)

    # Keep a row only when both extractors succeeded, so the two feature
    # tables and the label table stay aligned row-for-row.
    if feats_a is None or feats_b is None:
        failed_images.append(name)
        continue

    kmeans_features.append(feats_a)
    hist_features.append(feats_b)
    valid_indices.append(idx)

# Create final DataFrames
X_kmeans = pd.DataFrame(kmeans_features)
X_hist = pd.DataFrame(hist_features)
y = df.loc[valid_indices, ['type1', 'type2']]

# The three tables are related only by row position, so their lengths must agree.
assert len(X_kmeans) == len(X_hist) == len(y), (
    f"Row count mismatch: X_kmeans={len(X_kmeans)}, X_hist={len(X_hist)}, y={len(y)}"
)

print(f"Processed {len(X_kmeans)} images successfully.")

skipped_total = len(missing_images) + len(failed_images)
if skipped_total:
    print(
        f"Skipped {skipped_total} entries: {len(missing_images)} missing image file(s), "
        f"{len(failed_images)} failed extraction(s)."
    )

## Save Datasets

In [None]:
# Persist each table as a CSV in PROCESSED_DIR; the index is not written.
outputs = (
    (X_kmeans, "X_kmeans.csv"),
    (X_hist, "X_hist.csv"),
    (y, "y_labels.csv"),
)
for frame, filename in outputs:
    destination = os.path.join(PROCESSED_DIR, filename)
    frame.to_csv(destination, index=False)

print("Datasets saved.")