# Project Notebook
Generated from script.

In [None]:
import sys
import os
from pathlib import Path

# Make the project package importable from inside this notebook.
# The notebook is assumed to run from 'notebooks/', so the project root is
# the parent of the current working directory.
project_root = Path.cwd().parent
if sys.path.count(str(project_root)) == 0:
    # Only register the root once, even if the cell is re-run.
    sys.path.append(str(project_root))
    
from pokemon_predictor import config
from pokemon_predictor.data.utils import load_data
import pandas as pd
import numpy as np
import joblib


# Feature Extraction Pipeline
    
This notebook orchestrates the extraction of features from the raw image dataset.
It uses the functions defined in `pokemon_predictor.features`.

**Inputs**: 
*   `data/raw/*.png`
*   `data/processed/pokemon_metadata.csv`

**Outputs**:
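*   `data/processed/X_kmeans.csv` (RGB dominant colors from K-Means with k=5; columns `R1, G1, B1, P1, ..., R5, G5, B5, P5`)
*   `data/processed/X_hist.csv` (3D color histograms)
*   `data/processed/y_labels.csv` (metadata rows for the successfully processed images, in the same row order as the two feature files)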


In [None]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
from pokemon_predictor import config
from pokemon_predictor.features import extract_kmeans_features, extract_histogram_features

# 1. Load Metadata
# Read the metadata table from the processed-data directory and keep a single
# row per Pokemon 'id'. If the file is missing, only a message is printed and
# df_meta is left undefined.
metadata_path = config.PROCESSED_DATA_DIR / "pokemon_metadata.csv"
if metadata_path.exists():
    df_meta = pd.read_csv(metadata_path).drop_duplicates(subset=['id'])
    print(f"Unique Pokemon to process: {len(df_meta)}")
else:
    print(f"Error: Metadata file not found at {metadata_path}. Run data loader notebook first.")


In [None]:
def process_single_image(row):
    """Extract both feature sets for the image that belongs to one metadata row.

    Args:
        row: Record providing the keys ``'id'`` and ``'name'``. The image is
            looked up as ``<name>.png`` under ``config.RAW_DATA_DIR``.

    Returns:
        A 4-tuple ``(pid, kmeans_feats, hist_feats, success)``. When the image
        file does not exist, or either extractor returns ``None``, both
        feature slots are ``None`` and ``success`` is ``False``.
    """
    pokemon_id = row['id']
    image_file = config.RAW_DATA_DIR / f"{row['name']}.png"
    failure = (pokemon_id, None, None, False)

    if not image_file.exists():
        return failure

    image_str = str(image_file)
    # RGB K-Means (k=5) first, then the histograms; both extractors run
    # before their results are checked.
    dominant_colors = extract_kmeans_features(image_str, k=5)
    histogram = extract_histogram_features(image_str)

    if dominant_colors is not None and histogram is not None:
        return pokemon_id, dominant_colors, histogram, True
    return failure


In [None]:
# 2. Run Extraction (Parallel)
# Fan the per-image extraction out to worker processes, then collect the
# features of every successfully processed image into three parallel lists
# (K-Means features, histogram features, metadata row).
#
# NOTE(review): the concurrent.futures docs state that worker processes must be
# able to import the submitted callable; a function defined in a notebook cell
# may not be picklable under the "spawn" start method - confirm on the target
# platform.
X_kmeans_list = []
X_hist_list = []
y_labels_list = []

print("Starting extraction...")
rows = [row for _, row in df_meta.iterrows()]

# Successful results keyed by submission position. as_completed() yields
# futures in finish order, which varies between runs; keying by position lets
# the output tables follow the order of df_meta and stay reproducible.
results = {}

with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
    futures = {
        executor.submit(process_single_image, row): pos
        for pos, row in enumerate(rows)
    }

    # Iterating in completion order keeps the progress bar responsive.
    for future in tqdm(as_completed(futures), total=len(rows), desc="Processing"):
        pos = futures[future]
        row = rows[pos]
        try:
            pid, k_feats, h_feats, success = future.result()
        except Exception as e:
            # A failure in one worker must not abort the whole batch.
            print(f"Error on {row['name']}: {e}")
        else:
            if success:
                results[pos] = (k_feats, h_feats)

# Assemble the three lists in input order so they stay row-aligned.
for pos in sorted(results):
    k_feats, h_feats = results[pos]
    X_kmeans_list.append(k_feats)
    X_hist_list.append(h_feats)
    y_labels_list.append(rows[pos])

print(f"Successfully processed {len(X_kmeans_list)} images.")


In [None]:
# 3. Save to CSV
# K-Means column names: four columns (R, G, B, P) per cluster, with clusters
# numbered 1 through 5 -> R1, G1, B1, P1, ..., R5, G5, B5, P5.
columns_kmeans = [
    f'{channel}{cluster}'
    for cluster in range(1, 6)
    for channel in ('R', 'G', 'B', 'P')
]

df_X_kmeans = pd.DataFrame(X_kmeans_list, columns=columns_kmeans)
df_X_hist = pd.DataFrame(X_hist_list)
df_y = pd.DataFrame(y_labels_list)

# Make sure the output directory exists, then write each table without its index.
os.makedirs(config.PROCESSED_DATA_DIR, exist_ok=True)
for frame, filename in (
    (df_X_kmeans, "X_kmeans.csv"),
    (df_X_hist, "X_hist.csv"),
    (df_y, "y_labels.csv"),
):
    frame.to_csv(config.PROCESSED_DATA_DIR / filename, index=False)

print("Saved processed datasets.")
