# 1. Data Loader

This notebook handles the data acquisition from the [PokéAPI](https://pokeapi.co/).

**Objectives:**
1. Fetch Pokémon metadata (ID, name, types, artwork URL) for Generations 1–3 (the first 386 Pokémon).
2. Download official artwork images for each Pokémon.
3. Save the metadata to a CSV file in the `data/processed` directory and the images to the `data/raw` directory.

In [None]:
import requests
import pandas as pd
import os
import time
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# --- Configuration -----------------------------------------------------------
# Filesystem layout (relative to the notebook's working directory).
DATA_DIR = "../data"
RAW_IMG_DIR = os.path.join(DATA_DIR, "raw")              # downloaded artwork
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")      # metadata CSV

# PokéAPI endpoint that lists Pokémon and links to their detail records.
POKEAPI_URL = "https://pokeapi.co/api/v2/pokemon"

# Cumulative number of Pokémon up to the end of each generation.
GEN1_COUNT = 151
GEN2_COUNT = 251
GEN3_COUNT = 386

# How many Pokémon this notebook fetches (everything up to Gen 3).
LIMIT = GEN3_COUNT

# Make sure both output directories exist before anything is written to them.
for _dir in (RAW_IMG_DIR, PROCESSED_DIR):
    os.makedirs(_dir, exist_ok=True)

## 1. Fetch Pokémon Metadata

In [None]:
def fetch_pokemon_data(limit=LIMIT, timeout=10, delay=0.0):
    """Fetch metadata for the first `limit` Pokémon listed by the PokéAPI.

    The listing endpoint is queried once to obtain each Pokémon's detail URL;
    every detail URL is then fetched to read its ID, name, types and
    official-artwork image URL.

    Args:
        limit: Number of entries to request from the listing endpoint.
        timeout: Seconds to wait on each HTTP request before giving up.
        delay: Optional pause in seconds after each successful detail
            request (0 disables it).

    Returns:
        pandas.DataFrame with columns `id`, `name`, `type1`, `type2` and
        `image_url`, one row per successfully fetched Pokémon. `type2` and
        `image_url` are None when absent. The columns are present even if
        no Pokémon could be fetched.

    Raises:
        requests.RequestException: If the listing request fails or returns
            an error status (`requests.HTTPError`). Failures on individual
            Pokémon are reported and skipped instead.
    """
    columns = ['id', 'name', 'type1', 'type2', 'image_url']
    data = []
    print(f"Fetching data for first {limit} Pokemon...")

    # A single session reuses the HTTP connection across all requests.
    with requests.Session() as session:
        response = session.get(POKEAPI_URL, params={'limit': limit}, timeout=timeout)
        # Fail loudly here: without the listing there is nothing to iterate over.
        response.raise_for_status()
        results = response.json()['results']

        for item in tqdm(results):
            try:
                r = session.get(item['url'], timeout=timeout)
                if r.status_code != 200:
                    print(f"Failed to fetch {item['name']}")
                    continue
                details = r.json()
            except (requests.RequestException, ValueError) as e:
                # Network error or malformed JSON: skip this entry, keep the rest.
                print(f"Failed to fetch {item['name']}: {e}")
                continue

            # Extract types (the second type is optional)
            types = [t['type']['name'] for t in details.get('types', [])]
            type1 = types[0] if types else None
            type2 = types[1] if len(types) > 1 else None

            # Extract image URL (official artwork); any level may be missing or null
            sprites = details.get('sprites') or {}
            other = sprites.get('other') or {}
            artwork = other.get('official-artwork') or {}
            img_url = artwork.get('front_default')

            data.append({
                'id': details['id'],
                'name': details['name'],
                'type1': type1,
                'type2': type2,
                'image_url': img_url
            })

            # Be nice to the API when a delay is requested
            if delay:
                time.sleep(delay)

    return pd.DataFrame(data, columns=columns)

# Run the fetch with the default limit; later cells read `df_pokemon`.
df_pokemon = fetch_pokemon_data()

In [None]:
# Sanity check: show the first few rows, then the number of rows fetched.
preview = df_pokemon.head()
print(preview)

fetched_total = len(df_pokemon)
print(f"Total Pokemon fetched: {fetched_total}")

## 2. Download Images

In [None]:
def download_image(row, timeout=30):
    """Download one Pokémon's artwork to `RAW_IMG_DIR/<name>.png`.

    Args:
        row: Mapping-like record (dict or pandas Series) providing
            `image_url` and `name`.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        None. Nothing is downloaded when the URL is missing or the target
        file already exists. Download and write errors are printed rather
        than raised, so one failure does not abort a batch.
    """
    url = row['image_url']
    # A missing URL can be None, '' or NaN (NaN is truthy, so `not url`
    # alone would let it through); only a non-empty string is usable.
    if not isinstance(url, str) or not url:
        return

    file_path = os.path.join(RAW_IMG_DIR, f"{row['name']}.png")
    if os.path.exists(file_path):
        return # Skip if already exists

    # Write to a temporary name first so an interrupted download never leaves
    # a truncated .png that the exists-check above would skip forever.
    tmp_path = f"{file_path}.part"
    try:
        response = requests.get(url, timeout=timeout)
        # Do not save an HTTP error page as if it were an image.
        response.raise_for_status()
        with open(tmp_path, 'wb') as f:
            f.write(response.content)
        os.replace(tmp_path, file_path)
    except Exception as e:
        print(f"Error downloading {row['name']}: {e}")
        # Best-effort cleanup of a partially written temporary file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

print("Downloading images...")

# One task per DataFrame row; a thread pool overlaps the network waits.
rows = [row for _, row in df_pokemon.iterrows()]
with ThreadPoolExecutor(max_workers=10) as executor:
    progress = tqdm(executor.map(download_image, rows), total=len(df_pokemon))
    # Consuming the iterator drives the progress bar until every task is done.
    list(progress)

print("Download complete.")

## 3. Save Metadata

In [None]:
# Persist the metadata table (without the DataFrame index) next to the other
# processed artifacts.
csv_path = os.path.join(PROCESSED_DIR, "pokemon_metadata.csv")
df_pokemon.to_csv(path_or_buf=csv_path, index=False)

print(f"Metadata saved to {csv_path}")