# Data Loading

Goal of this Notebook:
- Load images into the `data/images` dir, with unique names and `.png` endings
- Load metadata into the `data/metadata`, with the name `images.json`

You can use this notebook to load the sample dataset, or load any other dataset you have, it must just fulfill the specifications above.

In [1]:
# install kaggle if a sample dataset should be loaded, otherwise add your own dataset directly
%pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.10-py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.0/63.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.10
Note: you may need to restart the kernel to use updated packages.


In [2]:
# --- Imports ---
import os
import shutil
import json
from pathlib import Path

import pandas as pd
from PIL import Image

import kagglehub

# --- Path Definitions ---
# Base directories as per your Docker mount configuration
NOTEBOOK_DIR = Path(os.getcwd())
REPO_ROOT = NOTEBOOK_DIR.parent
DATASET_PATH = REPO_ROOT / "data"

# Image and metadata directories
IMAGE_DIR = DATASET_PATH / "images"
METADATA_PATH = DATASET_PATH / "metadata" / "images.json"

# Create directories if they don't exist
IMAGE_DIR.mkdir(parents=True, exist_ok=True)
METADATA_PATH.parent.mkdir(parents=True, exist_ok=True)

# --- Download Dataset ---
# Download dataset using kagglehub
kaggle_path = kagglehub.dataset_download("vishalsubbiah/pokemon-images-and-types")
print("Dataset downloaded to:", kaggle_path)

# --- Kaggle Paths ---
KAGGLE_IMAGES = Path(kaggle_path) / "images"
KAGGLE_CSV = Path(kaggle_path) / "pokemon.csv"

# Also define them as strings (if needed for other libraries)
KAGGLE_IMAGES_STR = os.path.join(kaggle_path, "images")
KAGGLE_CSV_STR = os.path.join(kaggle_path, "pokemon.csv")

# --- Log Paths ---
print("\n=== Paths ===")
print(f"Current working directory: {NOTEBOOK_DIR}")
print(f"Repository root: {REPO_ROOT}")
print(f"Dataset path: {DATASET_PATH}")
print(f"Image directory: {IMAGE_DIR}")
print(f"Metadata path: {METADATA_PATH}")

print("\n=== Loading Data ===")
print(f"Loading CSV from: {KAGGLE_CSV_STR}")
print(f"Loading images from: {KAGGLE_IMAGES_STR}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/vishalsubbiah/pokemon-images-and-types?dataset_version_number=4...


100%|██████████| 3.68M/3.68M [00:00<00:00, 5.51MB/s]

Extracting files...





Dataset downloaded to: /root/.cache/kagglehub/datasets/vishalsubbiah/pokemon-images-and-types/versions/4

=== Paths ===
Current working directory: /app/notebooks
Repository root: /app
Dataset path: /app/data
Image directory: /app/data/images
Metadata path: /app/data/metadata/images.json

=== Loading Data ===
Loading CSV from: /root/.cache/kagglehub/datasets/vishalsubbiah/pokemon-images-and-types/versions/4/pokemon.csv
Loading images from: /root/.cache/kagglehub/datasets/vishalsubbiah/pokemon-images-and-types/versions/4/images


In [3]:
# Load and process the CSV data
df = pd.read_csv(KAGGLE_CSV)
print(f"\nLoaded {len(df)} Pokemon from CSV")

# Clean the data and create a JSON structure
pokemon_data = {}
for _, row in df.iterrows():
    pokemon_data[row["Name"].lower()] = {
        "type1": row["Type1"],
        "type2": row["Type2"] if pd.notna(row["Type2"]) else None,
        "evolution": row["Evolution"] if pd.notna(row["Evolution"]) else None,
    }

# Save metadata as JSON
os.makedirs(os.path.dirname(METADATA_PATH), exist_ok=True)
with open(METADATA_PATH, "w") as f:
    json.dump(pokemon_data, f, indent=2)
print(f"Saved metadata to: {METADATA_PATH}")

# Copy images to our data directory
os.makedirs(IMAGE_DIR, exist_ok=True)
image_count = 0
for img_file in os.listdir(KAGGLE_IMAGES):
    if img_file.lower().endswith((".png", ".jpg", ".jpeg")):
        src = os.path.join(KAGGLE_IMAGES, img_file)
        dst = os.path.join(IMAGE_DIR, img_file.lower())
        shutil.copy2(src, dst)
        image_count += 1
print(f"Copied {image_count} images to: {IMAGE_DIR}")


Loaded 809 Pokemon from CSV
Saved metadata to: /app/data/metadata/images.json
Copied 809 images to: /app/data/images


In [4]:
import json
import re
import requests

# Load your existing Pokemon data (assuming it's saved as JSON already)
with open("/app/data/metadata/images.json", "r") as f:
    existing_pokemon_data = json.load(f)

# Fetch the new data with additional attributes from GitHub
pokedex_url = (
    "https://raw.githubusercontent.com/Purukitto/pokemon-data.json/master/pokedex.json"
)
response = requests.get(pokedex_url)
if response.status_code == 200:
    new_pokemon_data = response.json()
else:
    raise Exception(f"Failed to fetch data: HTTP {response.status_code}")

# Create a mapping from lowercase English names to the new data
new_pokemon_map = {}
for pokemon in new_pokemon_data:
    name_lower = pokemon["name"]["english"].lower()
    new_pokemon_map[name_lower] = pokemon


# Function to convert height and weight strings to numeric values
def extract_number(value_str):
    if not value_str:
        return None
    match = re.search(r"(\d+\.?\d*)", value_str)
    if match:
        return float(match.group(1))
    return None


# Update the existing data with the new attributes
for name, data in existing_pokemon_data.items():
    if name in new_pokemon_map:
        new_data = new_pokemon_map[name]

        # Add base stats
        if "base" in new_data:
            data["HP"] = new_data["base"]["HP"]
            data["Attack"] = new_data["base"]["Attack"]
            data["Defense"] = new_data["base"]["Defense"]
            data["Sp_Attack"] = new_data["base"]["Sp. Attack"]
            data["Sp_Defense"] = new_data["base"]["Sp. Defense"]
            data["Speed"] = new_data["base"]["Speed"]

        # Add height and weight as numeric values
        if "profile" in new_data:
            if "height" in new_data["profile"]:
                data["height"] = extract_number(new_data["profile"]["height"])

            if "weight" in new_data["profile"]:
                data["weight"] = extract_number(new_data["profile"]["weight"])

# Save the updated data
with open(METADATA_PATH, "w") as f:
    json.dump(existing_pokemon_data, f, indent=2)

print("Updated Pokemon data saved to 'updated_pokemon_metadata.json'")

# Print a sample of the updated data structure
sample_pokemon = list(existing_pokemon_data.keys())[0]
print(f"\nSample updated data for {sample_pokemon}:")
print(json.dumps(existing_pokemon_data[sample_pokemon], indent=2))

Updated Pokemon data saved to 'updated_pokemon_metadata.json'

Sample updated data for bulbasaur:
{
  "type1": "Grass",
  "type2": "Poison",
  "evolution": "ivysaur",
  "HP": 45,
  "Attack": 49,
  "Defense": 49,
  "Sp_Attack": 65,
  "Sp_Defense": 65,
  "Speed": 45,
  "height": 0.7,
  "weight": 6.9
}


In [5]:
print("\n=== Dataset Validation ===")

# Check images
image_files = [
    f.lower()
    for f in os.listdir(IMAGE_DIR)
    if f.lower().endswith((".png", ".jpg", ".jpeg"))
]
print(f"\nFound {len(image_files)} images")

# Check metadata
with open(METADATA_PATH) as f:
    metadata = json.load(f)
print(f"Found {len(metadata)} pokemon in metadata")

# Cross-reference
images_without_metadata = []
metadata_without_images = []

# Check for images without metadata
for img_file in image_files:
    pokemon_name = os.path.splitext(img_file)[0].lower()
    if pokemon_name not in metadata:
        images_without_metadata.append(img_file)

# Check for metadata without images
for pokemon_name in metadata:
    if not any(pokemon_name in img_file for img_file in image_files):
        metadata_without_images.append(pokemon_name)


=== Dataset Validation ===

Found 809 images
Found 809 pokemon in metadata


In [6]:
# Image statistics
print("\n=== Image Statistics ===")
sizes = []
formats = set()
corrupt_images = []

for img_file in image_files:
    img_path = os.path.join(IMAGE_DIR, img_file)
    try:
        with Image.open(img_path) as img:
            sizes.append(img.size)
            formats.add(img.format)
    except Exception as e:
        corrupt_images.append((img_file, str(e)))

unique_sizes = set(sizes)
print(f"\nFound {len(unique_sizes)} different image sizes:")
for size in list(unique_sizes)[:5]:
    count = sizes.count(size)
    print(f"- {size}: {count} images")
if len(unique_sizes) > 5:
    print("...")

print(f"\nImage formats: {', '.join(formats)}")

if corrupt_images:
    print("\n=== Corrupt Images ===")
    print(f"Found {len(corrupt_images)} corrupt images:")
    for img, error in corrupt_images[:5]:
        print(f"- {img}: {error}")
    if len(corrupt_images) > 5:
        print("...")

# Type statistics
print("\n=== Type Statistics ===")
primary_types = df["Type1"].value_counts()
secondary_types = df["Type2"].value_counts()

print("\nPrimary Types:")
for type_name, count in primary_types.items():
    print(f"- {type_name}: {count}")

print("\nSecondary Types:")
for type_name, count in secondary_types.items():
    if pd.notna(type_name):
        print(f"- {type_name}: {count}")


=== Image Statistics ===

Found 1 different image sizes:
- (120, 120): 809 images

Image formats: PNG

=== Type Statistics ===

Primary Types:
- Water: 114
- Normal: 105
- Grass: 78
- Bug: 72
- Fire: 53
- Psychic: 53
- Rock: 46
- Electric: 40
- Poison: 34
- Ground: 32
- Dark: 29
- Fighting: 29
- Ghost: 27
- Dragon: 27
- Steel: 26
- Ice: 23
- Fairy: 18
- Flying: 3

Secondary Types:
- Flying: 95
- Poison: 32
- Ground: 32
- Fairy: 29
- Psychic: 29
- Fighting: 25
- Steel: 23
- Grass: 19
- Dragon: 18
- Water: 17
- Dark: 17
- Ghost: 16
- Rock: 14
- Fire: 11
- Ice: 11
- Electric: 8
- Bug: 5
- Normal: 4
