# Data Loading

In [15]:
import json
import os
import shutil
from collections import Counter
from pathlib import Path

import pandas as pd
from PIL import Image


# Project locations, all derived from the directory the kernel was started in.
# The repository root is taken to be the parent of that working directory.
NOTEBOOK_DIR = Path.cwd()
REPO_ROOT = NOTEBOOK_DIR.parent
DATASET_PATH = REPO_ROOT.joinpath("data")
IMAGE_DIR = DATASET_PATH.joinpath("images")
METADATA_PATH = DATASET_PATH.joinpath("metadata", "pokemon_types.json")

# Ensure the image folder and the folder that will hold the metadata file exist.
for required_dir in (IMAGE_DIR, METADATA_PATH.parent):
    required_dir.mkdir(parents=True, exist_ok=True)

In [16]:
# Requires the kagglehub package (install with: %pip install kagglehub)
import kagglehub

# Download the dataset; dataset_download returns the local directory it was stored in.
kaggle_path = kagglehub.dataset_download("vishalsubbiah/pokemon-images-and-types")
print("Dataset downloaded to:", kaggle_path)

# Raw assets inside the downloaded dataset directory.
download_root = Path(kaggle_path)
KAGGLE_IMAGES = download_root.joinpath("images")
KAGGLE_CSV = download_root.joinpath("pokemon.csv")

# Echo the project-side locations so the run log records where data will land.
print("\n=== Paths ===")
for path_label, path_value in (
    ("Current working directory", NOTEBOOK_DIR),
    ("Repository root", REPO_ROOT),
    ("Dataset path", DATASET_PATH),
    ("Image directory", IMAGE_DIR),
    ("Metadata path", METADATA_PATH),
):
    print(f"{path_label}: {path_value}")

Dataset downloaded to: /root/.cache/kagglehub/datasets/vishalsubbiah/pokemon-images-and-types/versions/4

=== Paths ===
Current working directory: /app/notebooks
Repository root: /app
Dataset path: /app/data
Image directory: /app/data/images
Metadata path: /app/data/metadata/pokemon_types.json


In [17]:
# Source locations inside the downloaded Kaggle dataset directory.
# Built with pathlib so they stay Path objects, like the project paths
# (DATASET_PATH, IMAGE_DIR, METADATA_PATH), rather than becoming plain strings.
# pd.read_csv, os.listdir and os.path.join all accept Path objects, and the
# printed text is the same as for the equivalent string.
KAGGLE_IMAGES = Path(kaggle_path) / "images"
KAGGLE_CSV = Path(kaggle_path) / "pokemon.csv"

print("\n=== Loading Data ===")
print(f"Loading CSV from: {KAGGLE_CSV}")
print(f"Loading images from: {KAGGLE_IMAGES}")


=== Loading Data ===
Loading CSV from: /root/.cache/kagglehub/datasets/vishalsubbiah/pokemon-images-and-types/versions/4/pokemon.csv
Loading images from: /root/.cache/kagglehub/datasets/vishalsubbiah/pokemon-images-and-types/versions/4/images


In [18]:
# Read the Kaggle CSV into a DataFrame.
df = pd.read_csv(KAGGLE_CSV)
print(f"\nLoaded {len(df)} Pokemon from CSV")


def _none_if_missing(value):
    """Return ``value`` unchanged, or None when pandas treats it as missing."""
    return value if pd.notna(value) else None


# One metadata entry per CSV row, keyed by the lower-cased name.
# Missing secondary types / evolutions become None (null in the JSON file).
pokemon_data = {
    record['Name'].lower(): {
        'name': record['Name'],
        'type1': record['Type1'],
        'type2': _none_if_missing(record['Type2']),
        'evolution': _none_if_missing(record['Evolution']),
    }
    for _, record in df.iterrows()
}

# Write the metadata file, creating its folder first if necessary.
metadata_dir = os.path.dirname(METADATA_PATH)
os.makedirs(metadata_dir, exist_ok=True)
with open(METADATA_PATH, 'w') as metadata_file:
    json.dump(pokemon_data, metadata_file, indent=2)
print(f"Saved metadata to: {METADATA_PATH}")

# Mirror every image from the Kaggle download into the project image folder,
# storing each file under its lower-cased name.
os.makedirs(IMAGE_DIR, exist_ok=True)
source_images = [
    file_name
    for file_name in os.listdir(KAGGLE_IMAGES)
    if file_name.lower().endswith(('.png', '.jpg', '.jpeg'))
]
for file_name in source_images:
    shutil.copy2(
        os.path.join(KAGGLE_IMAGES, file_name),
        os.path.join(IMAGE_DIR, file_name.lower()),
    )
image_count = len(source_images)
print(f"Copied {image_count} images to: {IMAGE_DIR}")


Loaded 809 Pokemon from CSV
Saved metadata to: /app/data/metadata/pokemon_types.json
Copied 809 images to: /app/data/images


In [None]:
print("\n=== Dataset Validation ===")

# Collect the (lower-cased) image file names present in the project image folder.
image_files = [f.lower() for f in os.listdir(IMAGE_DIR)
               if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
print(f"\nFound {len(image_files)} images")

# Load the metadata dictionary (keyed by lower-cased pokemon name).
with open(METADATA_PATH) as f:
    metadata = json.load(f)
print(f"Found {len(metadata)} pokemon in metadata")

# Cross-reference images and metadata by exact name: an image belongs to a
# metadata entry only when its file name without extension equals the key.
# A substring test must not be used here, because short names are contained in
# longer ones (e.g. "mew" in "mewtwo.png") and would hide missing images.
image_stems = {os.path.splitext(img_file)[0] for img_file in image_files}

# Images whose stem has no metadata entry.
images_without_metadata = [
    img_file for img_file in image_files
    if os.path.splitext(img_file)[0] not in metadata
]

# Metadata entries that have no image with exactly that stem.
metadata_without_images = [
    pokemon_name for pokemon_name in metadata
    if pokemon_name not in image_stems
]

# Report both mismatch lists (first five entries each) so the result is visible.
for mismatch_label, mismatches in (
    ("Images without metadata", images_without_metadata),
    ("Metadata entries without images", metadata_without_images),
):
    print(f"\n{mismatch_label}: {len(mismatches)}")
    for entry in mismatches[:5]:
        print(f"- {entry}")
    if len(mismatches) > 5:
        print("...")

In [None]:

# Image statistics
print("\n=== Image Statistics ===")
sizes = []
formats = set()
corrupt_images = []

# Open every image once and record its pixel size and file format.
# NOTE(review): only img.size / img.format are read here; pixel data is not
# explicitly loaded, so this may not catch every kind of corruption - confirm
# whether a full decode is wanted.
for img_file in image_files:
    img_path = os.path.join(IMAGE_DIR, img_file)
    try:
        with Image.open(img_path) as img:
            sizes.append(img.size)
            formats.add(img.format)
    except Exception as e:
        # Record the failure and keep scanning the remaining files.
        corrupt_images.append((img_file, str(e)))

# Count each size in a single pass and list the most frequent ones first,
# so the five rows shown are the dominant sizes rather than an arbitrary five.
size_counts = Counter(sizes)
unique_sizes = set(size_counts)
print(f"\nFound {len(unique_sizes)} different image sizes:")
for size, count in size_counts.most_common(5):
    print(f"- {size}: {count} images")
if len(unique_sizes) > 5:
    print("...")

# Sort the formats for a stable order between runs; str() keeps the join safe
# should a format ever be reported as None.
print(f"\nImage formats: {', '.join(sorted(str(fmt) for fmt in formats))}")

if corrupt_images:
    print("\n=== Corrupt Images ===")
    print(f"Found {len(corrupt_images)} corrupt images:")
    for img, error in corrupt_images[:5]:
        print(f"- {img}: {error}")
    if len(corrupt_images) > 5:
        print("...")

# Type statistics
print("\n=== Type Statistics ===")
# value_counts() excludes missing values by default, so pokemon without a
# secondary type are simply absent from secondary_types.
primary_types = df['Type1'].value_counts()
secondary_types = df['Type2'].value_counts()

print("\nPrimary Types:")
for type_name, count in primary_types.items():
    print(f"- {type_name}: {count}")

print("\nSecondary Types:")
for type_name, count in secondary_types.items():
    print(f"- {type_name}: {count}")