# Download the JSON files

In [None]:
import requests
import json
import os
from concurrent.futures import ThreadPoolExecutor

# Create directory to save pokemons
os.makedirs("pokemons", exist_ok=True)

# Fetch the index of all pokemon. The very large `limit` is presumably meant
# to get the complete list back in a single page.
url = "https://pokeapi.co/api/v2/pokemon?limit=100000&offset=0"
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of trying to parse an error body.
response.raise_for_status()
all_pkmns = response.json()
# Each entry is a dict; the download step reads its 'name' and 'url' keys.
pokelist = all_pkmns['results']

def fetch_and_save_pokemon_data(pokemon):
    """Download one pokemon and its species record and save both as JSON.

    Two files are written into the ``pokemons`` directory:
    ``{id}-{name}.json`` (the pokemon payload) and
    ``{id}-{name}-species.json`` (the species payload).

    Args:
        pokemon: An index entry (dict) with ``'name'`` and ``'url'`` keys.

    Returns:
        None. This is a best-effort worker: any ``Exception`` raised while
        fetching or saving is caught and reported with ``print`` so that one
        failure does not abort the whole batch.
    """
    try:
        # Fetch the pokemon first: its payload names the species it belongs to.
        response_pokemon = requests.get(pokemon['url'], timeout=30)
        response_pokemon.raise_for_status()
        pokemon_data = response_pokemon.json()

        # Use the species URL from the payload rather than reusing the pokemon
        # id. The two ids differ for alternate forms, so building the URL from
        # the pokemon id requests a species that does not exist.
        url_species = pokemon_data['species']['url']
        response_species = requests.get(url_species, timeout=30)
        response_species.raise_for_status()
        species_data = response_species.json()

        pkmn_id = pokemon_data['id']

        # Write only after both downloads succeeded, so a failure never leaves
        # a pokemon file without its species companion.
        with open(f"pokemons/{pkmn_id}-{pokemon['name']}.json", "w", encoding="utf-8") as f:
            json.dump(pokemon_data, f)

        with open(f"pokemons/{pkmn_id}-{pokemon['name']}-species.json", "w", encoding="utf-8") as f:
            json.dump(species_data, f)

        print(f"Saved {pkmn_id}-{pokemon['name']}")
    except Exception as e:
        # Deliberately broad: this is the per-task boundary of the thread pool.
        print(f"Failed to fetch/save {pokemon['name']}: {e}")

# Fan the downloads out over a pool of 10 worker threads.
with ThreadPoolExecutor(max_workers=10) as pool:
    # Queue one download task per entry of the index.
    pending = []
    for entry in pokelist:
        pending.append(pool.submit(fetch_and_save_pokemon_data, entry))

    # Block until every task has finished. result() re-raises whatever a task
    # let escape; note that the worker itself already catches and prints
    # ordinary Exceptions, so those do not surface here.
    for task in pending:
        task.result()

print("All data fetched and saved!")

# Serialize all data from JSONs

In [None]:
from dataclasses import dataclass
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from typing import Optional, List
import json

# Constants
# Directory holding the downloaded per-Pokémon JSON files. The path is
# relative, so it is resolved against the current working directory.
POKEMON_DIR = Path("pokemons")


# Flat, one-row-per-Pokémon record built from the pokemon and species JSON.
@dataclass
class Pokemon:
    """Summary of a single Pokémon, combining pokemon and species data."""

    # Numeric ID (the leading number of the JSON file names).
    id: int
    # Name as stored in the pokemon JSON.
    name: str
    # First listed type.
    type1: str
    # Second listed type, or None when only one type is listed.
    type2: Optional[str]
    # Height and weight are copied from the pokemon JSON without conversion.
    height: int
    weight: int
    # Color name from the species JSON.
    color: str
    # Generation number parsed from the species' generation URL (0 if absent).
    gen: int
    # Legendary flag from the species JSON.
    is_legendary: bool

def serialize_pokemon(pokemon_id: int) -> Pokemon:
    """
    Build a Pokemon record from the JSON files stored for ``pokemon_id``.

    Two files are expected in the 'pokemons' directory, both named with the
    ``{id}-`` prefix:
      - the main pokemon data, whose name does not contain '-species'
        (e.g. '{id}-{name}.json')
      - the species data, whose name contains '-species'
        (e.g. '{id}-{name}-species.json')

    Raises FileNotFoundError when either of the two files is missing.
    """

    # Every JSON file whose name starts with "<id>-".
    matches = list(POKEMON_DIR.glob(f"{pokemon_id}-*.json"))
    print(f"Serializing files: {matches}")

    # Keep the first species file and the first non-species file encountered.
    pokemon_file = None
    species_file = None
    for candidate in matches:
        if "-species" in candidate.name:
            if species_file is None:
                species_file = candidate
        elif pokemon_file is None:
            pokemon_file = candidate

    if pokemon_file is None or species_file is None:
        raise FileNotFoundError(f"Required files for Pokemon id {pokemon_id} not found.")

    # Parse both documents.
    with pokemon_file.open("r") as pokemon_fh, species_file.open("r") as species_fh:
        pokemon_data = json.load(pokemon_fh)
        species_data = json.load(species_fh)

    # The generation number is the last path segment of the generation URL;
    # fall back to 0 when the species has no generation URL.
    generation_url = species_data.get('generation', {}).get('url', '')
    if generation_url:
        generation = int(generation_url.rstrip('/').rsplit('/', 1)[-1])
    else:
        generation = 0

    # A second type exists only when more than one type is listed.
    listed_types = pokemon_data['types']
    if len(listed_types) > 1:
        secondary_type = listed_types[1]['type']['name']
    else:
        secondary_type = None

    return Pokemon(
        id=pokemon_id,
        name=pokemon_data['name'],
        type1=pokemon_data['types'][0]['type']['name'],
        type2=secondary_type,
        height=pokemon_data['height'],
        weight=pokemon_data['weight'],
        color=species_data['color']['name'],
        gen=generation,
        is_legendary=species_data['is_legendary'],
    )

# Extract unique Pokémon IDs from all JSON file names. The ID is the part of
# the name before the first '-'; requiring that whole prefix to be decimal
# digits skips stray JSON files that int() could not parse.
pokemon_ids = {
    int(f.name.split("-")[0])
    for f in POKEMON_DIR.glob("*.json")
    if f.name.split("-")[0].isdecimal()
}

if not pokemon_ids:
    raise RuntimeError("No pokemon files found in the 'pokemons' directory.")


# Use ThreadPoolExecutor to serialize all Pokémon concurrently.
with ThreadPoolExecutor(max_workers=50) as executor:
    # Submit one task per ID that actually has files on disk. Walking
    # range(min, max + 1) would also request every missing number in between
    # (IDs are not guaranteed to be contiguous, and a failed download leaves a
    # gap), and the first such gap would abort the run with FileNotFoundError.
    futures = [executor.submit(serialize_pokemon, i)
               for i in sorted(pokemon_ids)]

    # Collect results. The order matches the submission order, i.e. ascending
    # ID. result() re-raises any exception raised while serializing an entry.
    results: List[Pokemon] = [future.result() for future in futures]

# 'results' now contains the list of serialized Pokemon objects.
print(f"Serialized {len(results)} Pokémon.")

# Data to DataFrame

In [None]:
import pandas as pd

# Convert the list of Pokemon dataclass instances to a DataFrame.
df = pd.DataFrame(results)
# Print the first rows as a quick sanity check of the conversion.
print(df.head())

# CSV Export

In [25]:
# Write the table to pokemon_data.csv; index=False leaves the DataFrame's
# row index out of the file.
df.to_csv("pokemon_data.csv", index=False)