<a href="https://colab.research.google.com/github/ali-vayani/PokeGen/blob/main/Datascraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [71]:
import requests
from bs4 import BeautifulSoup
import os
from PIL import Image, ImageOps
import csv
import base64
import re

# Getting all the pokemon links

In [51]:
# Fetch the pokedex/all listing page and collect one detail-page URL per
# table row.
response = requests.get('https://pokemondb.net/pokedex/all', timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page and
# ending up with an empty link list.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
rows = soup.select('table#pokedex tbody tr')

pokemon_links = []
for row in rows:
    link_tag = row.find('a', href=True)  # first anchor in the row with an href
    if link_tag:
        pokemon_links.append('https://pokemondb.net' + link_tag['href'])

# set() drops duplicate links; sorted() already returns a list, so no extra
# list() call is needed.
pokemon_links = sorted(set(pokemon_links))
print(len(pokemon_links))


1025


# Scraping text & images + adding everything to CSV

In [140]:
import csv

# NOTE: this cell calls clean_name() and process_pokemon(), which are defined
# in cells further down in this notebook; run those cells first.
os.makedirs('pokemon_images', exist_ok=True)

# CSV file setup
csv_filename = 'pokemon_data.csv'
csv_headers = ['Name', 'Image Path', 'Type 1', 'Type 2']
with open(csv_filename, mode='w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(csv_headers)

    # gets info + downloads image + writes to CSV for all pokemon
    for url in pokemon_links:
        # A single unreachable page should not abort the whole scrape:
        # report it and move on to the next link.
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as err:
            print(f"Failed to fetch {url}: {err}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')

        # 1. name (text of the first <h1> on the page)
        heading = soup.select_one('h1')
        if heading is None:
            print(f"Failed to process {url}. No name heading found.")
            continue
        name = heading.text.strip()

        # 2. img url (default artwork location derived from the name)
        img_url = f"https://img.pokemondb.net/artwork/{clean_name(name)}.jpg"

        # 3. types: read from the links inside the second <tr> of the page.
        rows = soup.select('tr')
        second_row = rows[1] if len(rows) > 1 else None
        # Fall back to an empty list (not None) so the len() checks below
        # cannot raise TypeError when the page has fewer than two rows.
        type_links = second_row.select('a') if second_row is not None else []
        type1 = type_links[0].text.strip() if len(type_links) > 0 else None
        type2 = type_links[1].text.strip() if len(type_links) > 1 else None

        process_pokemon(name, img_url, type1, type2, csvwriter)

Processed Abomasnow with default URL
Processed Abra with default URL
Processed Absol with default URL
Processed Accelgor with default URL
Processed Aegislash with default URL
Processed Aerodactyl with default URL
Processed Aggron with default URL
Processed Aipom with default URL
Processed Alakazam with default URL
Processed Alcremie with default URL
Processed Alomomola with default URL
Processed Altaria with default URL
Processed Amaura with default URL
Processed Ambipom with default URL
Processed Amoonguss with default URL
Processed Ampharos with default URL
Processed Annihilape with default URL
Processed Anorith with default URL
Processed Appletun with default URL
Processed Applin with default URL
Processed Araquanid with default URL
Processed Arbok with default URL
Processed Arboliva with default URL
Processed Arcanine with default URL
Processed Arceus with default URL
Processed Archaludon with default URL
Processed Archen with default URL
Processed Archeops with default URL
Process

In [136]:
# Alternative artwork URLs, keyed by Pokémon name, for Pokémon that have
# multiple forms or whose artwork file name differs from their name.
# Every URL has the form https://img.pokemondb.net/artwork/large/<slug>.jpg,
# so only the slugs are spelled out and the full URLs are built from them.
special_cases = {
    pokemon: [
        f'https://img.pokemondb.net/artwork/large/{slug}.jpg'
        for slug in slugs
    ]
    for pokemon, slugs in {
        "Eiscue": ('eiscue-noice', 'eiscue-ice'),
        "Enamorus": ('enamorus-therian', 'enamorus-incarnate'),
        "Flabébé": ('flabebe',),
        "Giratina": ('giratina-origin', 'giratina-altered'),
        "Lycanroc": ('lycanroc-midnight', 'lycanroc-midday'),
        "Morpeko": ('morpeko-full-belly', 'morpeko-hangry'),
        "Nidoran♀ (female)": ('nidoran-f',),
        "Nidoran♂ (male)": ('nidoran-m',),
        "Oricorio": (
            'oricorio-baile',
            'oricorio-pom-pom',
            'oricorio-pau',
            'oricorio-sensu',
        ),
        "Shaymin": ('shaymin-land', 'shaymin-sky'),
        "Urshifu": ('urshifu-single-strike',),
        "Wishiwashi": ('wishiwashi-solo', 'wishiwashi-school'),
    }.items()
}

In [119]:
def clean_name(name):
    """Turn a Pokémon display name into a lowercase URL slug.

    Accented letters are reduced to their base letter, spaces become
    hyphens, and every remaining character that is not an ASCII letter,
    digit or hyphen is removed.

    Args:
        name: Display name, e.g. "Mr. Mime" or "Flabébé".

    Returns:
        The slug, e.g. "mr-mime" or "flabebe".
    """
    import unicodedata  # local import keeps this cell self-contained

    # Decompose accented letters ("é" -> "e" + combining accent) so the base
    # letter survives the ASCII filter below instead of being dropped
    # (previously "Flabébé" became "flabb").
    name = unicodedata.normalize('NFKD', name)
    name = name.replace(" ", "-")
    # Also strips the ♀/♂ symbols, punctuation and the combining accents.
    name = re.sub(r'[^a-zA-Z0-9-]', '', name)
    return name.lower()


# resize func
def resize_image_to_512(img_path, output_path):
    """Write a 512x512 copy of an image, centred on a white background.

    The image at ``img_path`` is scaled with ``Image.thumbnail`` to fit
    inside a 512x512 box, pasted into the middle of a white RGB square and
    saved to ``output_path``.

    Args:
        img_path: Path of the image file to read.
        output_path: Path the padded 512x512 image is written to.
    """
    side = 512
    with Image.open(img_path) as source:
        # thumbnail() resizes the opened image in place.
        source.thumbnail((side, side))
        background = Image.new('RGB', (side, side), (255, 255, 255))
        # Split the leftover space evenly so the picture sits in the centre.
        offset = ((side - source.width) // 2, (side - source.height) // 2)
        background.paste(source, offset)
        background.save(output_path)

In [138]:
def _download_and_resize(img_url, temp_img_path, final_img_path):
    """Download ``img_url`` and save a 512x512 version at ``final_img_path``.

    The raw download is written to ``temp_img_path`` and removed afterwards.

    Raises:
        requests.exceptions.RequestException: If the download fails, times
            out, or returns an HTTP error status.
    """
    response = requests.get(img_url, timeout=30)
    response.raise_for_status()

    try:
        with open(temp_img_path, 'wb') as img_file:
            img_file.write(response.content)
        resize_image_to_512(temp_img_path, final_img_path)
    finally:
        # Remove the raw download even when resizing raised, so no
        # "*_original.jpg" leftovers stay in the image directory.
        if os.path.exists(temp_img_path):
            os.remove(temp_img_path)


def process_pokemon(name, img_url, type1, type2, csvwriter):
    """Download one Pokémon's artwork and append its row to the CSV.

    ``img_url`` is tried first. If that request fails and ``name`` has an
    entry in ``special_cases``, those URLs are tried in order. The first URL
    that downloads successfully is resized into ``pokemon_images/`` and a
    single row ``[name, image path, type1, type2]`` is written. If no URL
    works, a failure message is printed and no row is written.

    Args:
        name: Pokémon name; used for the CSV row and the default file name.
        img_url: Default artwork URL to try first.
        type1: First type, or None.
        type2: Second type, or None.
        csvwriter: ``csv.writer`` the row is written to.
    """
    # (url, temporary path, final path) for every candidate, in the order
    # they should be tried.
    attempts = [(
        img_url,
        f'pokemon_images/{name}_original.jpg',
        f'pokemon_images/{name}.jpg',
    )]
    for special_url in special_cases.get(name, []):
        # File name is the last path segment of the URL; this replaces the
        # fixed-offset slice that only worked for one exact URL prefix.
        file_name = special_url.rsplit('/', 1)[-1]
        attempts.append((
            special_url,
            f'pokemon_images/{file_name}_special_original.jpg',
            f'pokemon_images/{file_name}',
        ))

    for url, temp_img_path, final_img_path in attempts:
        try:
            _download_and_resize(url, temp_img_path, final_img_path)
        except requests.exceptions.RequestException:
            # This candidate is unavailable; fall through to the next one.
            continue
        # Write to CSV
        csvwriter.writerow([name, final_img_path, type1, type2])
        return

    print(f"Failed to process {name}. No valid image found.")