<a href="https://colab.research.google.com/github/ali-vayani/PokeGen/blob/main/Datascraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
pip install openai

Collecting openai
  Downloading openai-1.59.6-py3-none-any.whl.metadata (27 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.10.5-py3-none-any.whl.metadata (30 kB)
Collecting tqdm>4 (from openai)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting typing-extensions<5,>=4.11 (from openai)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3,>=1.9.0->openai)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.27.2 (from pydantic<3,>=1.9.0->openai)
  Downloading pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading openai-1.59.6-py3-none-any.whl (454 kB)
Downloading distro-1.9.0-p

In [1]:
import requests
from bs4 import BeautifulSoup
import os
from PIL import Image, ImageOps
import csv
import base64
import re

# Getting all the pokemon links

In [2]:
# Fetch the full Pokédex table. A timeout keeps the cell from hanging forever and
# raise_for_status() stops us from silently parsing an HTTP error page into an empty list.
response = requests.get('https://pokemondb.net/pokedex/all', timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
rows = soup.select('table#pokedex tbody tr')

# Take the first link of every table row. The same page can be linked from more
# than one row, so collect into a set before sorting into a stable order.
pokemon_links = sorted({
    'https://pokemondb.net' + link_tag['href']
    for link_tag in (row.find('a', href=True) for row in rows)
    if link_tag
})
print(len(pokemon_links))


1025


# Scraping text & images + adding everything to CSV

In [3]:
# Fallback artwork URLs for Pokémon whose default artwork URL cannot be used:
# some pokemon have multiple forms or have different names in url.
#
# Key:   the display name exactly as process_pokemon() receives it (it is looked
#        up with `name in special_cases`).
# Value: list of artwork URLs that process_pokemon() tries in order after the
#        default URL fails; it stops at the first one that downloads.
#
# The file name at the end of each URL is reused as the saved image's file name,
# so keep every entry under the same 'https://img.pokemondb.net/artwork/large/' prefix.
special_cases = {
    "Eiscue": [
        'https://img.pokemondb.net/artwork/large/eiscue-noice.jpg',
        'https://img.pokemondb.net/artwork/large/eiscue-ice.jpg'
    ],
    "Enamorus": [
        'https://img.pokemondb.net/artwork/large/enamorus-therian.jpg',
        'https://img.pokemondb.net/artwork/large/enamorus-incarnate.jpg'
    ],
    "Flabébé": ['https://img.pokemondb.net/artwork/large/flabebe.jpg'],
    "Giratina": [
        'https://img.pokemondb.net/artwork/large/giratina-origin.jpg',
        'https://img.pokemondb.net/artwork/large/giratina-altered.jpg'
    ],
    "Lycanroc": [
        'https://img.pokemondb.net/artwork/large/lycanroc-midnight.jpg',
        'https://img.pokemondb.net/artwork/large/lycanroc-midday.jpg'
    ],
    "Morpeko": [
        'https://img.pokemondb.net/artwork/large/morpeko-full-belly.jpg',
        'https://img.pokemondb.net/artwork/large/morpeko-hangry.jpg'
    ],
    "Nidoran♀ (female)": ['https://img.pokemondb.net/artwork/large/nidoran-f.jpg'],
    "Nidoran♂ (male)": ['https://img.pokemondb.net/artwork/large/nidoran-m.jpg'],
    "Oricorio": [
        'https://img.pokemondb.net/artwork/large/oricorio-baile.jpg',
        'https://img.pokemondb.net/artwork/large/oricorio-pom-pom.jpg',
        'https://img.pokemondb.net/artwork/large/oricorio-pau.jpg',
        'https://img.pokemondb.net/artwork/large/oricorio-sensu.jpg'
    ],
    "Shaymin": [
        'https://img.pokemondb.net/artwork/large/shaymin-land.jpg',
        'https://img.pokemondb.net/artwork/large/shaymin-sky.jpg'
    ],
    # NOTE(review): only one Urshifu artwork is listed here - confirm whether
    # other forms were left out on purpose.
    "Urshifu": ['https://img.pokemondb.net/artwork/large/urshifu-single-strike.jpg'],
    "Wishiwashi": [
        'https://img.pokemondb.net/artwork/large/wishiwashi-solo.jpg',
        'https://img.pokemondb.net/artwork/large/wishiwashi-school.jpg'
    ]
}

In [4]:
def clean_name(name):
    """Turn a Pokémon display name into a lowercase ASCII slug.

    The slug is what the scraping loop interpolates into the artwork URL.

    Steps: drop the gender symbols, transliterate accented letters to their
    base letter, turn spaces into hyphens, strip every remaining character
    that is not an ASCII letter, digit or hyphen, and lowercase the result.

    Args:
        name: Display name, e.g. ``"Mr. Mime"`` or ``"Flabébé"``.

    Returns:
        The slug, e.g. ``"mr-mime"`` or ``"flabebe"``.
    """
    import unicodedata  # local import so this cell does not depend on a new top-level import

    name = name.replace("♀", "").replace("♂", "")
    # Decompose accented letters (é -> e + combining accent). Without this the
    # ASCII filter below deletes the whole letter and "Flabébé" becomes "flabb".
    name = unicodedata.normalize("NFKD", name)
    name = name.replace(" ", "-")
    name = re.sub(r'[^a-zA-Z0-9-]', '', name)
    return name.lower()


# resize func
def resize_image_to_512(img_path, output_path):
    """Fit an image into a white 512x512 square and save the result.

    The image read from ``img_path`` is shrunk with ``Image.thumbnail`` so it
    fits inside 512x512, pasted centred onto a white RGB canvas, and the
    canvas is written to ``output_path``.

    Args:
        img_path: Path of the image file to read.
        output_path: Path the 512x512 canvas is saved to.
    """
    side = 512
    with Image.open(img_path) as source:
        source.thumbnail((side, side))

        # Offsets that centre the (possibly smaller) thumbnail on the canvas.
        left = (side - source.width) // 2
        top = (side - source.height) // 2

        background = Image.new('RGB', (side, side), (255, 255, 255))
        background.paste(source, (left, top))
        background.save(output_path)

In [5]:
def _download_resize_and_label(img_url, temp_img_path, final_img_path):
    """Download one image, save a 512x512 copy and return its generated label.

    Args:
        img_url: URL of the artwork to download.
        temp_img_path: Where the raw download is written before resizing.
        final_img_path: Where the resized image is saved.

    Returns:
        The description returned by ``image_label(img_url)``.

    Raises:
        requests.exceptions.RequestException: If the download fails, times out
            or returns an HTTP error status.
    """
    response = requests.get(img_url, timeout=30)
    response.raise_for_status()

    with open(temp_img_path, 'wb') as img_file:
        img_file.write(response.content)
    try:
        resize_image_to_512(temp_img_path, final_img_path)
    finally:
        # Never leave the raw download behind, even when resizing raises.
        os.remove(temp_img_path)

    return image_label(img_url)


def process_pokemon(name, img_url, type1, type2, csvwriter):
    """Download, resize and label one Pokémon's artwork, then append a CSV row.

    ``img_url`` is tried first. If that request fails, the URLs listed under
    ``name`` in ``special_cases`` are tried in order. Only the first URL that
    downloads successfully is used, so a Pokémon yields at most one row. If
    nothing can be downloaded a failure message is printed and no row is written.

    Args:
        name: Display name; written to the CSV and used for the default file name.
        img_url: Default artwork URL to try first.
        type1: Primary type (or None); written to the CSV as-is.
        type2: Secondary type (or None); written to the CSV as-is.
        csvwriter: ``csv.writer``-like object that receives the row
            ``[name, image path, type1, type2, description]``.
    """
    # (url, temp path, final path) in the order they should be attempted.
    candidates = [(
        img_url,
        f'pokemon_images/{name}_original.jpg',
        f'pokemon_images/{name}.jpg',
    )]
    for special_url in special_cases.get(name, ()):
        # Use the URL's own file name (e.g. "eiscue-noice.jpg") instead of a
        # hard-coded character offset into the URL.
        file_name = special_url.rsplit('/', 1)[-1]
        candidates.append((
            special_url,
            f'pokemon_images/{file_name}_special_original.jpg',
            f'pokemon_images/{file_name}',
        ))

    for candidate_url, temp_img_path, final_img_path in candidates:
        try:
            description = _download_resize_and_label(
                candidate_url, temp_img_path, final_img_path
            )
        except requests.exceptions.RequestException:
            # This URL is unusable; fall through to the next candidate.
            continue

        # Write to CSV
        csvwriter.writerow([name, final_img_path, type1, type2, description])
        return

    print(f"Failed to process {name}. No valid image found.")

In [39]:
import csv

# process_pokemon() calls image_label(), which is defined in a cell further down
# the notebook. Fail fast here instead of truncating the CSV and crashing on the
# first image when the notebook is run top to bottom.
if 'image_label' not in globals():
    raise NameError("image_label is not defined - run the cell that defines it first")

os.makedirs('pokemon_images', exist_ok=True)

# Index into pokemon_links to start from. 0 scrapes every Pokémon; a larger
# value resumes an interrupted run and appends to the existing CSV instead of
# overwriting the rows that were already collected.
START_INDEX = 0

# CSV file setup
csv_filename = 'pokemon_data.csv'
csv_headers = ['Name', 'Image Path', 'Type 1', 'Type 2', 'Description']
resuming = START_INDEX > 0 and os.path.exists(csv_filename)
with open(csv_filename, mode='a' if resuming else 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    if not resuming:
        csvwriter.writerow(csv_headers)

    # gets info + downloads image + writes to CSV for all pokemon
    for url in pokemon_links[START_INDEX:]:
        try:
            page = requests.get(url, timeout=30)
            page.raise_for_status()
        except requests.exceptions.RequestException as exc:
            # One unreachable page should not abort the whole scrape.
            print(f"Skipping {url}: {exc}")
            continue
        page_soup = BeautifulSoup(page.text, 'html.parser')

        # 1. name
        heading = page_soup.select_one('h1')
        if heading is None:
            print(f"Skipping {url}: no <h1> heading found.")
            continue
        name = heading.text.strip()

        # 2. img urls
        img_url = f"https://img.pokemondb.net/artwork/{clean_name(name)}.jpg"

        # 3. types: read from the links in the page's second table row.
        # Fall back to an empty list so a missing row gives None types
        # instead of a TypeError from len(None).
        table_rows = page_soup.select('tr')
        type_links = table_rows[1].select('a') if len(table_rows) > 1 else []
        type1 = type_links[0].text.strip() if len(type_links) > 0 else None
        type2 = type_links[1].text.strip() if len(type_links) > 1 else None

        process_pokemon(name, img_url, type1, type2, csvwriter)

In [29]:
from openai import OpenAI
# Read the API key from the OPENAI_API_KEY environment variable so the secret
# is never stored in (or committed with) the notebook.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)
def image_label(image_url):
    """Generate a diffusion-model style text label for the image at a URL.

    Sends a single user message (an instruction plus the image URL) to the
    ``gpt-4o-mini`` chat model through the module-level ``client``, capped at
    200 completion tokens.

    Args:
        image_url: URL of the image to describe.

    Returns:
        The message content of the first completion choice.
    """
    instruction_part = {
        "type": "text",
        "text": "you're an assistant that creates a label for an image. this label should be written in the form of a text entry for a diffusion model.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": image_url,
        },
    }
    user_turn = {"role": "user", "content": [instruction_part, image_part]}

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_turn],
        max_tokens=200,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content