<a href="https://colab.research.google.com/github/ali-vayani/PokeGen/blob/main/Datascraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
from bs4 import BeautifulSoup
import os
from PIL import Image, ImageOps
import csv
import base64
import re
import pandas as pd

# Getting all the Pokémon links

In [None]:
# Download the full Pokédex table and collect the detail-page link of each row.
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# us from silently parsing an HTTP error page.
response = requests.get('https://pokemondb.net/pokedex/all', timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
rows = soup.select('table#pokedex tbody tr')
pokemon_links = []
for row in rows:
  link_tag = row.find('a', href=True) # first link in the row
  if link_tag:
    pokemon_links.append('https://pokemondb.net' + link_tag['href'])

# set() drops repeated links; sorted() already returns a list in a stable order.
pokemon_links = sorted(set(pokemon_links))
print(len(pokemon_links))


1025


# Scraping text & images + adding everything to CSV

In [None]:
# Fallback artwork URLs, keyed by the exact page name, for Pokémon that have
# multiple forms or whose URL name differs from their display name.
# process_pokemon() tries these, in order, when the default URL fails.
_ARTWORK_LARGE_URL = 'https://img.pokemondb.net/artwork/large/{}.jpg'

_special_case_slugs = {
    "Eiscue": ['eiscue-noice', 'eiscue-ice'],
    "Enamorus": ['enamorus-therian', 'enamorus-incarnate'],
    "Flabébé": ['flabebe'],
    "Giratina": ['giratina-origin', 'giratina-altered'],
    "Lycanroc": ['lycanroc-midnight', 'lycanroc-midday'],
    "Morpeko": ['morpeko-full-belly', 'morpeko-hangry'],
    "Nidoran♀ (female)": ['nidoran-f'],
    "Nidoran♂ (male)": ['nidoran-m'],
    "Oricorio": ['oricorio-baile', 'oricorio-pom-pom', 'oricorio-pau', 'oricorio-sensu'],
    "Shaymin": ['shaymin-land', 'shaymin-sky'],
    "Urshifu": ['urshifu-single-strike'],
    "Wishiwashi": ['wishiwashi-solo', 'wishiwashi-school'],
}

# Expand every slug into its full artwork URL, keeping key and list order.
special_cases = {
    pokemon: [_ARTWORK_LARGE_URL.format(slug) for slug in slugs]
    for pokemon, slugs in _special_case_slugs.items()
}

In [None]:
def clean_name(name):
    """Turn a Pokémon display name into the lowercase slug used in artwork URLs.

    Gender symbols are dropped, accented letters are reduced to their ASCII
    base letter (so "Flabébé" becomes "flabebe" instead of losing the letters
    entirely), spaces become hyphens, and every remaining character outside
    ``[a-zA-Z0-9-]`` is removed.

    Args:
        name: Display name as scraped from the page heading.

    Returns:
        The lowercase, ASCII-only slug.
    """
    import unicodedata  # local import: only this helper needs it

    name = name.replace("♀", "").replace("♂", "")
    # Split accented characters into base letter + combining mark; the regex
    # below then removes the mark and keeps the base letter.
    name = unicodedata.normalize("NFKD", name)
    name = name.replace(" ", "-")
    name = re.sub(r'[^a-zA-Z0-9-]', '', name)
    return name.lower()


def resize_image_to_512(img_path, output_path, size=512):
    """Fit an image inside a white square canvas and save the result.

    The image is shrunk (never enlarged) to fit within ``size`` x ``size``
    while keeping its aspect ratio, then centred on a white RGB canvas.

    Args:
        img_path: Path of the image to read.
        output_path: Path the padded image is written to.
        size: Edge length of the square canvas in pixels (default 512).
    """
    with Image.open(img_path) as img:
        img.thumbnail((size, size))
        canvas = Image.new('RGB', (size, size), (255, 255, 255))
        paste_x = (size - img.width) // 2
        paste_y = (size - img.height) // 2
        if img.mode in ('RGBA', 'LA') or 'transparency' in img.info:
            # Use the alpha band as the paste mask so transparent pixels show
            # the white canvas instead of their hidden colour values.
            rgba = img.convert('RGBA')
            canvas.paste(rgba, (paste_x, paste_y), mask=rgba)
        else:
            canvas.paste(img, (paste_x, paste_y))
        canvas.save(output_path)

In [None]:
def _download_and_record(name, url, temp_img_path, final_img_path, type1, type2, csvwriter):
    """Download one artwork image, save a 512x512 copy, label it and write its CSV row.

    Raises:
        requests.exceptions.RequestException: if the download fails, times
            out, or returns an HTTP error status.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    with open(temp_img_path, 'wb') as img_file:
        img_file.write(response.content)
    try:
        resize_image_to_512(temp_img_path, final_img_path)
    finally:
        # Never leave the raw download behind, even when resizing fails.
        os.remove(temp_img_path)
    description = image_label(url)

    # Write to CSV
    csvwriter.writerow([name, final_img_path, type1, type2, description])


def process_pokemon(name, img_url, type1, type2, csvwriter):
    """Download a Pokémon's artwork and append its row to the CSV.

    The default ``img_url`` is tried first. If that request fails and ``name``
    has an entry in ``special_cases``, those URLs are tried in order and the
    first one that downloads successfully is used. When nothing works a
    failure message is printed and no row is written.

    Args:
        name: Display name; used for the CSV row and the default file name.
        img_url: Default artwork URL to try first.
        type1: Primary type, or None.
        type2: Secondary type, or None.
        csvwriter: ``csv.writer`` the row is appended to.
    """
    try:
        _download_and_record(
            name, img_url,
            f'pokemon_images/{name}_original.jpg',
            f'pokemon_images/{name}.jpg',
            type1, type2, csvwriter,
        )
        return
    except requests.exceptions.RequestException:
        # Default URL failed; fall through to the special-case URLs below.
        pass

    for special_url in special_cases.get(name, []):
        # Last path component of the URL, e.g. "eiscue-noice.jpg".
        file_name = special_url.rsplit('/', 1)[-1]
        try:
            _download_and_record(
                name, special_url,
                f'pokemon_images/{file_name}_special_original.jpg',
                f'pokemon_images/{file_name}',
                type1, type2, csvwriter,
            )
            return
        except requests.exceptions.RequestException:
            continue
    print(f"Failed to process {name}. No valid image found.")

In [None]:
# NOTE: process_pokemon() calls image_label(), which this notebook defines in a
# later cell. Run that cell first, otherwise this loop fails with a NameError.
os.makedirs('pokemon_images', exist_ok=True)

# CSV file setup
csv_filename = 'pokemon_data.csv'
csv_headers = ['Name', 'Image Path', 'Type 1', 'Type 2', 'Description']
with open(csv_filename, mode='w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(csv_headers)

    # gets info + downloads image + writes to CSV for all pokemon
    for url in pokemon_links:
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as exc:
            # One unreachable page should not abort the whole scrape.
            print(f"Skipping {url}: {exc}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')

        # 1. name
        heading = soup.select_one('h1')
        if heading is None:
            print(f"Skipping {url}: no <h1> name found.")
            continue
        name = heading.text.strip()

        # 2. img urls
        img_url = f"https://img.pokemondb.net/artwork/{clean_name(name)}.jpg"

        # 3. types: read from the links inside the second table row on the page
        rows = soup.select('tr')
        second_row = rows[1] if len(rows) > 1 else None
        # Fall back to an empty list so the len() checks below cannot hit None.
        type_links = second_row.select('a') if second_row is not None else []
        type1 = type_links[0].text.strip() if len(type_links) > 0 else None
        type2 = type_links[1].text.strip() if len(type_links) > 1 else None

        process_pokemon(name, img_url, type1, type2, csvwriter)

In [None]:
from openai import OpenAI

# Read the key from the environment rather than embedding it in the notebook.
# Set OPENAI_API_KEY before running this cell; the client raises if it is missing.
client = OpenAI(
  api_key=os.environ.get("OPENAI_API_KEY")
)
def image_label(image_url):
  """Ask gpt-4o-mini for a diffusion-style text label describing an image.

  Args:
      image_url: URL of the image; it is passed to the API as an image_url part.

  Returns:
      The message content of the first completion choice (capped at 200 tokens).
  """
  response = client.chat.completions.create(
      model="gpt-4o-mini",
      messages=[
          {
              "role": "user",
              "content": [
                  {"type": "text", "text": "you're an assistant that creates a label for an image. this label should be written in the form of a text entry for a diffusion model."},
                  {
                      "type": "image_url",
                      "image_url": {
                          "url": image_url,
                      },
                  },
              ],
          }
      ],
      max_tokens=200,
  )
  return response.choices[0].message.content

# Pulling the scraped data back from GitHub

In [1]:
# Create an empty repository in the current directory and point it at PokeGen.
!git init
!git remote add origin https://github.com/ali-vayani/PokeGen.git
# Limit the checkout to the Data path before pulling the main branch.
!git sparse-checkout init --no-cone
!git sparse-checkout set Data
!git pull origin main

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/
remote: Enumerating objects: 1056, done.[K
remote: Counting objects: 100% (1056/1056), done.[K
remote: Compressing objects: 100% (1049/1049), done.[K
remote: Total 1056 (delta 12), reused 1035 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (1056/1056), 17.35 MiB | 29.42 MiB/s, done.
Resolving deltas: 100% (12/12), done.
From https://github.com/ali-vayani/PokeGen
 * branch            main       -> FETCH_HEAD
 * [new branch]      main       -> o

In [45]:
# Keep only the image path and its generated description, and save that
# trimmed table as poke_data.csv.
columns_to_keep = ['Image Path', 'Description']
df = pd.read_csv('Data/pokemon_data.csv')
new_df = df.loc[:, columns_to_keep].copy()

new_df.to_csv('poke_data.csv', index=False)

new_df.head()

Unnamed: 0,Image Path,Description
0,pokemon_images/Abomasnow.jpg,Frosty creature with white fur and green accen...
1,pokemon_images/Abra.jpg,"A yellow, psychic-type Pokémon with large, exp..."
2,pokemon_images/Absol.jpg,"A sleek, mythical creature resembling a canine..."
3,pokemon_images/Accelgor.jpg,"A colorful, cartoony octopus-like creature wit..."
4,pokemon_images/Aegislash.jpg,"Aegislash, a Pokémon that resembles a medieval..."


# Pushing to Hugging Face

In [6]:
# Explicit %pip magic (does not rely on automagic) installs into the kernel's
# environment; -q hides the long install log. datasets is pinned to the
# version shown in the recorded install output.
%pip install -q datasets==3.2.0 huggingface_hub

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [22]:
from google.colab import userdata
from huggingface_hub import login

# Log in to the Hugging Face Hub with the token stored under 'HF_TOKEN' in
# Colab's user data, without keeping the token in a notebook variable.
login(token=userdata.get('HF_TOKEN'))

In [48]:
# Use the column names that the dataset features ("image", "description") expect.
new_df = new_df.rename(columns={"Image Path": "image", "Description": "description"})
# Strip the leading directory so only the file name remains. The CSV stores
# paths like "pokemon_images/Abomasnow.jpg", which the old literal prefix
# "data/pokemon_images/" never matched; both spellings are accepted here.
new_df['image'] = new_df['image'].str.replace(r"^(?:[Dd]ata/)?pokemon_images/", "", regex=True)
new_df.head()

Unnamed: 0,image,description
0,Abomasnow.jpg,Frosty creature with white fur and green accen...
1,Abra.jpg,"A yellow, psychic-type Pokémon with large, exp..."
2,Absol.jpg,"A sleek, mythical creature resembling a canine..."
3,Accelgor.jpg,"A colorful, cartoony octopus-like creature wit..."
4,Aegislash.jpg,"Aegislash, a Pokémon that resembles a medieval..."


In [55]:
from datasets import Dataset, Features, Value
# Import the datasets image feature under an alias so it does not shadow PIL's
# `Image`, which resize_image_to_512() relies on if it is run again later.
from datasets import Image as HFImage

# "image" values are decoded as images; "description" stays a plain string.
features = Features({"image": HFImage(), "description": Value("string")})
dataset = Dataset.from_pandas(new_df, features=features)

# Push the dataset to Hugging Face
dataset.push_to_hub("pAstaWasTaken/poke-data")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1021 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/pAstaWasTaken/poke-data/commit/bf597dd0ecda7822768d9819d48f597b3abf13ff', commit_message='Upload dataset', commit_description='', oid='bf597dd0ecda7822768d9819d48f597b3abf13ff', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/pAstaWasTaken/poke-data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='pAstaWasTaken/poke-data'), pr_revision=None, pr_num=None)