In [32]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

Getting the List of Pokémon from Serebii.net

In [2]:
def fetch_html(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed to
            ``requests.get``. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text

In [3]:
# National Pokédex listing page on Serebii.net. The HTML is fetched once and kept in
# `html_content` so the parsing cell can be re-run without issuing another request.
url = 'https://www.serebii.net/pokemon/nationalpokedex.shtml'
html_content = fetch_html(url)

In [29]:
def extract_pokemon_data(html_content):
    """Parse the Serebii National Pokédex page into a list of Pokémon records.

    Only rows of the ``<table class="dextable">`` element are considered. The
    first row is treated as the header. A row is skipped when it has fewer
    than the twelve cells read below, or when its first cell is not a
    ``#``-prefixed number.

    Args:
        html_content: HTML text of the Pokédex listing page.

    Returns:
        A list of dicts, one per Pokémon row, with the keys ``number``,
        ``name``, ``type1``, ``type2``, ``abilities``, ``HP``, ``Att``,
        ``Def``, ``S.Att``, ``S.Def`` and ``Spd``. ``type2`` is ``None`` for
        single-type rows; ``type1`` is ``None`` if no type image is recognised.

    Raises:
        ValueError: If the page contains no ``dextable`` table, or if a stat
            cell of an otherwise valid row is not an integer.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table', {'class': 'dextable'})
    if table is None:
        raise ValueError("No <table class='dextable'> found in html_content")

    # Type names are taken from the image file name, e.g. ".../type/grass.gif".
    type_pattern = re.compile(r'/type/(\w+)\.gif')
    stat_names = ('HP', 'Att', 'Def', 'S.Att', 'S.Def', 'Spd')

    pokemons = []
    for row in table.find_all('tr')[1:]:  # Skip the first header row
        cells = row.find_all('td')

        # Cell indices 0..11 are read below; shorter rows cannot be data rows.
        if len(cells) < 12:
            continue

        number_text = cells[0].text.strip().lstrip('#')  # Remove '#' symbol
        if not number_text.isdigit():
            continue

        # Ignore images whose src does not look like a type icon.
        type_matches = (
            type_pattern.search(img.get('src', ''))
            for img in cells[4].find_all('img')
        )
        types = [match.group(1) for match in type_matches if match]

        stats = [int(cell.text.strip()) for cell in cells[6:12]]

        record = {
            'number': int(number_text),
            'name': cells[3].text.strip(),
            'type1': types[0] if types else None,
            'type2': types[1] if len(types) > 1 else None,
            'abilities': cells[5].text.strip(),
        }
        record.update(zip(stat_names, stats))
        pokemons.append(record)

    return pokemons

In [36]:
# Parse the downloaded page into one dict per Pokémon, then load the records into a
# DataFrame (`pokemon_data` is reused later by the image-download loop).
pokemon_data = extract_pokemon_data(html_content)
df = pd.DataFrame(pokemon_data)

In [37]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,number,name,type1,type2,abilities,HP,Att,Def,S.Att,S.Def,Spd
0,1,Bulbasaur,grass,poison,Overgrow Chlorophyll,45,49,49,65,65,45
1,2,Ivysaur,grass,poison,Overgrow Chlorophyll,60,62,63,80,80,60
2,3,Venusaur,grass,poison,Overgrow Chlorophyll,80,82,83,100,100,80
3,4,Charmander,fire,,Blaze Solar Power,39,52,43,60,50,65
4,5,Charmeleon,fire,,Blaze Solar Power,58,64,58,80,65,80


In [35]:
# Persist the scraped table; index=False keeps the DataFrame index out of the CSV.
df.to_csv('pokemon_data.csv', index=False)

In [77]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from google_images_search import GoogleImagesSearch
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
import io
import imagehash
import os

In [59]:
# Credentials for the Google Custom Search API. They are read from the environment
# so real keys never have to be pasted into (and saved with) the notebook. The
# placeholders remain as fallbacks for anyone who still edits this cell directly.
API_KEY = os.environ.get('GCS_DEVELOPER_KEY', 'YOUR API KEY HERE')
CX = os.environ.get('GCS_CX', 'YOUR CX HERE')

In [81]:
# Shared Google image-search client; search_and_download_images() reads this
# module-level name on every call.
gis = GoogleImagesSearch(API_KEY, CX)

def search_and_download_images(pokemon_name, output_directory="pokemon_images", max_images=10, timeout=10):
    """Search for images of a Pokémon and save up to ``max_images`` distinct ones.

    Results are requested through the module-level ``gis`` client ten at a
    time (``start`` = 1, 11, ... up to 100). Every result URL is downloaded
    and kept only if its average hash has not been seen earlier in this call,
    so visually identical images are stored once. Files are written to
    ``<output_directory>/<pokemon_name>/<pokemon_name>_<n>.jpg``.

    Downloads that fail, time out, return a non-200 status or cannot be
    opened as an image are skipped rather than aborting the whole run.

    Args:
        pokemon_name: Name used in the search query, the sub-directory name
            and the file names.
        output_directory: Root directory that receives one sub-directory per
            Pokémon. Created if missing.
        max_images: Maximum number of distinct images to save.
        timeout: Seconds to wait for each image download.

    Returns:
        None. The images are written to disk.
    """
    pokemon_output_directory = os.path.join(output_directory, pokemon_name)
    os.makedirs(pokemon_output_directory, exist_ok=True)

    page_size = 10
    search_params = {
        'q': f'{pokemon_name} official artwork',
        'num': page_size,
        'imgSize': 'large',
        'fileType': 'jpg|png',
        'imgType': 'photo',
    }

    retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)

    image_hashes = set()
    start = 1

    # The context manager closes the session's pooled connections when done,
    # instead of leaking one session per Pokémon.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        while len(image_hashes) < max_images and start <= 100:
            search_params['start'] = start
            gis.search(search_params)

            results = list(gis.results())
            if not results:
                # An empty page means there is nothing further to fetch;
                # stop instead of spending more API calls.
                break

            for image in results:
                try:
                    response = session.get(image.url, timeout=timeout)
                except requests.RequestException:
                    continue  # unreachable host / timeout: try the next result
                if response.status_code != 200:
                    continue

                try:
                    with Image.open(io.BytesIO(response.content)) as img:
                        img_hash = imagehash.average_hash(img)
                except OSError:
                    # Payload is not a decodable image (Pillow raises OSError subclasses).
                    continue

                if img_hash in image_hashes:
                    continue

                image_hashes.add(img_hash)
                # NOTE(review): 'fileType' allows png, yet the file always gets a .jpg
                # suffix; kept as-is so existing file names do not change.
                file_name = f"{pokemon_name}_{len(image_hashes)}.jpg"
                with open(os.path.join(pokemon_output_directory, file_name), 'wb') as f:
                    f.write(response.content)

                if len(image_hashes) >= max_images:
                    break

            start += page_size

In [None]:
# Download images for every scraped Pokémon, one name at a time. Each call issues
# image-search API requests and image downloads, so this cell can run for a long time.
for pokemon in pokemon_data:
    search_and_download_images(pokemon['name'])