### The purpose of this notebook is to use BeautifulSoup to create files for Pokemon DB

In [113]:
%%time
import numpy as np
import pandas as pd
import bs4
import requests as rq
import re

CPU times: user 10 µs, sys: 35 µs, total: 45 µs
Wall time: 47.2 µs


### The base website we'll scrape from

In [57]:
%%time
url_pokedex = 'https://pokemondb.net/pokedex/national'
url_base = 'https://pokemondb.net/pokedex/'
raw_pokedex = rq.get(url_pokedex)
bs_pokedex = bs4.BeautifulSoup(raw_pokedex.text,'html.parser')

CPU times: user 302 ms, sys: 22.3 ms, total: 324 ms
Wall time: 450 ms


### Print some of the scraped site so we can see the html format

In [23]:
print(bs_pokedex.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of Pokémon (sprites gallery) | Pokémon Database
  </title>
  <link href="https://img.pokemondb.net" rel="preconnect"/>
  <link href="https://s.pokemondb.net" rel="preconnect"/>
  <link as="font" crossorigin="" href="/static/fonts/fira-sans-v17-latin-400.woff2" rel="preload" type="font/woff2"/>
  <link as="font" crossorigin="" href="/static/fonts/fira-sans-v17-latin-400i.woff2" rel="preload" type="font/woff2"/>
  <link as="font" crossorigin="" href="/static/fonts/fira-sans-v17-latin-600.woff2" rel="preload" type="font/woff2"/>
  <link href="/static/css/pokemondb-aa70195104.css" rel="stylesheet"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="A simple list of all 1025 Pokémon by National Dex number, with images." name="description" property="og:description"/>
  <link href="https://pokemondb.net/pokedex/national" rel="canonical"/>
  <meta content="https://pokemon

### Parse Pokemon Names

In [45]:
pokemon_names = bs_pokedex.find_all('a', class_='ent-name')
len(pokemon_names)

1025

### Name and number will be the first columns in the Pandas data frame

In [54]:
pokemon_df = pd.DataFrame()
numbers = [0]*len(pokemon_names)
names = ['']*len(pokemon_names)
for n in range(len(pokemon_names)):
    numbers[n]=n+1
    names[n] = pokemon_names[n].get_text()
pokemon_df['Number'] = numbers
pokemon_df['Name'] = names
pokemon_df.shape



(1025, 2)

In [56]:
pokemon_df.tail(10)

Unnamed: 0,Number,Name
1015,1016,Fezandipiti
1016,1017,Ogerpon
1017,1018,Archaludon
1018,1019,Hydrapple
1019,1020,Gouging Fire
1020,1021,Raging Bolt
1021,1022,Iron Boulder
1022,1023,Iron Crown
1023,1024,Terapagos
1024,1025,Pecharunt


In [59]:
url_base+pokemon_names[0].get_text()

'https://pokemondb.net/pokedex/Bulbasaur'

In [60]:
raw_bulba = rq.get(url_base+pokemon_names[0].get_text())
bs_bulba = bs4.BeautifulSoup(raw_bulba.text,'html.parser')

In [61]:
print(bs_bulba.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Bulbasaur Pokédex: stats, moves, evolution &amp; locations | Pokémon Database
  </title>
  <link href="https://img.pokemondb.net" rel="preconnect"/>
  <link href="https://s.pokemondb.net" rel="preconnect"/>
  <link as="font" crossorigin="" href="/static/fonts/fira-sans-v17-latin-400.woff2" rel="preload" type="font/woff2"/>
  <link as="font" crossorigin="" href="/static/fonts/fira-sans-v17-latin-400i.woff2" rel="preload" type="font/woff2"/>
  <link as="font" crossorigin="" href="/static/fonts/fira-sans-v17-latin-600.woff2" rel="preload" type="font/woff2"/>
  <link href="/static/css/pokemondb-aa70195104.css" rel="stylesheet"/>
  <link href="/static/css/type-chart-76998cbd3d.css" rel="stylesheet"/>
  <link href="/static/css/evolution-6ccf58cfbe.css" rel="stylesheet"/>
  <style>
   .cell-barchart{width:100%;min-width:150px}.barchart-bar{height:.75rem;border-radius:4px;background-color:#a3a3a3;border:1px solid #

In [93]:
bs_bulba.find("th", string="Species").next_sibling.next_sibling.string

'Seed Pokémon'

In [121]:
bs_bulba.find_all('a',{'href':re.compile(r'/type/')})[1].string

'Grass'

In [122]:
bs_bulba.find_all('a',{'href':re.compile(r'/type/')})[2].string

'Poison'

In [124]:
bs_bulba.find('strong').string

'0001'

In [131]:
bs_bulba.find_all('a',{'href':re.compile(r'/ability/')})[0].string

'Overgrow'

In [133]:
bs_bulba.find_all('a',{'href':re.compile(r'/ability/')})[1].string

'Chlorophyll'

In [139]:
bs_bulba.find("th", string="EV yield").next_sibling.next_sibling.string[1:]

'1 Sp. Atk '

In [146]:
bs_bulba.find("th", string="Catch rate").next_sibling.next_sibling.text[1:3]

'45'

In [148]:
bs_bulba.find("th", string="Growth Rate").next_sibling.next_sibling.text

'Medium Slow'

In [149]:
bs_bulba.find("th", string="HP").next_sibling.next_sibling.text

'45'

In [150]:
bs_bulba.find("th", string="Attack").next_sibling.next_sibling.text

'49'

In [151]:
bs_bulba.find("th", string="Defense").next_sibling.next_sibling.text

'49'

In [153]:
bs_bulba.find("th", string="Sp. Atk").next_sibling.next_sibling.text

'65'

In [154]:
bs_bulba.find("th", string="Sp. Def").next_sibling.next_sibling.text

'65'

In [155]:
bs_bulba.find("th", string="Speed").next_sibling.next_sibling.text

'45'