# Getting data directly from a website
This notebook walks you through the steps of collecting data from [Bulbapedia's National Pokedex](https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number) using `requests` and `BeautifulSoup`.

### Import `requests` library
This package lets you download any website's HTML code so that you can extract data from it. Let's save the website's URL in the `URL` variable.

In [1]:
import requests
import json

# Bulbapedia's National Pokedex list page; "%C3%A9" is the percent-encoded "é" in "Pokémon"
URL="https://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"

### Load the page

In [2]:
# Fetch the page. The timeout keeps the cell from hanging forever on an
# unresponsive server, and raise_for_status() turns an HTTP error response
# (4xx/5xx) into an exception instead of letting an error page be parsed later.
page=requests.get(URL, timeout=30)
page.raise_for_status()

In [3]:
# Preview only the start of the raw HTML bytes; the full page is far too long to read as cell output
print(page.content[:1000])

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8" />\n<title>List of Pok\xc3\xa9mon by National Pok\xc3\xa9dex number - Bulbapedia, the community-driven Pok\xc3\xa9mon encyclopedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_Pok\xc3\xa9mon_by_National_Pok\xc3\xa9dex_number","wgTitle":"List of Pok\xc3\xa9mon by National Pok\xc3\xa9dex number","wgCurRevisionId":3263733,"wgRevisionId":3263733,"wgArticleId":65356,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Lists of Pok\xc3\xa9mon","Lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTabl

### Parse HTML data

In [4]:
from bs4 import BeautifulSoup

# Build a navigable parse tree from the raw response bytes using the 'html.parser' backend
soup = BeautifulSoup(page.content, 'html.parser')

### Find all tables that contain Pokemon details

In [5]:
# Get main content <div>. find() returns None when nothing matches, so fail
# with a clear message here rather than an AttributeError on the next lookup.
poke_content = soup.find(id='mw-content-text')
if poke_content is None:
    raise RuntimeError("Could not find the 'mw-content-text' element; the page layout may have changed")

# Get all <table> elements inside the main content
poke_tables = poke_content.find_all('table')

### Get the tables for the non-first-generation Pokémon (Generations 2–8)

In [6]:
# One table per generation, Generation 2 through Generation 8.
# NOTE(review): poke_tables[1] is presumably the Generation 1 table, which this
# notebook skips on purpose — confirm against the page.
gen2_list=poke_tables[2]
gen3_list=poke_tables[3]
gen4_list=poke_tables[4]
gen5_list=poke_tables[5]
gen6_list=poke_tables[6]
gen7_list=poke_tables[7]
gen8_list=poke_tables[8]

In [7]:
# Check the first few children and find where the first Pokemon entry is.
# Slicing keeps the output short instead of dumping every row of the table.
gen2_list.contents[:4]

['\n',
 <tr>
 <th style="border-top-left-radius: 5px; -moz-border-radius-topleft: 5px; -webkit-border-top-left-radius: 5px; -khtml-border-top-left-radius: 5px; -icab-border-top-left-radius: 5px; -o-border-top-left-radius: 5px; background: #D6D6D6"> <a href="/wiki/List_of_Pok%C3%A9mon_by_Johto_Pok%C3%A9dex_number" title="List of Pokémon by Johto Pokédex number"><span style="color:#000;">Jdex</span></a>
 </th>
 <th style="background: #D6D6D6"> Ndex
 </th>
 <th style="background: #D6D6D6"> MS
 </th>
 <th style="background: #D6D6D6"> Pokémon
 </th>
 <th colspan="2" style="border-top-right-radius: 5px; -moz-border-radius-topright: 5px; -webkit-border-top-right-radius: 5px; -khtml-border-top-right-radius: 5px; -icab-border-top-right-radius: 5px; -o-border-top-right-radius: 5px; background: #D6D6D6"> Type
 </th></tr>,
 '\n',
 <tr style="background:#FFF">
 <td style="font-family:monospace"> #001
 </td>
 <td style="font-family:monospace"> #152
 </td>
 <th> <a href="/wiki/Chikorita_(Pok%C3%A9mon

In [8]:
# The first Pokemon entry: in the table's contents, indices 0 and 2 are newline
# strings and index 1 is the header row, so the first data row sits at index 3
gen2_list.contents[3]

<tr style="background:#FFF">
<td style="font-family:monospace"> #001
</td>
<td style="font-family:monospace"> #152
</td>
<th> <a href="/wiki/Chikorita_(Pok%C3%A9mon)" title="Chikorita"><img alt="Chikorita" height="40" src="//cdn.bulbagarden.net/upload/4/41/152MS6.png" width="40"/></a>
</th>
<td> <a href="/wiki/Chikorita_(Pok%C3%A9mon)" title="Chikorita (Pokémon)">Chikorita</a>
</td>
<td colspan="2" style="text-align:center; background:#78C850"> <a href="/wiki/Grass_(type)" title="Grass (type)"><span style="color:#FFF">Grass</span></a>
</td></tr>

In [9]:
# Position of the first Pokemon row inside the table's contents
# (index 1 is the header row; the even indices are newline strings).
info_start=3

# Let's figure out how to get each item for Chikorita
info_row=gen2_list.contents[info_start]

# Print every child of the row next to its index to see which positions hold the cells
for i, item in enumerate(info_row.contents):
    print(f'Index {i} - {item}')

Index 0 - 

Index 1 - <td style="font-family:monospace"> #001
</td>
Index 2 - 

Index 3 - <td style="font-family:monospace"> #152
</td>
Index 4 - 

Index 5 - <th> <a href="/wiki/Chikorita_(Pok%C3%A9mon)" title="Chikorita"><img alt="Chikorita" height="40" src="//cdn.bulbagarden.net/upload/4/41/152MS6.png" width="40"/></a>
</th>
Index 6 - 

Index 7 - <td> <a href="/wiki/Chikorita_(Pok%C3%A9mon)" title="Chikorita (Pokémon)">Chikorita</a>
</td>
Index 8 - 

Index 9 - <td colspan="2" style="text-align:center; background:#78C850"> <a href="/wiki/Grass_(type)" title="Grass (type)"><span style="color:#FFF">Grass</span></a>
</td>


In [10]:
# Extract items of interest.
# The cells live at the odd indices of the row; 1 = regional dex number,
# 3 = national dex number, 7 = name (with the link), 9 = first type.
kdex, ndex, name, type1 = (
    info_row.contents[pos].text.strip() for pos in (1, 3, 7, 9)
)
# The name cell wraps an <a> tag whose href is the site-relative page path
link = info_row.contents[7].a.get('href')

print(f'Pokemon {ndex} {name} is a {type1} Pokemon. Link: https://bulbapedia.bulbagarden.net{link}')

Pokemon #152 Chikorita is a Grass Pokemon. Link: https://bulbapedia.bulbagarden.net/wiki/Chikorita_(Pok%C3%A9mon)


### Get all Gen 2 Pokémon

In [11]:
# Walk the Pokemon rows: they start at info_start and fill every other slot of
# the table's contents (the slots in between are newline strings).
for poke_row in gen2_list.contents[info_start::2]:
    cells = poke_row.contents
    kdex, ndex, name, type1 = (cells[pos].text.strip() for pos in (1, 3, 7, 9))
    link = cells[7].a.get('href')
    if len(cells) <= 10:
        # Single-type row: there is no second type cell
        print(f'Pokemon {ndex} {name} is a {type1} Pokemon, Link: https://bulbapedia.bulbagarden.net{link}')
    else:
        # Dual-type row: the second type sits in the extra cell at index 11
        type2 = cells[11].text.strip()
        print(f'Pokemon {ndex} {name} is a {type1} & {type2} Pokemon, Link: https://bulbapedia.bulbagarden.net{link}')

Pokemon #152 Chikorita is a Grass Pokemon, Link: https://bulbapedia.bulbagarden.net/wiki/Chikorita_(Pok%C3%A9mon)
Pokemon #153 Bayleef is a Grass Pokemon, Link: https://bulbapedia.bulbagarden.net/wiki/Bayleef_(Pok%C3%A9mon)
Pokemon #154 Meganium is a Grass Pokemon, Link: https://bulbapedia.bulbagarden.net/wiki/Meganium_(Pok%C3%A9mon)
Pokemon #155 Cyndaquil is a Fire Pokemon, Link: https://bulbapedia.bulbagarden.net/wiki/Cyndaquil_(Pok%C3%A9mon)
Pokemon #156 Quilava is a Fire Pokemon, Link: https://bulbapedia.bulbagarden.net/wiki/Quilava_(Pok%C3%A9mon)
Pokemon #157 Typhlosion is a Fire Pokemon, Link: https://bulbapedia.bulbagarden.net/wiki/Typhlosion_(Pok%C3%A9mon)
Pokemon #158 Totodile is a Water Pokemon, Link: https://bulbapedia.bulbagarden.net/wiki/Totodile_(Pok%C3%A9mon)
Pokemon #159 Croconaw is a Water Pokemon, Link: https://bulbapedia.bulbagarden.net/wiki/Croconaw_(Pok%C3%A9mon)
Pokemon #160 Feraligatr is a Water Pokemon, Link: https://bulbapedia.bulbagarden.net/wiki/Feraligatr_(P

### Save them to JSON files

In [12]:
# Build one record per Gen 2 row; rows start at info_start and fill every other slot.
gen2_json = []

for poke_row in gen2_list.contents[info_start::2]:
    cells = poke_row.contents
    entry = {
        "kdex": cells[1].text.strip(),
        "ndex": cells[3].text.strip(),
        "name": cells[7].text.strip(),
        "type1": cells[9].text.strip(),
    }
    link = "https://bulbapedia.bulbagarden.net" + cells[7].a.get('href')
    # Dual-type rows carry an extra cell; add it before the link to keep the key order
    if len(cells) > 10:
        entry["type2"] = cells[11].text.strip()
    entry["link"] = link
    gen2_json.append(entry)

# Write the collected records to disk
with open('gen2.json', 'w') as f:
    json.dump(gen2_json, f)

In [13]:
# Build one record per Gen 3 row; rows start at info_start and fill every other slot.
gen3_json = []

for poke_row in gen3_list.contents[info_start::2]:
    cells = poke_row.contents
    entry = {
        "kdex": cells[1].text.strip(),
        "ndex": cells[3].text.strip(),
        "name": cells[7].text.strip(),
        "type1": cells[9].text.strip(),
    }
    link = "https://bulbapedia.bulbagarden.net" + cells[7].a.get('href')
    # Dual-type rows carry an extra cell; add it before the link to keep the key order
    if len(cells) > 10:
        entry["type2"] = cells[11].text.strip()
    entry["link"] = link
    gen3_json.append(entry)

# Write the Gen 3 records to their own file. This cell previously wrote to
# 'gen2.json', which overwrote the Gen 2 data and never produced gen3.json.
with open('gen3.json', 'w') as f:
    json.dump(gen3_json, f)

In [14]:
# Build one record per Gen 4 row; rows start at info_start and fill every other slot.
gen4_json = []

for poke_row in gen4_list.contents[info_start::2]:
    cells = poke_row.contents
    entry = {
        "kdex": cells[1].text.strip(),
        "ndex": cells[3].text.strip(),
        "name": cells[7].text.strip(),
        "type1": cells[9].text.strip(),
    }
    link = "https://bulbapedia.bulbagarden.net" + cells[7].a.get('href')
    # Dual-type rows carry an extra cell; add it before the link to keep the key order
    if len(cells) > 10:
        entry["type2"] = cells[11].text.strip()
    entry["link"] = link
    gen4_json.append(entry)

# Write the collected records to disk
with open('gen4.json', 'w') as f:
    json.dump(gen4_json, f)

In [15]:
# Build one record per Gen 5 row; rows start at info_start and fill every other slot.
gen5_json = []

for poke_row in gen5_list.contents[info_start::2]:
    cells = poke_row.contents
    entry = {
        "kdex": cells[1].text.strip(),
        "ndex": cells[3].text.strip(),
        "name": cells[7].text.strip(),
        "type1": cells[9].text.strip(),
    }
    link = "https://bulbapedia.bulbagarden.net" + cells[7].a.get('href')
    # Dual-type rows carry an extra cell; add it before the link to keep the key order
    if len(cells) > 10:
        entry["type2"] = cells[11].text.strip()
    entry["link"] = link
    gen5_json.append(entry)

# Write the collected records to disk
with open('gen5.json', 'w') as f:
    json.dump(gen5_json, f)

In [16]:
# Build one record per Gen 6 row; rows start at info_start and fill every other slot.
gen6_json = []

for poke_row in gen6_list.contents[info_start::2]:
    cells = poke_row.contents
    entry = {
        "kdex": cells[1].text.strip(),
        "ndex": cells[3].text.strip(),
        "name": cells[7].text.strip(),
        "type1": cells[9].text.strip(),
    }
    link = "https://bulbapedia.bulbagarden.net" + cells[7].a.get('href')
    # Dual-type rows carry an extra cell; add it before the link to keep the key order
    if len(cells) > 10:
        entry["type2"] = cells[11].text.strip()
    entry["link"] = link
    gen6_json.append(entry)

# Write the collected records to disk
with open('gen6.json', 'w') as f:
    json.dump(gen6_json, f)

In [17]:
# Build one record per Gen 7 row; rows start at info_start and fill every other slot.
gen7_json = []

for poke_row in gen7_list.contents[info_start::2]:
    cells = poke_row.contents
    entry = {
        "kdex": cells[1].text.strip(),
        "ndex": cells[3].text.strip(),
        "name": cells[7].text.strip(),
        "type1": cells[9].text.strip(),
    }
    link = "https://bulbapedia.bulbagarden.net" + cells[7].a.get('href')
    # Dual-type rows carry an extra cell; add it before the link to keep the key order
    if len(cells) > 10:
        entry["type2"] = cells[11].text.strip()
    entry["link"] = link
    gen7_json.append(entry)

# Write the collected records to disk
with open('gen7.json', 'w') as f:
    json.dump(gen7_json, f)

In [18]:
# Build one record per Gen 8 row; rows start at info_start and fill every other slot.
gen8_json = []

for poke_row in gen8_list.contents[info_start::2]:
    cells = poke_row.contents
    entry = {
        "kdex": cells[1].text.strip(),
        "ndex": cells[3].text.strip(),
        "name": cells[7].text.strip(),
        "type1": cells[9].text.strip(),
    }
    link = "https://bulbapedia.bulbagarden.net" + cells[7].a.get('href')
    # Dual-type rows carry an extra cell; add it before the link to keep the key order
    if len(cells) > 10:
        entry["type2"] = cells[11].text.strip()
    entry["link"] = link
    gen8_json.append(entry)

# Write the collected records to disk
with open('gen8.json', 'w') as f:
    json.dump(gen8_json, f)