In [32]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import requests
from urllib.request import urlopen
import time
from itertools import chain

In [50]:
poke_df = pd.read_pickle('poke_df.pkl')

In [6]:
poke_df.Pokemon.unique()

array(['Bulbasaur', 'Ivysaur', 'Venusaur', ..., 'Iron Valiant',
       'Koraidon', 'Miraidon'], dtype=object)

Check which url's exist and which we need to modify

A lot of these names specify the "form" the pokemon is in, so if it fail and ends in form, let's only try the first word in it.

In [11]:
for pokemon in poke_df.Pokemon.unique():
    response = requests.get(f'https://pokemon.fandom.com/wiki/{pokemon}')
    if response.status_code == 200:
        pass
    else:
        # try just the first word
        if requests.get(
            f'https://pokemon.fandom.com/wiki/{pokemon.split()[0]}'
            ).status_code != 200:
            print(f"URL doesn't match for {pokemon} or its first word") 


URL doesn't match for Ho-oh or its first word


There is only one URL we don't like there, and that's for Ho-oh. The correct URL is "Ho-Oh", so let's just fix that.

In [52]:

poke_df['url_poke'] = np.where(
    poke_df['Pokemon'] == 'Ho-oh', 'Ho-Oh', poke_df['Pokemon'])


Just to check here...

In [16]:
set(poke_df['url_poke']).difference(set(poke_df['Pokemon']))

{'Ho-Oh'}

Create dataframes

In [98]:
behavior_df = pd.DataFrame()
physiology_df = pd.DataFrame()
failed_pokemon = []

for pokemon in poke_df.url_poke.unique():

    url = f'https://pokemon.fandom.com/wiki/{pokemon}'

    # if we only want the first word of the pokemon make it so
    if requests.get(url).status_code == 200:
        url = f'https://pokemon.fandom.com/wiki/{pokemon.split()[0]}'

    try: 
        soup = BeautifulSoup(urlopen(
        url.encode('ascii', 'ignore').decode('utf-8')
        ).read(),'lxml')


        # first, the behavior
        temp_behavior_df = pd.DataFrame()
        poketext_behavior = []
        for tag in soup.find_all('h3')[1:]:
            if tag.text.strip() == 'Behavior':
                for item in tag.find_next_siblings('p'):
                    if 'Behavior' in item.find_previous_siblings('h3')[0].text.strip():
                        poketext_behavior.append(item.text.strip())
        if poketext_behavior:                
        # we want a new entry for each sentence.
            poketext_behavior = list(filter(None, list(chain(*[text.split('.') for text in poketext_behavior]))))
            temp_behavior_df['Pokemon'] = [pokemon] * len(poketext_behavior)
            temp_behavior_df['Behavior'] = poketext_behavior

            behavior_df = pd.concat([behavior_df,temp_behavior_df])
        else:
            print(f'no text found for behavior for {pokemon}')
            
        # now, the physiology
        temp_physiology_df = pd.DataFrame()
        poketext_physiology = []
        for tag in soup.find_all('h3')[1:]:
            if tag.text.strip() == 'Physiology':
                for item in tag.find_next_siblings('p'):
                    if 'Physiology' in item.find_previous_siblings('h3')[0].text.strip():
                        poketext_physiology.append(item.text.strip())
        if poketext_physiology:                
        # we want a new entry for each sentence.
            poketext_physiology = list(filter(None, list(chain(*[text.split('.') for text in poketext_physiology]))))
            temp_physiology_df['Pokemon'] = [pokemon] * len(poketext_physiology)
            temp_physiology_df['physiology'] = poketext_physiology

            physiology_df = pd.concat([physiology_df,temp_physiology_df])
        else:
            print(f'no text found for physiology for {pokemon}')
        # give a second break so the wiki fandom doesn't get sus
        
    except:
        failed_pokemon.append(pokemon)
        f'Found an ERROR for {pokemon}. Added to list'   

no text found for behavior for Squirtle
no text found for behavior for Rattata
no text found for behavior for Nidoran♀
no text found for physiology for Nidoran♀
no text found for behavior for Nidoran♂
no text found for physiology for Nidoran♂
no text found for behavior for Zubat
no text found for behavior for Gloom
no text found for behavior for Venonat
no text found for behavior for Venomoth
no text found for behavior for Poliwag
no text found for behavior for Bellsprout
no text found for behavior for Weepinbell
no text found for behavior for Victreebel
no text found for behavior for Tentacruel
no text found for behavior for Graveler
no text found for behavior for Voltorb
no text found for behavior for Cubone
no text found for behavior for Koffing
no text found for behavior for Rhydon
no text found for behavior for Goldeen
no text found for behavior for Seaking
no text found for behavior for Magmar
no text found for physiology for Ditto
no text found for behavior for Eevee
no text fou

In [127]:
behavior_df

Unnamed: 0,Pokemon,Behavior
0,Bulbasaur,A Bulbasaur often rests in bright places so it...
1,Bulbasaur,It can be seen napping in bright sunlight
2,Bulbasaur,"While it sleeps, the seed on its back catches..."
3,Bulbasaur,"In the wild, Bulbasaur tend to be very rare, a..."
4,Bulbasaur,"However, they are also generally very docile,..."
...,...,...
0,Baxcalibur,"Best seen when using its signature move, Glaiv..."
1,Baxcalibur,It then finishes off its opponent with a sing...
0,Gholdengo,Gholdengo has a friendly disposition and can b...
0,Chien-Pao,Chien-Pao is described as being able to contro...


Now, let's drop the scraped parts that don't make any sense.

In [136]:
display(physiology_df[(physiology_df.physiology.str.len() < 10)])
display(behavior_df[(behavior_df.Behavior.str.len() < 20)])

physiology_df = physiology_df[~(physiology_df.physiology.str.len() < 10)]
behavior_df = behavior_df[~(behavior_df.Behavior.str.len() < 20)]

Unnamed: 0,Pokemon,physiology
4,Staryu,""""
5,Lanturn,""""
9,Qwilfish,""""
5,Exploud,)
4,Cradily,﻿
8,Latias,[1]
1,Magnezone,F
2,Magnezone,O
2,Phione,8lbs
8,Panpour,﻿


Unnamed: 0,Pokemon,Behavior
12,Raticate,Their webbed feet
6,Magneton,radius
2,Dratini,5 feet (2 meters)
3,Ampharos,On the other hand
8,Murkrow,”
1,Qwilfish,6 gallons/approx
5,Delibird,Everest
6,Swampert,waters
4,Clamperl,”
1,Infernape,It never gives up


In [137]:
behavior_df.to_csv('behavior_df.csv')
physiology_df.to_csv('physiology_df.csv')