# Data preparation

Read in the provided data, clean up where appropriate and do feature extraction.

In [1]:
import ast
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv('data/pokemon.csv')

In [3]:
df.head()

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


In [4]:
df.columns

Index(['abilities', 'against_bug', 'against_dark', 'against_dragon',
       'against_electric', 'against_fairy', 'against_fight', 'against_fire',
       'against_flying', 'against_ghost', 'against_grass', 'against_ground',
       'against_ice', 'against_normal', 'against_poison', 'against_psychic',
       'against_rock', 'against_steel', 'against_water', 'attack',
       'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
       'classfication', 'defense', 'experience_growth', 'height_m', 'hp',
       'japanese_name', 'name', 'percentage_male', 'pokedex_number',
       'sp_attack', 'sp_defense', 'speed', 'type1', 'type2', 'weight_kg',
       'generation', 'is_legendary'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 41 columns):
abilities            801 non-null object
against_bug          801 non-null float64
against_dark         801 non-null float64
against_dragon       801 non-null float64
against_electric     801 non-null float64
against_fairy        801 non-null float64
against_fight        801 non-null float64
against_fire         801 non-null float64
against_flying       801 non-null float64
against_ghost        801 non-null float64
against_grass        801 non-null float64
against_ground       801 non-null float64
against_ice          801 non-null float64
against_normal       801 non-null float64
against_poison       801 non-null float64
against_psychic      801 non-null float64
against_rock         801 non-null float64
against_steel        801 non-null float64
against_water        801 non-null float64
attack               801 non-null int64
base_egg_steps       801 non-null int64
base_happiness    

Work through the columns, converting them to a format that we can use (numeric)

In [6]:
# Discard the Japanese name column; no later cell in this notebook reads it.
df = df.drop(labels=['japanese_name'], axis='columns')

In [7]:
df['abilities'].head()

0    ['Overgrow', 'Chlorophyll']
1    ['Overgrow', 'Chlorophyll']
2    ['Overgrow', 'Chlorophyll']
3       ['Blaze', 'Solar Power']
4       ['Blaze', 'Solar Power']
Name: abilities, dtype: object

The `abilities` column _looks_ like a list, but it's actually a string representation of a list.  We can use the `ast` module to evaluate the string to a list, and then join the elements into a comma separated string.

In [8]:
def abilities_to_list(row):
    """Turn a row's stringified ability list into a comma-separated string.

    ``row['abilities']`` is expected to hold the textual form of a Python
    list (e.g. ``"['Blaze', 'Solar Power']"``). It is parsed with
    ``ast.literal_eval`` and the resulting items are joined with ``,``.
    """
    separator = ','
    parsed_abilities = ast.literal_eval(row['abilities'])
    return separator.join(parsed_abilities)

In [9]:
# Row-wise apply: every row is passed to abilities_to_list and the returned
# comma-joined string is stored in the new `abilities_list` column.
df['abilities_list'] = df.apply(abilities_to_list, axis='columns')

Now that the column contains comma separated values, we can use `get_dummies` to convert the individual elements within the string into one hot encoded columns, store them in their own dataframe and drop the columns now that we're done with them.

In [10]:
# Split each comma-joined ability string into one indicator column per ability.
df_abilities = df['abilities_list'].str.get_dummies(sep=',')
# Both the raw and the joined ability columns are now redundant.
df = df.drop(labels=['abilities', 'abilities_list'], axis='columns')

A lot of the columns begin with `against_` - let's split them off into their own dataframe and drop them from the main dataframe.

In [11]:
# Names of every column whose label starts with the `against_` prefix.
against_cols = df.columns[df.columns.str.startswith('against_')].tolist()

In [12]:
# Move the `against_` columns into their own frame...
df_against = df.loc[:, against_cols]
# ...and remove them from the main frame.
df = df.drop(labels=against_cols, axis='columns')

The `classfication` column holds one category label per Pokémon (e.g. `Acorn Pokémon`), so unlike `abilities` there is nothing to split first - we can call `pd.get_dummies` on it directly.

In [13]:
# One indicator column per distinct classification label.
# The explicit cast keeps the indicators as 0/1 integers: recent pandas
# releases return booleans from pd.get_dummies, which would not match the
# integer indicators that Series.str.get_dummies produces for the abilities
# and types frames.
df_classfication = pd.get_dummies(df['classfication']).astype(int)
df = df.drop(['classfication'], axis=1)

Each Pokémon has one, sometimes two, types, each in its own column.  The function below joins the types that exist into a single string (for rows with only one type it returns just that type; for rows with two types it returns them comma separated).  Once we have them in that format we can use `get_dummies` again.

In [14]:
def buildtype(row):
    """Join a row's ``type1`` / ``type2`` values into a comma-separated string.

    Only non-empty strings are kept, so a missing type - whether it arrives
    as ``''``, ``None`` or ``NaN`` - is skipped instead of reaching
    ``str.join`` (``NaN`` is truthy and would otherwise raise ``TypeError``).

    Returns the single type for one-type rows, ``'type1,type2'`` for two-type
    rows, and ``''`` when neither type is present.
    """
    candidates = (row['type1'], row['type2'])
    return ','.join(t for t in candidates if isinstance(t, str) and t)

In [15]:
# Only the two type columns are read by buildtype, so blank out missing values
# on that slice alone instead of copying the whole frame and turning every
# numeric column that contains NaN into an object column.
df['type_list'] = df[['type1', 'type2']].fillna('').apply(buildtype, axis=1)

In [16]:
# Expand the comma-joined type string into one indicator column per type.
df_types = df['type_list'].str.get_dummies(sep=',')
# The source type columns and the helper column are no longer needed.
df = df.drop(labels=['type1', 'type2', 'type_list'], axis='columns')

The capture rate appears to be numeric, but in some instances there are two annotated rates provided (`'30 (Meteorite)255 (Core)'`) - in this case we'll just grab the first one and convert the values to integers.

In [17]:
def buildcapture(row):
    """Return the first capture rate found in ``row['capture_rate']`` as an int.

    Values such as ``'30 (Meteorite)255 (Core)'`` carry more than one
    annotated rate; only the leading number, up to the first space, is used.

    The value is passed through ``str()`` first so that rates which are
    already numeric are accepted too. Without this, re-running the cell after
    the column has been converted to integers fails, because ``int`` has no
    ``split`` method.

    Raises ``ValueError`` if the leading token is not an integer literal.
    """
    raw_rate = str(row['capture_rate']).strip()
    first_token = raw_rate.split(' ')[0]
    return int(first_token)

# Overwrite the textual capture rate with the integer parsed from each row.
df['capture_rate'] = df.apply(buildcapture, axis='columns')

Split the names and the pokedex number off into their own dataframe.

In [18]:
# Keep the identifying columns in their own frame...
df_names = df.loc[:, ['name', 'pokedex_number']]
# ...and remove them from the main frame.
df = df.drop(labels=['name', 'pokedex_number'], axis='columns')

In [19]:
# `percentage_male` is discarded rather than encoded.
df = df.drop(labels=['percentage_male'], axis='columns')

In [20]:
# Replace every remaining missing value with 0.
# NOTE(review): for measurement-style columns (e.g. height_m / weight_kg, if
# they contain NaN) a 0 is not a neutral value and becomes the column minimum
# once min-max scaling is applied - confirm this imputation is intended.
df = df.fillna(value=0)

The original dataframe we loaded now only contains the columns with numeric data

In [21]:
df.head()

Unnamed: 0,attack,base_egg_steps,base_happiness,base_total,capture_rate,defense,experience_growth,height_m,hp,sp_attack,sp_defense,speed,weight_kg,generation,is_legendary
0,49,5120,70,318,45,49,1059860,0.7,45,65,65,45,6.9,1,0
1,62,5120,70,405,45,63,1059860,1.0,60,80,80,60,13.0,1,0
2,100,5120,70,625,45,123,1059860,2.0,80,122,120,80,100.0,1,0
3,52,5120,70,309,45,43,1059860,0.6,39,60,50,65,8.5,1,0
4,64,5120,70,405,45,58,1059860,1.1,58,80,65,80,19.0,1,0


`df_abilities` contains the one hot encoded abilities

In [22]:
df_abilities.head()

Unnamed: 0,Adaptability,Aftermath,Air Lock,Analytic,Anger Point,Anticipation,Arena Trap,Aroma Veil,Aura Break,Bad Dreams,...,Water Absorb,Water Bubble,Water Compaction,Water Veil,Weak Armor,White Smoke,Wimp Out,Wonder Guard,Wonder Skin,Zen Mode
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


`df_against` are all the against column values

In [23]:
df_against.head()

Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,against_ground,against_ice,against_normal,against_poison,against_psychic,against_rock,against_steel,against_water
0,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,1.0,2.0,1.0,1.0,2.0,1.0,1.0,0.5
1,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,1.0,2.0,1.0,1.0,2.0,1.0,1.0,0.5
2,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,1.0,2.0,1.0,1.0,2.0,1.0,1.0,0.5
3,0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,0.5,2.0,0.5,1.0,1.0,1.0,2.0,0.5,2.0
4,0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,0.5,2.0,0.5,1.0,1.0,1.0,2.0,0.5,2.0


`df_classfication` contains the one hot encoded values from the classfication column

In [24]:
df_classfication.head()

Unnamed: 0,Abundance Pokémon,Acorn Pokémon,Alpha Pokémon,Angler Pokémon,Ant Pit Pokémon,Anteater Pokémon,Antenna Pokémon,Aquamouse Pokémon,Aquarabbit Pokémon,Arm Thrust Pokémon,...,Wish Pokémon,Wolf Pokémon,Wood Gecko Pokémon,Woodpecker Pokémon,Wool Pokémon,Woolly Crab Pokémon,Worm Pokémon,Wrestling Pokémon,Young Fowl Pokémon,Zen Charm Pokémon
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


`df_types` are the one hot encodings of the two types we joined together

In [25]:
df_types.head()

Unnamed: 0,bug,dark,dragon,electric,fairy,fighting,fire,flying,ghost,grass,ground,ice,normal,poison,psychic,rock,steel,water
0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


`df_names` contains the English Pokémon name and pokedex number

In [26]:
df_names.head()

Unnamed: 0,name,pokedex_number
0,Bulbasaur,1
1,Ivysaur,2
2,Venusaur,3
3,Charmander,4
4,Charmeleon,5


Because some of the numeric values are quite large, we should get them all within a similar range.  We use min max scaling to bring them all within the 0 to 1 range.  The original dataframe contains useful columns to use for filtering later on, so we will create the scaled version in a new dataframe to keep the original values.

In [27]:
# Min-max scaler used below to rescale columns into the 0 to 1 range.
scaler = MinMaxScaler()

In [28]:
df_against.head()

Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,against_ground,against_ice,against_normal,against_poison,against_psychic,against_rock,against_steel,against_water
0,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,1.0,2.0,1.0,1.0,2.0,1.0,1.0,0.5
1,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,1.0,2.0,1.0,1.0,2.0,1.0,1.0,0.5
2,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,1.0,2.0,1.0,1.0,2.0,1.0,1.0,0.5
3,0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,0.5,2.0,0.5,1.0,1.0,1.0,2.0,0.5,2.0
4,0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,0.5,2.0,0.5,1.0,1.0,1.0,2.0,0.5,2.0


In [29]:
# Scale every numeric column into the 0 to 1 range in a new frame, leaving `df`
# untouched. fit_transform returns a bare array, so the column labels and the
# row index are passed back in explicitly; keeping df's index means the scaled
# frame stays aligned with the other derived frames even if rows are filtered
# before this point.
df_numeric_scaled = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

In [30]:
# Scale the `against_` columns with their own scaler instance. Re-fitting the
# shared `scaler` here would throw away the min/max it learned from the numeric
# columns, so it could no longer transform or inverse-transform that data.
# The row index is carried over so the frame stays aligned with the others.
against_scaler = MinMaxScaler()
df_against = pd.DataFrame(
    against_scaler.fit_transform(df_against),
    columns=df_against.columns,
    index=df_against.index,
)

Save all the dataframes for later use.

In [31]:
# Persist every derived frame under data/dataframes/. DataFrame.to_pickle does
# not create missing parent directories, so make sure the folder exists first;
# otherwise a fresh checkout fails on the first save.
os.makedirs('data/dataframes', exist_ok=True)

df.to_pickle('data/dataframes/pokemon_numeric.pickle')
df_numeric_scaled.to_pickle('data/dataframes/pokemon_numeric_scaled.pickle')
df_against.to_pickle('data/dataframes/pokemon_against.pickle')
df_names.to_pickle('data/dataframes/pokemon_names.pickle')
df_types.to_pickle('data/dataframes/pokemon_types.pickle')
df_classfication.to_pickle('data/dataframes/pokemon_classfication.pickle')
df_abilities.to_pickle('data/dataframes/pokemon_abilities.pickle')