# 01. Import libraries

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Project root used to build the dataset paths below.
# Set the POKEMON_PROJECT_PATH environment variable to run the notebook on another
# machine without editing this cell; when it is unset the original location is used.
path = os.environ.get('POKEMON_PROJECT_PATH', r'C:\Users\blim9\Desktop\Projects\Pokemon Analysis')

In [3]:
# Load the raw Pokemon CSV from the project's 'datasets' folder
raw_csv_path = os.path.join(path, 'datasets', 'Complete Pokemon.csv')
df = pd.read_csv(raw_csv_path)

In [4]:
# Preview the first five rows of the freshly loaded data
df.head(n=5)

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


# 02. Data wrangling

In [5]:
# Count how often each damage multiplier occurs in the against_bug column
bug_multipliers = df['against_bug']
bug_multipliers.value_counts()

1.00    376
0.50    247
2.00    128
0.25     42
4.00      8
Name: against_bug, dtype: int64

- 4.0 means **super** effective (that pokemon has 2 types and they're both weak against this type of move, hence it's twice what it normally should be)
- 2.0 means super effective
- 1.0 means normal damage
- 0.5 means not very effective
- 0.25 means **not very** effective (that pokemon has 2 types and they're both resistant to this type of move, hence a quarter of what it normally should be)

In [6]:
# (number of rows, number of columns)
(len(df.index), len(df.columns))

(801, 41)

In [7]:
# List the column labels (axis 1 of the frame)
df.axes[1]

Index(['abilities', 'against_bug', 'against_dark', 'against_dragon',
       'against_electric', 'against_fairy', 'against_fight', 'against_fire',
       'against_flying', 'against_ghost', 'against_grass', 'against_ground',
       'against_ice', 'against_normal', 'against_poison', 'against_psychic',
       'against_rock', 'against_steel', 'against_water', 'attack',
       'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
       'classfication', 'defense', 'experience_growth', 'height_m', 'hp',
       'japanese_name', 'name', 'percentage_male', 'pokedex_number',
       'sp_attack', 'sp_defense', 'speed', 'type1', 'type2', 'weight_kg',
       'generation', 'is_legendary'],
      dtype='object')

In [8]:
# Remove the columns that play no part in the analysis
unused_columns = [
    'base_egg_steps',
    'base_happiness',
    'experience_growth',
    'japanese_name',
    'pokedex_number',
]
df = df.drop(columns=unused_columns)

In [9]:
# Preview the first five rows after dropping the unused columns
df.head(n=5)

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,name,percentage_male,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,Bulbasaur,88.1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,Ivysaur,88.1,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,Venusaur,88.1,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,Charmander,88.1,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,Charmeleon,88.1,80,65,80,fire,,19.0,1,0


In [10]:
# Fix two column names: against_fight -> against_fighting, classfication -> classification
column_renames = {
    'against_fight': 'against_fighting',
    'classfication': 'classification',
}
df = df.rename(columns=column_renames)

In [11]:
# Re-order the columns: abilities and type matchups first, then name and stats,
# then classification/types, physical attributes, gender ratio, generation and legendary flag
column_order = [
    'abilities',
    'against_bug', 'against_dark', 'against_dragon', 'against_electric',
    'against_fairy', 'against_fighting', 'against_fire', 'against_flying',
    'against_ghost', 'against_grass', 'against_ground', 'against_ice',
    'against_normal', 'against_poison', 'against_psychic', 'against_rock',
    'against_steel', 'against_water',
    'name',
    'attack', 'defense', 'hp', 'capture_rate', 'sp_attack', 'sp_defense', 'speed',
    'base_total',
    'classification', 'type1', 'type2',
    'height_m', 'weight_kg',
    'percentage_male', 'generation', 'is_legendary',
]
df = df.reindex(columns=column_order)

In [12]:
# Preview the first five rows with the new column names and order
df.head(n=5)

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fighting,against_fire,against_flying,against_ghost,...,speed,base_total,classification,type1,type2,height_m,weight_kg,percentage_male,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,45,318,Seed Pokémon,grass,poison,0.7,6.9,88.1,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,60,405,Seed Pokémon,grass,poison,1.0,13.0,88.1,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,80,625,Seed Pokémon,grass,poison,2.0,100.0,88.1,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,65,309,Lizard Pokémon,fire,,0.6,8.5,88.1,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,80,405,Flame Pokémon,fire,,1.1,19.0,88.1,1,0


#### Upon a quick skim of the dataset, I noticed that Pokémon is spelt with an accented é. When the data is converted to an Excel file, the accent isn't read properly and the word shows up as PokÃ©mon instead.

In [13]:
# Change every 'PokÃ©mon' occurrence in classification to 'Pokémon'.
# The previous df.replace('PokÃ©mon', 'Pokémon') call had no effect: it returned a new
# frame that was never assigned, and without regex it only matches cells whose whole
# value is 'PokÃ©mon' (never a substring such as 'Seed PokÃ©mon').
# Do a literal substring replacement on the column and store the result.
df['classification'] = df['classification'].str.replace('PokÃ©mon', 'Pokémon', regex=False)
df

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fighting,against_fire,against_flying,against_ghost,...,speed,base_total,classification,type1,type2,height_m,weight_kg,percentage_male,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,45,318,Seed Pokémon,grass,poison,0.7,6.9,88.1,1,0
1,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,60,405,Seed Pokémon,grass,poison,1.0,13.0,88.1,1,0
2,"['Overgrow', 'Chlorophyll']",1.00,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,80,625,Seed Pokémon,grass,poison,2.0,100.0,88.1,1,0
3,"['Blaze', 'Solar Power']",0.50,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,65,309,Lizard Pokémon,fire,,0.6,8.5,88.1,1,0
4,"['Blaze', 'Solar Power']",0.50,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,80,405,Flame Pokémon,fire,,1.1,19.0,88.1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796,['Beast Boost'],0.25,1.0,0.5,2.0,0.5,1.0,2.0,0.5,1.0,...,61,570,Launch Pokémon,steel,flying,9.2,999.9,,7,1
797,['Beast Boost'],1.00,1.0,0.5,0.5,0.5,2.0,4.0,1.0,1.0,...,109,570,Drawn Sword Pokémon,grass,steel,0.3,0.1,,7,1
798,['Beast Boost'],2.00,0.5,2.0,0.5,4.0,2.0,0.5,1.0,0.5,...,43,570,Junkivore Pokémon,dark,dragon,5.5,888.0,,7,1
799,['Prism Armor'],2.00,2.0,1.0,1.0,1.0,0.5,1.0,1.0,2.0,...,79,600,Prism Pokémon,psychic,,2.4,230.0,,7,1


# 03. Data consistency check

In [14]:
# Count the missing values in each column
df.isna().sum()

abilities             0
against_bug           0
against_dark          0
against_dragon        0
against_electric      0
against_fairy         0
against_fighting      0
against_fire          0
against_flying        0
against_ghost         0
against_grass         0
against_ground        0
against_ice           0
against_normal        0
against_poison        0
against_psychic       0
against_rock          0
against_steel         0
against_water         0
name                  0
attack                0
defense               0
hp                    0
capture_rate          0
sp_attack             0
sp_defense            0
speed                 0
base_total            0
classification        0
type1                 0
type2               384
height_m             20
weight_kg            20
percentage_male      98
generation            0
is_legendary          0
dtype: int64

#### Not all Pokémon have two types, so the null values in type2 aren't really missing data. However, to make them easier to work with, I will replace all of those null values with 'none'.

#### Also, upon researching genders in Pokémon, I found that some don't have a gender, which is why their percentage_male values were left empty. I will fill those with 'genderless'.

In [15]:
# Replace all null values in type2 with 'none'.
# Assign the filled column back instead of calling fillna(inplace=True) on df['type2']:
# the in-place call acts on an intermediate Series, which emits a chained-assignment
# warning in recent pandas and leaves df unchanged when Copy-on-Write is enabled.
df['type2'] = df['type2'].fillna('none')

In [16]:
# Replace all null values in percentage_male with 'genderless'.
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['percentage_male']: the in-place call acts on an intermediate Series, which emits
# a chained-assignment warning in recent pandas and leaves df unchanged when
# Copy-on-Write is enabled.
# NOTE: after this step the column mixes numeric percentages with the string label.
df['percentage_male'] = df['percentage_male'].fillna('genderless')

In [17]:
# Re-count the missing values per column after the fills above
df.isna().sum()

abilities            0
against_bug          0
against_dark         0
against_dragon       0
against_electric     0
against_fairy        0
against_fighting     0
against_fire         0
against_flying       0
against_ghost        0
against_grass        0
against_ground       0
against_ice          0
against_normal       0
against_poison       0
against_psychic      0
against_rock         0
against_steel        0
against_water        0
name                 0
attack               0
defense              0
hp                   0
capture_rate         0
sp_attack            0
sp_defense           0
speed                0
base_total           0
classification       0
type1                0
type2                0
height_m            20
weight_kg           20
percentage_male      0
generation           0
is_legendary         0
dtype: int64

#### Height and weight aren't relevant to this analysis, so leaving their empty values in place won't affect it.

In [18]:
# Collect the rows that are exact repeats of an earlier row
duplicate_mask = df.duplicated()
df_dups = df.loc[duplicate_mask]

In [19]:
# Display the duplicated rows found above; an empty frame means there are none
df_dups

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fighting,against_fire,against_flying,against_ghost,...,speed,base_total,classification,type1,type2,height_m,weight_kg,percentage_male,generation,is_legendary


### No duplicates

In [20]:
# Export the cleaned data set for visualization.
# Write UTF-8 with a byte-order mark ('utf-8-sig') so spreadsheet tools such as Excel
# detect the encoding and show accented text like 'Pokémon' instead of 'PokÃ©mon'.
# NOTE(review): the row index is still written as the first (unnamed) column, as before;
# pass index=False only after confirming no downstream reader relies on it.
cleaned_csv_path = os.path.join(path, 'datasets', 'Complete Pokemon (Cleaned).csv')
df.to_csv(cleaned_csv_path, encoding='utf-8-sig')