In [2]:
import ast
import io

import pandas as pd
import requests

### Fetching the data

In [3]:
urls = ["https://raw.githubusercontent.com/beatrizalbiac/Ing-Datos-datasets/refs/heads/main/global-project/pokemon-data.csv",
       "https://raw.githubusercontent.com/beatrizalbiac/Ing-Datos-datasets/refs/heads/main/global-project/move-data.csv"]

# File name -> field delimiter used by that CSV (the two files are not delimited the same way).
separators = {
    "pokemon-data.csv": ';',
    "move-data.csv": ',',
}

# Each download is stored under its own file name so one never overwrites the other.
frames = {}

# Both names exist (as None) even if a download fails part-way through the loop.
df_pokemon = None
df_moves = None

for url in urls:
    try:
        # The last path segment of the URL is the file name used as the lookup key.
        filename = url.rsplit("/", 1)[-1]
        if filename not in separators:
            # Fail loudly instead of reusing a separator left over from a previous iteration.
            raise ValueError(f"no separator configured for: {url}")

        print(f"Fetching data from: {url}")
        response = requests.get(url, timeout=10)
        # Turn HTTP error statuses (4xx/5xx) into exceptions so an error page is never parsed as CSV.
        response.raise_for_status()

        print("Data fetched from web source, loading into DataFrame...")

        df = pd.read_csv(io.StringIO(response.text), sep=separators[filename], on_bad_lines='warn')
        frames[filename] = df

        print("retrieval succesfull")
        print(f"status code:{response.status_code}\n")

    except Exception as e:
        print(f"error: {e}")
        # A bare `raise` re-raises the active exception with its original traceback.
        raise

df_pokemon = frames.get("pokemon-data.csv")
df_moves = frames.get("move-data.csv")

Fetching data from: https://raw.githubusercontent.com/beatrizalbiac/Ing-Datos-datasets/refs/heads/main/global-project/pokemon-data.csv
Data fetched from web source, loading into DataFrame...
retrieval succesfull
status code:200

Fetching data from: https://raw.githubusercontent.com/beatrizalbiac/Ing-Datos-datasets/refs/heads/main/global-project/move-data.csv
Data fetched from web source, loading into DataFrame...
retrieval succesfull
status code:200



### Initial db exploration

In [4]:
# Preview the first rows of the pokemon dataset (head() returns 5 rows by default).
df_pokemon.head()

Unnamed: 0,Name,Types,Abilities,Tier,HP,Attack,Defense,Special Attack,Special Defense,Speed,Next Evolution(s),Moves
0,Abomasnow,"['Grass', 'Ice']","['Snow Warning', 'Soundproof']",PU,90,92,75,92,85,60,[],"['Ice Punch', 'Powder Snow', 'Leer', 'Razor Le..."
1,Abomasnow-Mega,"['Grass', 'Ice']",['Snow Warning'],NUBL,90,132,105,132,105,30,[],"['Ice Punch', 'Powder Snow', 'Leer', 'Razor Le..."
2,Abra,['Psychic'],"['Inner Focus', 'Magic Guard', 'Synchronize']",LC,25,20,15,105,55,90,['Kadabra'],"['Teleport', 'Ally Switch', 'Barrier', 'Encore..."
3,Absol,['Dark'],"['Justified', 'Pressure', 'Super Luck']",PU,65,130,60,75,60,75,[],"['Perish Song', 'Future Sight', 'Me First', 'R..."
4,Absol-Mega,['Dark'],['Magic Bounce'],RUBL,65,150,60,115,60,115,[],"['Perish Song', 'Future Sight', 'Me First', 'R..."


In [5]:
# Preview the first rows of the moves dataset (head() returns 5 rows by default).
df_moves.head()


Unnamed: 0,Index,Name,Type,Category,Contest,PP,Power,Accuracy,Generation
0,1,Pound,Normal,Physical,Tough,35,40.0,100.0,1
1,2,Karate Chop,Fighting,Physical,Tough,25,50.0,100.0,1
2,3,Double Slap,Normal,Physical,Cute,10,15.0,85.0,1
3,4,Comet Punch,Normal,Physical,Tough,15,18.0,85.0,1
4,5,Mega Punch,Normal,Physical,Tough,20,80.0,85.0,1


In [6]:
# Summary of the pokemon dataset: column names, non-null counts and dtypes.
df_pokemon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Name               918 non-null    object
 1   Types              918 non-null    object
 2   Abilities          918 non-null    object
 3   Tier               820 non-null    object
 4   HP                 918 non-null    int64 
 5   Attack             918 non-null    int64 
 6   Defense            918 non-null    int64 
 7   Special Attack     918 non-null    int64 
 8   Special Defense    918 non-null    int64 
 9   Speed              918 non-null    int64 
 10  Next Evolution(s)  918 non-null    object
 11  Moves              918 non-null    object
dtypes: int64(6), object(6)
memory usage: 86.2+ KB


In [7]:
# Summary of the moves dataset: column names, non-null counts and dtypes.
df_moves.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 728 entries, 0 to 727
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Index       728 non-null    int64  
 1   Name        728 non-null    object 
 2   Type        728 non-null    object 
 3   Category    728 non-null    object 
 4   Contest     728 non-null    object 
 5   PP          728 non-null    int64  
 6   Power       414 non-null    float64
 7   Accuracy    448 non-null    float64
 8   Generation  728 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 51.3+ KB


In [8]:
# Number of missing values in each column of the pokemon dataset.
df_pokemon.isna().sum()

Name                  0
Types                 0
Abilities             0
Tier                 98
HP                    0
Attack                0
Defense               0
Special Attack        0
Special Defense       0
Speed                 0
Next Evolution(s)     0
Moves                 0
dtype: int64

In [9]:
# Number of missing values in each column of the moves dataset.
df_moves.isna().sum()

Index           0
Name            0
Type            0
Category        0
Contest         0
PP              0
Power         314
Accuracy      280
Generation      0
dtype: int64

The list-like columns are stored as plain strings (things like "['Grass', 'Ice']"), so I'm going to parse them into actual lists to make the analysis easier.

In [10]:
def _parse_list_literal(raw):
    """Turn a stringified Python list (e.g. "['Grass', 'Ice']") into a real list.

    Missing values (anything pd.notna() rejects) become an empty list.
    ast.literal_eval only evaluates Python literals, so no code in the text is executed.
    """
    return ast.literal_eval(raw) if pd.notna(raw) else []


# Raw text column -> name of the parsed column created from it; the raw columns are left untouched.
list_columns = {
    'Types': 'Types_parsed',
    'Abilities': 'Abilities_parsed',
    'Moves': 'Moves_parsed',
    'Next Evolution(s)': 'Next_Evolution_parsed',
}

for raw_column, parsed_column in list_columns.items():
    df_pokemon[parsed_column] = df_pokemon[raw_column].apply(_parse_list_literal)

print("Succesful")

Succesful


In [11]:
# duplicated() flags every repeat of a name after its first occurrence, so the sum is the number of extra rows.
pokemon_dupes = df_pokemon['Name'].duplicated().sum()
move_dupes = df_moves['Name'].duplicated().sum()

print(f"Duplicate pokemon: {pokemon_dupes}")
print(f"Duplicate Moves: {move_dupes}")

Duplicate pokemon: 0
Duplicate Moves: 18


In [12]:
# Length of each parsed type list: 1 means a single-type pokemon, 2 a dual-type one.
df_pokemon['num_types'] = df_pokemon['Types_parsed'].map(len)
type_counts = df_pokemon['num_types'].value_counts()

print("Types per pokemon:")
print(type_counts)

Types per pokemon:
num_types
2    494
1    424
Name: count, dtype: int64


In [13]:
# Size of each pokemon's parsed move list, then the mean of those sizes shown without decimals.
df_pokemon['num_moves'] = df_pokemon['Moves_parsed'].map(len)
mean_moves = df_pokemon['num_moves'].mean()
print(f"Average moves per pokemon: {mean_moves:.0f}")

Average moves per pokemon: 97


I have to check whether every move that appears in the pokemon dataset corresponds to a move in the moves dataset, so that I can connect the two further down the line.

In [14]:
# Every distinct move name found in at least one pokemon's move list
allmoves = {move for moves in df_pokemon['Moves_parsed'] for move in moves}

# Every distinct move name present in the moves dataset
names = set(df_moves['Name'].unique())

# Moves listed for some pokemon that have no entry in the moves dataset
missing = allmoves.difference(names)
print(f"Moves missing: {len(missing)}")

Moves missing: 15


### Fixing the data

In [15]:
print("Missing moves:")
# A set has no stable iteration order, so sort before slicing to print the same names in the same order on every run.
print(sorted(missing)[:15])

Missing moves:
["Lock'On", "Wake'Up Slap", "Double'Edge", "Will'O'Wisp", "Mud'Slap", "Soft'Boiled", "Self'Destruct", "U'turn", "Trick'or'Treat", "Topsy'Turvy", "Power'Up Punch", "Multi'Attack", "Freeze'Dry", "X'Scissor", "Baby'Doll Eyes"]


By manually checking, I found out that these moves do exist in the moves dataset; they're just written there with a - instead of a '.

In [16]:
def _apostrophes_to_hyphens(moves):
    """Return a new move list in which every ' in each name is replaced by -."""
    return [move.replace("'", "-") for move in moves]


df_pokemon['Moves_parsed'] = df_pokemon['Moves_parsed'].map(_apostrophes_to_hyphens)

In [17]:
# Rebuild the set of pokemon move names now that the apostrophes have been replaced
allmoves = {move for moves in df_pokemon['Moves_parsed'] for move in moves}

# Distinct move names present in the moves dataset
names = set(df_moves['Name'].unique())

# Names that still have no counterpart in the moves dataset
missing = allmoves.difference(names)
print(f"Missing moves: {len(missing)}")

Missing moves: 4


In [18]:
print("Missing moves:")
# A set has no stable iteration order, so sort before slicing to print the same names in the same order on every run.
print(sorted(missing)[:4])

Missing moves:
['King-s Shield', 'Nature-s Madness', 'Forest-s Curse', 'Land-s Wrath']


I'll just fix these 4, since that's shorter than fixing the original 15 one by one. I could also have corrected the first 15 manually instead, though.

In [19]:
# Hyphenated spelling -> spelling with the apostrophe restored, for the four possessive move names.
fixes = {
    "Forest-s Curse": "Forest's Curse",
    "King-s Shield": "King's Shield",
    "Land-s Wrath": "Land's Wrath",
    "Nature-s Madness": "Nature's Madness"
}


def _restore_possessives(moves):
    """Return a new move list where names listed in `fixes` are swapped for their corrected spelling."""
    return [fixes[move] if move in fixes else move for move in moves]


df_pokemon['Moves_parsed'] = df_pokemon['Moves_parsed'].map(_restore_possessives)

In [20]:
# Final check: every move name used by a pokemon must now exist in the moves dataset.
all_pokemon_moves = {move for moves in df_pokemon['Moves_parsed'] for move in moves}

missing = all_pokemon_moves - set(df_moves['Name'].unique())
print(f"Moves still missing: {len(missing)}")

# Stop a full re-run here if any name is still unmatched, instead of only printing the count.
assert not missing, f"moves without a match in df_moves: {sorted(missing)}"

Moves still missing: 0


In [21]:
# keep=False flags every row whose name occurs more than once, not only the repeats after the first one.
is_repeated = df_moves['Name'].duplicated(keep=False)
duplicates = df_moves.loc[is_repeated].sort_values('Name')

shown_columns = ['Name', 'Type', 'Category', 'Power', 'Generation']
print(duplicates[shown_columns])

                       Name      Type  Category  Power  Generation
627           Acid Downpour    Poison  Physical    NaN           7
628           Acid Downpour    Poison   Special    NaN           7
623       All-Out Pummeling  Fighting  Physical    NaN           7
624       All-Out Pummeling  Fighting   Special    NaN           7
654      Black Hole Eclipse      Dark   Special    NaN           7
653      Black Hole Eclipse      Dark  Physical    NaN           7
644              Bloom Doom     Grass   Special    NaN           7
643              Bloom Doom     Grass  Physical    NaN           7
621         Breakneck Blitz    Normal  Physical    NaN           7
622         Breakneck Blitz    Normal   Special    NaN           7
631       Continental Crush      Rock  Physical    NaN           7
632       Continental Crush      Rock   Special    NaN           7
637         Corkscrew Crash     Steel  Physical    NaN           7
638         Corkscrew Crash     Steel   Special    NaN        

These duplicates are Z-moves, a Generation 7 mechanic in which the category of the move (Physical or Special) depends on the base move, which is why each name appears once per category.

They don't have an impact on anything here, so they're fine as they are.

In [22]:
# Number of moves in each category.
df_moves["Category"].value_counts()

Category
Physical    288
Status      240
Special     200
Name: count, dtype: int64

In [24]:
# Frequency of each accuracy value; value_counts() leaves out missing values by default.
df_moves["Accuracy"].value_counts()

Accuracy
100.0    320
90.0      46
95.0      29
85.0      26
75.0      10
80.0       7
70.0       4
55.0       3
50.0       3
Name: count, dtype: int64