In [1]:
from pokemon_utilities import PokemonCard
import pandas as pd
import os

# Functions

In [3]:
# Function to add a row to a DataFrame from card.stats and account for new keys as new columns in the DataFrame
def add_row_to_dataframe(stats, df=None):
    """Return a DataFrame made of ``df`` plus one new row built from ``stats``.

    Args:
        stats: Mapping of column name -> value describing the new row.
        df: Existing DataFrame to extend. When ``None``, a new frame is
            started with the keys of ``stats`` as its columns.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Keys of ``stats`` that are
        not yet columns of ``df`` are appended as columns, in the order they
        appear in ``stats``; pre-existing rows hold ``None`` in them. The
        frame passed in as ``df`` is not modified.
    """
    # Create a DataFrame if it doesn't exist
    if df is None:
        df = pd.DataFrame(columns=list(stats.keys()))

    # Walk ``stats`` in its own order (rather than through a set) so the
    # resulting column order is the same on every run.
    new_keys = [key for key in stats if key not in df.columns]
    if new_keys:
        # Work on a copy so the caller's frame does not grow columns as a
        # side effect of this call.
        df = df.copy()
        for key in new_keys:
            df[key] = None  # Add new columns with None as default values

    # Convert the stats dictionary to a one-row DataFrame
    new_row = pd.DataFrame([stats])

    # Nothing to append to yet: the new row, laid out on the known columns,
    # is the whole result. This also avoids concatenating an empty frame.
    if len(df) == 0:
        return new_row.reindex(columns=df.columns)

    # Append the new row; concat aligns on column names and fills gaps with NaN
    return pd.concat([df, new_row], ignore_index=True)

# Extraction

In [4]:
# NOTE(review): folder_path is absolute while problem_cards_folder is relative
# to the working directory -- confirm both point where intended.
folder_path = "/Portfolio-Projects/02 Data Pipelines/Pokemon-Card-Web-Scraping/card-html-files"
problem_cards_folder = "Portfolio-Projects/02 Data Pipelines/Pokemon-Card-Web-Scraping/problem-card-html-files"

# Create empty dataframe
df = pd.DataFrame()

# Walk folder and load HTML files
for root, dirs, files in os.walk(folder_path):
    # Process files alphabetically, case-insensitive
    for file_name in sorted(files, key=str.lower):
        if not file_name.endswith(".html"):
            continue

        file_path = os.path.join(root, file_name)

        # Everything that can fail for a single card lives inside the try, so
        # one bad file (unexpected name, unreadable content, unparsable HTML)
        # is set aside instead of aborting the whole run.
        try:
            # The file name (without extension) is split on " - ": the card
            # number is the second field and the card name is the last one.
            name_parts = os.path.splitext(file_name)[0].split(" - ")
            card_name = name_parts[-1]
            card_number = int(name_parts[1])

            # Read and close the file before any further work, so it is no
            # longer open if it has to be moved below (renaming an open file
            # fails on Windows).
            with open(file_path, "r", encoding="utf-8") as file:
                html_content = file.read()

            # Create PokemonCard object
            card = PokemonCard(html_content)

            # Add card name and number from html file
            card.stats["name"] = card_name
            card.stats["card_number"] = card_number

            # Add card stats to DataFrame
            df = add_row_to_dataframe(card.stats, df)

        except Exception as e:
            print(f"Error extracting card info: {e}")
            # Move file to problem_cards_folder, creating it on first use so
            # the move itself cannot fail on a missing directory.
            os.makedirs(problem_cards_folder, exist_ok=True)
            problem_file_path = os.path.join(problem_cards_folder, file_name)
            os.rename(file_path, problem_file_path)

Error extracting card info: 'NoneType' object has no attribute 'strip'
Error extracting card info: 'NoneType' object has no attribute 'strip'
Error extracting card info: 'NoneType' object has no attribute 'strip'
Error extracting card info: list index out of range


# Transform

In [5]:
# Display order for the columns: identity first, then attacks and abilities,
# then the remaining card attributes, with set and card number last.
columns_ordered = [
    "name", "type", "description", "color", "hp",
    "attacks_name", "attacks_cost", "attacks_damage", "attacks_description",
    "abilities_name", "abilities_description",
    "retreat", "weakness", "rarity", "stage", "evolves_from", "illustrator",
    "setId", "card_number",
]

# Keep exactly these columns, in this order (raises KeyError if one is missing)
df = df.loc[:, columns_ordered]

# Order the rows by card set, then by card number within the set, and
# renumber the index to match the new order
sort_keys = ["setId", "card_number"]
df_sorted = df.sort_values(sort_keys, ignore_index=True)
df_sorted

Unnamed: 0,name,type,description,color,hp,attacks_name,attacks_cost,attacks_damage,attacks_description,abilities_name,abilities_description,retreat,weakness,rarity,stage,evolves_from,illustrator,setId,card_number
0,Bulbasaur,Pokemon,,Grass,70,{0: 'Vine Whip'},"{0: ['grass', 'colorless']}",{0: 40},{0: ''},,,1,Fire,Common,Basic,,Narumi Sato,A1,1
1,Ivysaur,Pokemon,,Grass,90,{0: 'Razor Leaf'},"{0: ['grass', 'colorless', 'colorless']}",{0: 60},{0: ''},,,2,Fire,Uncommon,Stage 1,Bulbasaur,Kurata So,A1,2
2,Venusaur,Pokemon,,Grass,160,{0: 'Mega Drain'},"{0: ['grass', 'grass', 'colorless', 'colorless']}",{0: 80},{0: 'Heal 30 damage from this Pokémon.'},,,3,Fire,Rare,Stage 2,Ivysaur,Ryota Murayama,A1,3
3,Venusaur ex,Pokemon,,Grass,190,"{0: 'Razor Leaf', 1: 'Giant Bloom'}","{0: ['grass', 'colorless', 'colorless'], 1: ['...","{0: 60, 1: 100}","{0: '', 1: 'Heal 30 damage from this Pokémon.'}",,,3,Fire,Double Rare,Stage 2,Ivysaur,PLANETA CG Works,A1,4
4,Caterpie,Pokemon,,Grass,50,{},"{0: ['colorless', 'grass']}",{},{0: 'Put 1 random Pokémon from your deck into...,,,1,Fire,Common,Basic,,Miki Tanaka,A1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,Blitzle,,,,,,,,,,,,,,,,,,105
383,Zebstrika,,,,,,,,,,,,,,,,,,106
384,Mr. Mime,,,,,,,,,,,,,,,,,,126
385,Nidoran♀,,,,,,,,,,,,,,,,,,166


In [6]:
# Columns whose cells hold dictionaries; each one is replaced by one column
# per dictionary entry (e.g. attacks_name -> attacks_name_1, attacks_name_2).
columns_to_process = [
    "attacks_name",
    "attacks_cost",
    "attacks_damage",
    "attacks_description",
    "abilities_name",
    "abilities_description",
]

for column in columns_to_process:
    # The source column is removed once expanded, so it is missing when this
    # cell is run a second time. Skip it instead of raising KeyError.
    if column not in df_sorted.columns:
        continue

    # Expand each dictionary into its own row of columns
    expanded = df_sorted[column].apply(pd.Series)

    # Rename the expanded columns positionally to <original name>_<1-based position>
    expanded.columns = [f"{column}_{i+1}" for i in range(expanded.shape[1])]

    # Find the position of the original column
    col_idx = df_sorted.columns.get_loc(column)

    # Insert the expanded columns directly after the original column,
    # preserving their order
    for i, new_col in enumerate(expanded.columns):
        df_sorted.insert(col_idx + i + 1, new_col, expanded[new_col])

    # Drop the original dictionary column now that it has been expanded
    df_sorted = df_sorted.drop(columns=column)

df_sorted

Unnamed: 0,name,type,description,color,hp,attacks_name_1,attacks_name_2,attacks_cost_1,attacks_cost_2,attacks_damage_1,...,abilities_name_1,abilities_description_1,retreat,weakness,rarity,stage,evolves_from,illustrator,setId,card_number
0,Bulbasaur,Pokemon,,Grass,70,Vine Whip,,"[grass, colorless]",,40.0,...,,,1,Fire,Common,Basic,,Narumi Sato,A1,1
1,Ivysaur,Pokemon,,Grass,90,Razor Leaf,,"[grass, colorless, colorless]",,60.0,...,,,2,Fire,Uncommon,Stage 1,Bulbasaur,Kurata So,A1,2
2,Venusaur,Pokemon,,Grass,160,Mega Drain,,"[grass, grass, colorless, colorless]",,80.0,...,,,3,Fire,Rare,Stage 2,Ivysaur,Ryota Murayama,A1,3
3,Venusaur ex,Pokemon,,Grass,190,Razor Leaf,Giant Bloom,"[grass, colorless, colorless]","[grass, grass, colorless, colorless]",60.0,...,,,3,Fire,Double Rare,Stage 2,Ivysaur,PLANETA CG Works,A1,4
4,Caterpie,Pokemon,,Grass,50,,,"[colorless, grass]",,,...,,,1,Fire,Common,Basic,,Miki Tanaka,A1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,Blitzle,,,,,,,,,,...,,,,,,,,,,105
383,Zebstrika,,,,,,,,,,...,,,,,,,,,,106
384,Mr. Mime,,,,,,,,,,...,,,,,,,,,,126
385,Nidoran♀,,,,,,,,,,...,,,,,,,,,,166


In [7]:
# Build each card's URL as <base><setId>-<card_number>-<name without spaces>,
# then lowercase the whole string.
url_base = "https://ptcgpocket.gg/cards/"
number_text = df_sorted["card_number"].astype(str)
name_without_spaces = df_sorted["name"].str.replace(" ", "")

df_sorted["url"] = (
    url_base + df_sorted["setId"] + "-" + number_text + "-" + name_without_spaces
).str.lower()

# Load

In [9]:
# Write the final table to a CSV file in the working directory;
# index=False leaves the DataFrame's row index out of the file.
output_csv = "pokemon_cards.csv"
df_sorted.to_csv(output_csv, index=False)