In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import sys

# Helper function to clean ATK/DEF
def clean_atk_def(value_text):
    """Cleans Attack/Defense values. Returns integer or None."""
    if value_text is None:
        return None
    cleaned_text = value_text.strip()
    if cleaned_text.isdigit():
        return int(cleaned_text)
    elif cleaned_text == '?' or cleaned_text == '-' or cleaned_text == '':
         return None # Represent unknown/non-applicable as None
    else:
        # Try to find digits anyway, sometimes there's extra text
        digits = re.findall(r'\d+', cleaned_text)
        if digits:
            return int(digits[0])
        return None # If truly non-numeric and not '?' or '-'

# Helper function to clean Type text
def clean_type_text(type_text):
    """Removes surrounding brackets and extra whitespace from Type."""
    if type_text is None:
        return None
    # Remove brackets and strip whitespace
    cleaned = type_text.replace('[', '').replace(']', '').strip()
    cleaned = re.sub(r'\s+/\s+', '/', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned

# Configuration
BASE_URL_STRUCTURE = "https://www.db.yugioh-card.com/yugiohdb/card_search.action?ope=1&sess=1&rp=100&mode=1&sort=1&keyword=&stype=1&ctype=1&othercon=2&starfr=&starto=&pscalefr=&pscaleto=&linkmarkerfr=&linkmarkerto=&link_m=2&atkfr=&atkto=&deffr=&defto=&releaseDStart=1&releaseMStart=1&releaseYStart=1999&releaseDEnd=&releaseMEnd=&releaseYEnd="
TOTAL_PAGES = 85
all_card_data = []
REQUEST_DELAY = 0.5

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

print("Starting scraping process...")

# --- Main Scraping Loop ---
try:
    for page_num in range(1, TOTAL_PAGES + 1):
        page_url = f"{BASE_URL_STRUCTURE}&page={page_num}"
        if "mode=" not in page_url:
             page_url += "&mode=1"
        elif "mode=&" in page_url:
             page_url = page_url.replace("mode=&", "mode=1&")


        print(f"\n--- Processing Page {page_num}/{TOTAL_PAGES} ---")
        print(f"URL: {page_url}")
        sys.stdout.flush()

        try:
            response = requests.get(page_url, headers=headers, timeout=20)
            response.raise_for_status()

        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {page_num}: {e}. Skipping page.")
            time.sleep(REQUEST_DELAY)
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        cards = soup.select("#card_list .t_row:not(.t_header)")

        if not cards:
            print(f"Warning: No cards found on page {page_num} using selector '#card_list .t_row'. Check if page structure changed or end of results.")
            card_list_div = soup.select_one("#card_list")
            if not card_list_div:
                print("Error: '#card_list' container not found either.")
            # If it's page 1 and no cards, something is wrong with the URL/selectors
            if page_num == 1:
                print("Error: No cards found on the first page. Please double-check BASE_URL_STRUCTURE and selectors.")
                break # Stop if the first page is empty
            time.sleep(REQUEST_DELAY)
            continue # Try the next page otherwise

        print(f"Found {len(cards)} card entries on page {page_num}.")

        # Loop through each card found on the page
        for card in cards:
            card_data = {}

            # Extract data using the selectors
            name_tag = card.select_one(".card_name")
            attr_tag = card.select_one(".box_card_attribute span")
            level_tag = card.select_one(".box_card_level_rank span")
            atk_tag = card.select_one(".atk_power span")
            def_tag = card.select_one(".def_power span")
            desc_tag = card.select_one(".box_card_text")

            type_tag = card.select_one(".card_info_species_and_other_item")

            # Populate the dictionary
            card_data['Name'] = name_tag.text.strip() if name_tag else None
            card_data['Attribute'] = attr_tag.text.strip() if attr_tag else None

            # Level/Rank
            level_text = level_tag.text.strip() if level_tag else None
            if level_text and level_text.isdigit():
                 card_data['Level'] = int(level_text)
            else:
                 card_data['Level'] = level_text

            # ATK/DEF
            card_data['ATK'] = clean_atk_def(atk_tag.text) if atk_tag else None
            card_data['DEF'] = clean_atk_def(def_tag.text) if def_tag else None

            # Description
            card_data['Description'] = desc_tag.text.strip() if desc_tag else None

            # Type (Cleaned)
            card_data['Type'] = clean_type_text(type_tag.text) if type_tag else None

            # Add the extracted data to our main list
            if card_data['Name']:
                 all_card_data.append(card_data)

        time.sleep(REQUEST_DELAY)

except KeyboardInterrupt:
    print("\nScraping interrupted by user.")

finally:
    # Process Data
    print("\n--- Scraping Finished (or Interrupted) ---")

    if all_card_data:
        df = pd.DataFrame(all_card_data)

        cols_order = ['Name', 'Attribute', 'Level', 'Type', 'ATK', 'DEF', 'Description']
        for col in cols_order:
            if col not in df.columns:
                df[col] = None
        df = df[cols_order]


        print(f"\nSuccessfully scraped {len(df)} cards.")
        print("DataFrame Info:")
        df.info()
        print("\nDataFrame Head (showing Type column):")
        print(df.head())
        print("\nCheck a few rows for Type data:")
        print(df[['Name', 'Type']].head(10))


        # Save to CSV
        csv_path_local = 'yugioh_monster_cards.csv'
        try:
            df.to_csv(csv_path_local, index=False)
            print(f"\nDataFrame saved locally to {csv_path_local}")
            print("You can download this file from the Colab file browser (folder icon on the left).")
        except Exception as e:
            print(f"Error saving DataFrame to CSV: {e}")

    else:
        print("No card data was scraped.")

Starting scraping process...

--- Processing Page 1/85 ---
URL: https://www.db.yugioh-card.com/yugiohdb/card_search.action?ope=1&sess=1&rp=100&mode=1&sort=1&keyword=&stype=1&ctype=1&othercon=2&starfr=&starto=&pscalefr=&pscaleto=&linkmarkerfr=&linkmarkerto=&link_m=2&atkfr=&atkto=&deffr=&defto=&releaseDStart=1&releaseMStart=1&releaseYStart=1999&releaseDEnd=&releaseMEnd=&releaseYEnd=&page=1
Found 100 card entries on page 1.

--- Processing Page 2/85 ---
URL: https://www.db.yugioh-card.com/yugiohdb/card_search.action?ope=1&sess=1&rp=100&mode=1&sort=1&keyword=&stype=1&ctype=1&othercon=2&starfr=&starto=&pscalefr=&pscaleto=&linkmarkerfr=&linkmarkerto=&link_m=2&atkfr=&atkto=&deffr=&defto=&releaseDStart=1&releaseMStart=1&releaseYStart=1999&releaseDEnd=&releaseMEnd=&releaseYEnd=&page=2
Found 100 card entries on page 2.

--- Processing Page 3/85 ---
URL: https://www.db.yugioh-card.com/yugiohdb/card_search.action?ope=1&sess=1&rp=100&mode=1&sort=1&keyword=&stype=1&ctype=1&othercon=2&starfr=&starto=

In [21]:
import json
import math

# File Paths
csv_path_local = 'yugioh_monster_cards.csv'
json_path_local = 'yugioh_monster_cards.json'

print(f"Attempting to read CSV file: {csv_path_local}")

try:
    # Read CSV
    df = pd.read_csv(csv_path_local)
    print(f"Successfully read {len(df)} rows from {csv_path_local}")
    print("Original DataFrame info:")
    df.info()
    print("\nSample of original 'Type' column before cleaning:")
    if 'Type' in df.columns:
      problematic_types = df[df['Type'].astype(str).str.contains('／', na=False)]['Type'].head()
      if not problematic_types.empty:
          print(problematic_types)
      else:
          print("No types with '／' found in the sample head.")
    print("\nSample of original 'Level', 'ATK', 'DEF' columns:")
    print(df[['Name', 'Level', 'ATK', 'DEF']].head(10))

    # Fix 'Type' column
    if 'Type' in df.columns:
        print("\nCleaning the 'Type' column...")
        original_type_nan_mask = df['Type'].isna()
        df['Type'] = df['Type'].astype(str).str.replace('／', '/', regex=False)
        df.loc[df['Type'].str.lower() == 'nan', 'Type'] = None
        df.loc[original_type_nan_mask, 'Type'] = None
        print("Cleaning complete.")
    else:
        print("Warning: 'Type' column not found in the CSV. Skipping cleaning step.")

    # Process 'Level' column
    if 'Level' in df.columns:
        print("\nFilling missing 'Level' values with 0 and converting to int...")
        try:
             df['Level'] = pd.to_numeric(df['Level'], errors='coerce')
             df['Level'].fillna(0, inplace=True)
             df['Level'] = df['Level'].astype(int)
             print("Processing 'Level' complete.")
        except Exception as e:
             print(f"Could not process 'Level' column fully: {e}.")
    else:
        print("Warning: 'Level' column not found in the CSV.")

    # Process 'ATK' column
    if 'ATK' in df.columns:
        print("\nFilling missing 'ATK' values with 0 and converting to int...")
        try:
            df['ATK'] = pd.to_numeric(df['ATK'], errors='coerce')
            df['ATK'].fillna(0, inplace=True)
            df['ATK'] = df['ATK'].astype(int)
            print("Processing 'ATK' complete.")
        except Exception as e:
             print(f"Could not process 'ATK' column fully: {e}.")
    else:
        print("Warning: 'ATK' column not found in the CSV.")

    # Process 'DEF' column
    if 'DEF' in df.columns:
        print("\nFilling missing 'DEF' values with 0 and converting to int...")
        try:
            df['DEF'] = pd.to_numeric(df['DEF'], errors='coerce')
            df['DEF'].fillna(0, inplace=True)
            df['DEF'] = df['DEF'].astype(int)
            print("Processing 'DEF' complete.")
        except Exception as e:
             print(f"Could not process 'DEF' column fully: {e}.")
    else:
        print("Warning: 'DEF' column not found in the CSV.")

    print("\nDataFrame info after cleaning and filling:")
    df.info()
    print("\nSample data after cleaning/filling:")
    print(df[['Name', 'Level', 'ATK', 'DEF', 'Type']].head(10))

    # Convert DataFrame to JSON
    print(f"\nConverting DataFrame to JSON format (orient='records')...")
    data_list = df.to_dict(orient='records')

    # Clean up any remaining float NaNs before JSON dump
    for record in data_list:
        for key, value in record.items():
            if isinstance(value, float) and math.isnan(value):
                record[key] = None

    print(f"Saving cleaned data to JSON file: {json_path_local}")

    # Save to JSON file
    with open(json_path_local, 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=4)

    print(f"\nSuccessfully saved cleaned data to {json_path_local}")
    print("You can find the JSON file in the Colab file browser.")


except FileNotFoundError:
    print(f"Error: The file '{csv_path_local}' was not found.")
    print("Please make sure the CSV file exists in the '/content/' directory in Colab.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Attempting to read CSV file: yugioh_monster_cards.csv
Successfully read 8467 rows from yugioh_monster_cards.csv
Original DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8467 entries, 0 to 8466
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         8467 non-null   object 
 1   Attribute    8467 non-null   object 
 2   Level        8047 non-null   object 
 3   Type         8467 non-null   object 
 4   ATK          8387 non-null   float64
 5   DEF          7994 non-null   float64
 6   Description  8467 non-null   object 
dtypes: float64(2), object(5)
memory usage: 463.2+ KB

Sample of original 'Type' column before cleaning:
1       Insect/Flip／Effect
4        Fiend/Link／Effect
5        Beast/Flip／Effect
7     Machine/Union／Effect
9    Machine/Fusion／Effect
Name: Type, dtype: object

Sample of original 'Level', 'ATK', 'DEF' columns:
                               Name     Level     ATK     DEF


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Level'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ATK'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

Saving cleaned data to JSON file: yugioh_monster_cards.json

Successfully saved cleaned data to yugioh_monster_cards.json
You can find the JSON file in the Colab file browser.
