Dataset Ingestion and EDA

In [11]:
import json
import pandas as pd

# Read the entire AllPrintings.json dump into memory.
with open("AllPrintings.json", "r", encoding="utf-8") as json_file:
    data = json.load(json_file)


def _card_row(card):
    """Project one raw card record onto the columns kept for analysis.

    Every field is read with ``.get`` so a missing key becomes ``None``.
    ``isLegendary`` is True only when ``supertypes`` is a list containing
    "Legendary"; any other shape (missing, non-list) yields False.
    """
    supertypes = card.get("supertypes")
    return {
        "name": card.get("name"),
        "uuid": card.get("uuid"),
        "manaValue": card.get("manaValue"),
        "power": card.get("power"),
        "toughness": card.get("toughness"),
        "colors": card.get("colors"),
        "colorIdentity": card.get("colorIdentity"),
        "types": card.get("types"),
        "rarity": card.get("rarity"),
        "isLegendary": isinstance(supertypes, list) and "Legendary" in supertypes,
        "text": card.get("text"),
    }


# Walk every set under data["data"] and keep only the cards whose
# "commander" legality is exactly "Legal". The set codes (dict keys)
# are not needed, so only the values are iterated.
cards = [
    _card_row(card)
    for set_data in data["data"].values()
    for card in set_data.get("cards", [])
    if card.get("legalities", {}).get("commander") == "Legal"
]

# One row per card name: the first occurrence encountered is kept and the
# original row labels are preserved (the index is not reset).
df = pd.DataFrame(cards).drop_duplicates(subset=["name"])
# Optional CSV export, currently disabled:
#df.to_csv("commander_cards.csv", index=False)


In [3]:
# Preview the first rows of the card table.
# NOTE(review): the saved output under this cell has an extra "Unnamed: 0"
# column and blank isLegendary values, and this cell's execution count (3)
# is lower than the ingestion cell's (11) — the output looks stale.
# Re-run after ingestion (Restart Kernel -> Run All) to refresh it.
df.head()

Unnamed: 0,name,uuid,manaValue,power,toughness,colors,colorIdentity,types,rarity,isLegendary,text
0,Ancestor's Chosen,5f8287b1-5bb6-5f4c-ad17-316a40d5bb0c,7.0,4.0,4.0,[W],[W],[Creature],uncommon,,First strike (This creature deals combat damag...
2,Angel of Mercy,57aaebc1-850c-503d-9f6e-bb8d00d8bf7c,5.0,3.0,3.0,[W],[W],[Creature],uncommon,,"Flying\nWhen this creature enters, you gain 3 ..."
4,Angelic Blessing,55bd38ca-dc73-5c06-8f80-a6ddd2f44382,3.0,,,[W],[W],[Sorcery],common,,Target creature gets +3/+3 and gains flying un...
6,Angelic Chorus,3b77bb52-4181-57f5-b3cd-f3a15b95aa29,5.0,,,[W],[W],[Enchantment],rare,,"Whenever a creature you control enters, you ga..."
7,Angelic Wall,fadda48c-6226-5ac5-a2b9-e9170d2017cd,2.0,0.0,4.0,[W],[W],[Creature],common,,Defender (This creature can't attack.)\nFlying


In [None]:
import pandas as pd
from collections import Counter

# --- Structure overview ---------------------------------------------------
print("Dataset Shape (rows, columns):", df.shape, sep="\n")
print("\nColumn Names:", df.columns.tolist(), sep="\n")

# Dtypes are printed *before* the coercion below, so power/toughness are
# reported with whatever dtype they had on entry to this cell.
print("\nData Types:", df.dtypes, sep="\n")

# --- Cleaning: numeric and boolean columns --------------------------------
# errors="coerce" turns any value that cannot be parsed as a number into NaN,
# so those rows are counted as missing in the report further down.
for stat_col in ("power", "toughness"):
    df[stat_col] = pd.to_numeric(df[stat_col], errors="coerce")

# Missing legendary flags are treated as False before casting to bool.
legendary_filled = df["isLegendary"].fillna(False)
df["isLegendary"] = legendary_filled.infer_objects(copy=False).astype(bool)

# --- Missing values (computed after cleaning) -----------------------------
null_mask = df.isnull()
print("\nMissing Values (count):", null_mask.sum(), sep="\n")
print("\nMissing Values (%):", (null_mask.mean() * 100).round(2), sep="\n")

Dataset Shape (rows, columns):
(30465, 11)

Column Names:
['name', 'uuid', 'manaValue', 'power', 'toughness', 'colors', 'colorIdentity', 'types', 'rarity', 'isLegendary', 'text']

Data Types:
name              object
uuid              object
manaValue        float64
power             object
toughness         object
colors            object
colorIdentity     object
types             object
rarity            object
isLegendary         bool
text              object
dtype: object

Missing Values (count):
name                 0
uuid                 0
manaValue            0
power            13665
toughness        13614
colors               0
colorIdentity        0
types                0
rarity               0
isLegendary          0
text               349
dtype: int64

Missing Values (%):
name              0.00
uuid              0.00
manaValue         0.00
power            44.85
toughness        44.69
colors            0.00
colorIdentity     0.00
types             0.00
rarity            0.00


In [None]:
# --- Mana value: descriptive statistics and skewness ----------------------
mana_values = df["manaValue"]
print("\nMana Value Summary:", mana_values.describe(), sep="\n")
print("\nMana Value Skewness:", mana_values.skew(), sep="\n")

# --- Power and toughness --------------------------------------------------
# Both columns are summarised together, then each maximum is printed on its
# own line.
combat_stats = df[["power", "toughness"]]
print("\nPower & Toughness Summary:", combat_stats.describe(), sep="\n")

for label, stat_col in (("\nMax Power:", "power"), ("Max Toughness:", "toughness")):
    print(label, combat_stats[stat_col].max())


Mana Value Summary:
count    30465.000000
mean         3.289414
std          1.756751
min          0.000000
25%          2.000000
50%          3.000000
75%          4.000000
max         16.000000
Name: manaValue, dtype: float64

Mana Value Skewness:
0.6101742211400548

Power & Toughness Summary:
              power     toughness
count  16800.000000  16851.000000
mean       2.672619      2.885526
std        1.687776      1.703656
min       -1.000000     -1.000000
25%        2.000000      2.000000
50%        2.000000      3.000000
75%        3.000000      4.000000
max       20.000000     30.000000

Max Power: 20.0
Max Toughness: 30.0


In [None]:
# --- Rarity distribution: raw counts, then percentage share ---------------
rarity = df["rarity"]
print("\nRarity Counts:", rarity.value_counts(), sep="\n")
rarity_share = rarity.value_counts(normalize=True) * 100
print("\nRarity Percentages:", rarity_share.round(2), sep="\n")


def _color_count(colors):
    """Return the length of the colours list, or 0 if the value is not a list."""
    return len(colors) if isinstance(colors, list) else 0


# --- Number of colours per card (adds a num_colors column to df) ----------
df["num_colors"] = df["colors"].apply(_color_count)
print("\nNumber of Colors Distribution:", df["num_colors"].value_counts().sort_index(), sep="\n")

# --- Card type frequency --------------------------------------------------
# A card can list several types; each listed type is tallied once per card.
# Non-list entries are skipped.
type_counts = Counter(
    card_type
    for type_list in df["types"]
    if isinstance(type_list, list)
    for card_type in type_list
)
print("\nCard Type Counts:", type_counts, sep="\n")

# --- Share of legendary vs. non-legendary cards ---------------------------
legendary_share = df["isLegendary"].value_counts(normalize=True) * 100
print("\nLegendary Distribution (%):", legendary_share.round(2), sep="\n")


Rarity Counts:
rarity
common      10021
rare         9411
uncommon     9147
mythic       1877
special         9
Name: count, dtype: int64

Rarity Percentages:
rarity
common      32.89
rare        30.89
uncommon    30.02
mythic       6.16
special      0.03
Name: proportion, dtype: float64

Number of Colors Distribution:
num_colors
0     3826
1    22648
2     3278
3      646
4       15
5       52
Name: count, dtype: int64

Card Type Counts:
Counter({'Creature': 16816, 'Instant': 3507, 'Enchantment': 3486, 'Artifact': 3367, 'Sorcery': 3228, 'Land': 1066, 'Planeswalker': 293, 'Kindred': 73, 'Stickers': 48, 'Battle': 36})

Legendary Distribution (%):
isLegendary
False    88.66
True     11.34
Name: proportion, dtype: float64
