# Loading card data

In [1]:
import json
with open("oracle-cards-20250405210637.json", "r") as f:
    data = json.load(f)
print(f"Loaded {len(data)} cards")

Loaded 34458 cards


In [2]:
def get_card_data(oracle_data):
    if "mana_cost" not in oracle_data:
        return None
    if "oracle_text" not in oracle_data:
        return None
    return {
        "name": oracle_data["name"],
        "mana_cost": oracle_data["mana_cost"],
        "cmc": oracle_data["cmc"],
        "type": oracle_data["type_line"],
        "text": oracle_data["oracle_text"],
        "power": oracle_data.get("power"),
        "toughness": oracle_data.get("toughness"),
        "loyalty": oracle_data.get("loyalty"),
    }

In [3]:
all_cards_by_name = {card["name"]: get_card_data(card) for card in data}
all_cards_by_name = {k: v for k, v in all_cards_by_name.items() if v is not None}

# Select a subset of cards to process

In [4]:
# A number of classic magic cards
sample_cards = [
    "Counterspell",
    "Mana Drain",
    "Daze",
    "Force of Will",
    "Spell Snare",
    "Mana Leak",
    "Remand",
    "Spell Pierce",
    "Negate",
    "Dispel",
    "Logic Knot",
    "Dash Hopes",
    "Dovin's Veto",
    "Cancel",
    "Arcane Denial",
    "Counterbalance",
    "Doomsday",
    "Cryptic Command",
    "Swan Song",
    "Flusterstorm",
    "Pact of Negation",
    "Spell Queller",
    "Stubborn Denial",
    "Mystical Dispute",
    "Disdainful Stroke",
    "Unwind",
    "Counterflux",
    "Narset's Reversal",
    "Siren Stormtamer",
    "Mana Tithe",
    "Sphinx's Revelation",
    "Dismember",
    "Swords to Plowshares",
    "Path to Exile",
    "Terminate",
    "Lightning Bolt",
    "Shock",
]

sample_cards = [card for card in sample_cards if card in all_cards_by_name]

# Format card data into text descriptions

In [5]:
def format_card(card_data):
    """Format a card for input into a vector-embedding model."""
    prompt = f"""
The following is a card from the game Magic: The Gathering.

Name: {card_data["name"]}
Mana cost: {card_data["mana_cost"]}
Converted mana cost: {card_data["cmc"]}
Type Line: {card_data["type"]}
Oracle text: {card_data["text"]}
Power: {card_data["power"]}
Toughness: {card_data["toughness"]}
Loyalty: {card_data["loyalty"]}
"""
    return prompt.strip()

formatted_cards = [format_card(all_cards_by_name[card]) for card in sample_cards]
postprocessed_formatted_cards = formatted_cards

In [6]:
for prompt in formatted_cards[:2]:
    print(prompt)
    print("=" * 80)

The following is a card from the game Magic: The Gathering.

Name: Counterspell
Mana cost: {U}{U}
Converted mana cost: 2.0
Type Line: Instant
Oracle text: Counter target spell.
Power: None
Toughness: None
Loyalty: None
The following is a card from the game Magic: The Gathering.

Name: Mana Drain
Mana cost: {U}{U}
Converted mana cost: 2.0
Type Line: Instant
Oracle text: Counter target spell. At the beginning of your next main phase, add an amount of {C} equal to that spell's mana value.
Power: None
Toughness: None
Loyalty: None


# Use an LLM to expand the card descriptions

In [7]:
def prepare_postprocessing_prompts(card_descriptions):
    prompts = []
    for text in card_descriptions:
        prompt = f"""
## ROLE
You are an expert Magic: the Gathering rules analyst.

## TASK
Generate compact, retrieval-oriented annotations for the card below.  
Return ONLY the JSON object described in *Output schema* (inside a ```json block).  
Do **not** repeat the card's rules text or name.

## INPUT
{text}
""" + """
## OUTPUT SCHEMA
```json
{
  "mechanics": ["<up to 7 MTG keywords or shorthand, e.g. \"ETB\", \"dies trigger\", \"lifegain\" >"],
  "roles":    ["<card roles: ramp, removal, finisher, toolbox, etc.>"],
  "strategies":["<decks or archetypes it fits: aristocrats, blink, etc.>"],
  "synergies":["<key tribes, card types, or mechanics it combines with>"],
  "power_band":"<one of: low | medium | high>",
  "why_pick": "< brief sentence on if, or why players use it >"
}
```
"""
        prompts.append(prompt.strip())
    return prompts

postprocessing_prompts = prepare_postprocessing_prompts(formatted_cards)

## Using Qwen locally with vLLM

In [8]:
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.3, top_p=0.95, max_tokens=512)
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", quantization="fp8")
outputs = llm.chat(messages=[{"role": "user", "content": prompt} for prompt in postprocessing_prompts], sampling_params=sampling_params)

  from .autonotebook import tqdm as notebook_tqdm


INFO 04-23 15:51:07 [__init__.py:239] Automatically detected platform cuda.


2025-04-23 15:51:07,749	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 04-23 15:51:11 [config.py:689] This model supports multiple tasks: {'score', 'embed', 'generate', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 04-23 15:51:11 [config.py:1901] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 04-23 15:51:12 [core.py:61] Initializing a V1 LLM engine (v0.8.4) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoi

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:01,  2.85it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:00<00:00,  2.78it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:01<00:00,  2.66it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.63it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.67it/s]



INFO 04-23 15:51:15 [loader.py:458] Loading weights took 1.54 seconds
INFO 04-23 15:51:15 [gpu_model_runner.py:1291] Model loading took 8.1372 GiB and 1.889585 seconds
INFO 04-23 15:51:19 [backends.py:416] Using cache directory: /home/benchislett/.cache/vllm/torch_compile_cache/fdfad17ec6/rank_0_0 for vLLM's torch.compile
INFO 04-23 15:51:19 [backends.py:426] Dynamo bytecode transform time: 3.94 s
INFO 04-23 15:51:21 [backends.py:132] Cache the graph of shape None for later use
INFO 04-23 15:51:32 [backends.py:144] Compiling a graph for general shape takes 12.07 s
INFO 04-23 15:51:43 [monitor.py:33] torch.compile takes 16.00 s in total
INFO 04-23 15:51:44 [kv_cache_utils.py:634] GPU KV cache size: 50,512 tokens
INFO 04-23 15:51:44 [kv_cache_utils.py:637] Maximum concurrency for 32,768 tokens per request: 1.54x
INFO 04-23 15:52:21 [gpu_model_runner.py:1626] Graph capturing finished in 37 secs, took 0.49 GiB
INFO 04-23 15:52:21 [core.py:163] init engine (profile, create kv cache, warmup 

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.56s/it, est. speed input: 4342.67 toks/s, output: 27.37 toks/s]


In [16]:
outputs = llm.chat(messages=[[{"role": "user", "content": prompt}] for prompt in postprocessing_prompts], sampling_params=sampling_params)

Processed prompts: 100%|██████████| 37/37 [00:02<00:00, 13.17it/s, est. speed input: 4260.25 toks/s, output: 1113.38 toks/s]


In [36]:
def parse_output(model_output):
    """Parse the model output into a JSON object."""
    try:
        model_output = model_output.strip("```").strip("json").strip()
        return json.loads(model_output)
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON: {e}")
        return None

print(formatted_cards[2])
print('\n\n\n')
parse_output(outputs[2].outputs[0].text)

The following is a card from the game Magic: The Gathering.

Name: Daze
Mana cost: {1}{U}
Converted mana cost: 2.0
Type Line: Instant
Oracle text: You may return an Island you control to its owner's hand rather than pay this spell's mana cost.
Counter target spell unless its controller pays {1}.
Power: None
Toughness: None
Loyalty: None






{'mechanics': ['Instant', 'Counter', 'Return'],
 'roles': ['Removal', 'Ramp'],
 'strategies': ['Control', 'Blue Weenie'],
 'synergies': ['Counterspells', 'Islands', 'Blue Spells'],
 'power_band': 'medium',
 'why_pick': 'Flexible removal and mana ramp in a single card.'}

In [45]:
print(postprocessing_prompts[2])

## ROLE
You are an expert Magic: the Gathering rules analyst.

## TASK
Generate compact, retrieval-oriented annotations for the card below.  
Return ONLY the JSON object described in *Output schema* (inside a ```json block).  
Do **not** repeat the card's rules text or name.

## INPUT
The following is a card from the game Magic: The Gathering.

Name: Daze
Mana cost: {1}{U}
Converted mana cost: 2.0
Type Line: Instant
Oracle text: You may return an Island you control to its owner's hand rather than pay this spell's mana cost.
Counter target spell unless its controller pays {1}.
Power: None
Toughness: None
Loyalty: None

## OUTPUT SCHEMA
```json
{
  "mechanics": ["<up to 7 MTG keywords or shorthand, e.g. "ETB", "dies trigger", "lifegain" >"],
  "roles":    ["<card roles: ramp, removal, finisher, toolbox, etc.>"],
  "strategies":["<decks or archetypes it fits: aristocrats, blink, etc.>"],
  "synergies":["<key tribes, card types, or mechanics it combines with>"],
  "power_band":"<one of: lo

In [None]:
import time

all_embeddings = []

chunk_size = 10
sleep_time = 2 # seconds between chunk requests

for i in range(len(formatted_cards) // chunk_size + 1):
    cards = formatted_cards[i * chunk_size:(i + 1) * chunk_size]
    if not cards:
        break

    raise ValueError("Watch out, this line is expensive!")
    embeddings = client.models.embed_content(
        model=model,
        contents=cards,
        config=genai.types.EmbedContentConfig(
            task_type="retrieval_document",
            title="MTG card embeddings"
        )
    )
    for embedding in embeddings.embeddings:
        all_embeddings.append(np.array(embedding.values))

    print("finished chunk", i)
    print("out of ", len(formatted_cards) // chunk_size + 1)
    time.sleep(sleep_time)

finished chunk 0
out of  4
finished chunk 1
out of  4
finished chunk 2
out of  4
finished chunk 3
out of  4


In [72]:
import textwrap
import chromadb
import pandas as pd
from IPython.display import Markdown
from chromadb import Documents, EmbeddingFunction, Embeddings

In [None]:
chroma_client = chromadb.Client()
db = chroma_client.create_collection(
    "MTGCardsDatabase",
)

InternalError: Collection [MTGCardsDatabase] already exists

In [79]:
db.add(
    documents=formatted_cards,
    embeddings=all_embeddings,
    ids=sample_cards,
)

In [84]:
counterspell_index = sample_cards.index("Counterspell")
counterspell_embedding = all_embeddings[counterspell_index]
res = db.query(
    query_embeddings=[counterspell_embedding],
    n_results=20,
)

In [109]:
test_query = "A three mana spell that denies the opponent's spell"
query_embedding = client.models.embed_content(
    model=model,
    contents=[test_query],
    config=genai.types.EmbedContentConfig(
        task_type="retrieval_query",
    )
)

In [110]:
query_embedding = np.array(query_embedding.embeddings[0].values)

In [111]:
db.query(
    query_embeddings=[query_embedding],
    n_results=10,
)

{'ids': [['Arcane Denial',
   'Stubborn Denial',
   'Counterflux',
   'Mystical Dispute',
   'Mana Leak',
   'Dispel',
   'Force of Will',
   'Negate',
   'Spell Pierce',
   'Pact of Negation']],
 'embeddings': None,
 'documents': [["The following is a card from the game Magic: The Gathering.\n\n    Name: Arcane Denial\n    Mana cost: {1}{U}\n    Converted mana cost: 2.0\n    Type Line: Instant\n    Oracle text: Counter target spell. Its controller may draw up to two cards at the beginning of the next turn's upkeep.\nYou draw a card at the beginning of the next turn's upkeep.\n    Power: None\n    Toughness: None\n    Loyalty: None",
   'The following is a card from the game Magic: The Gathering.\n\n    Name: Stubborn Denial\n    Mana cost: {U}\n    Converted mana cost: 1.0\n    Type Line: Instant\n    Oracle text: Counter target noncreature spell unless its controller pays {1}.\nFerocious — If you control a creature with power 4 or greater, counter that spell instead.\n    Power: None