# Loading card data

In [1]:
import json
import time
import tqdm
import numpy as np
import openai

In [2]:
with open("oracle-cards-20250405210637.json", "r") as f:
    data = json.load(f)
print(f"Loaded {len(data)} cards")

Loaded 34458 cards


In [3]:
def get_card_data(oracle_data):
    if type(oracle_data) != dict:
        print(f"Invalid oracle data: {oracle_data}")
        return None
    if "memorabilia" in oracle_data.get("set_type", ""):
        # skip memorabilia cards
        return None
    if "//" in oracle_data.get("name", ""):
        # todo: handle double-faced cards separately
        return None
    if "mana_cost" not in oracle_data or oracle_data["mana_cost"] is None:
        print(f"Missing mana_cost for {oracle_data['name']}")
        return None
    if "oracle_text" not in oracle_data or oracle_data["oracle_text"] is None:
        print(f"Missing oracle_text for {oracle_data['name']}")
        return None
    
    # check if legal in some format
    # legal_formats = oracle_data.get("legalities", []).items()
    # legal_formats = [f for f, l in legal_formats if l == "legal"]
    # if not legal_formats:
    #     # include un-set cards
    #     if 
    #     print(f"Not legal in any format for {oracle_data['name']}")
    #     return None
    return {
        "name": oracle_data["name"],
        "mana_cost": oracle_data["mana_cost"],
        "cmc": oracle_data["cmc"],
        "type": oracle_data["type_line"],
        "text": oracle_data["oracle_text"],
        "power": oracle_data.get("power"),
        "toughness": oracle_data.get("toughness"),
        "loyalty": oracle_data.get("loyalty"),
        "colors": oracle_data.get("colors"),
        "keywords": oracle_data.get("keywords"),
    }

In [6]:
all_cards_by_name = {card["name"]: get_card_data(card) for card in data}
all_cards_by_name = {k: v for k, v in all_cards_by_name.items() if v is not None}

# Select a subset of cards to process

In [7]:
# A number of classic magic cards
sample_cards = [
    "Counterspell",
    "Mana Drain",
    "Daze",
    "Force of Will",
    "Spell Snare",
    "Mana Leak",
    "Remand",
    "Spell Pierce",
    "Negate",
    "Dispel",
    "Logic Knot",
    "Dash Hopes",
    "Dovin's Veto",
    "Cancel",
    "Arcane Denial",
    "Counterbalance",
    "Doomsday",
    "Cryptic Command",
    "Swan Song",
    "Flusterstorm",
    "Pact of Negation",
    "Spell Queller",
    "Stubborn Denial",
    "Mystical Dispute",
    "Disdainful Stroke",
    "Unwind",
    "Counterflux",
    "Narset's Reversal",
    "Siren Stormtamer",
    "Mana Tithe",
    "Sphinx's Revelation",
    "Dismember",
    "Swords to Plowshares",
    "Path to Exile",
    "Terminate",
    "Lightning Bolt",
    "Shock",
]

sample_cards = [card for card in sample_cards if card in all_cards_by_name]
sample_cards = list(all_cards_by_name.keys())
print(len(sample_cards), "sample cards")

31095 sample cards


# Format card data into text descriptions

In [8]:
def format_card(card_data):
    """Format a card for input into a vector-embedding model."""
    colors = list(map(lambda ch: {"W": "White", "U": "Blue", "B": "Black", "R": "Red", "G": "Green"}.get(ch, ch), card_data.get("colors", [])))
    prompt = f"""
The following is a card from the game Magic: The Gathering.

Name: {card_data["name"]}
Mana cost: {card_data["mana_cost"]}
Converted mana cost: {card_data["cmc"]}
Type Line: <type> {card_data["type"]} </type>
Colors: <colors> {", ".join(colors)} </colors>
Oracle text: <oracle_text> {card_data["text"] or "None"} </oracle_text>
"""
    if card_data["loyalty"] is not None:
        prompt += f"""Loyalty: {card_data["loyalty"]}
"""

    if card_data["power"] is not None and card_data["toughness"] is not None:
        prompt += f"""Power: {card_data["power"]}
Toughness: {card_data["toughness"]}
"""
    if card_data["keywords"] is not None and len(card_data["keywords"]) > 0:
        prompt += f"""Keywords: <keywords> {", ".join(card_data["keywords"])} </keywords>
"""
    return prompt.strip()

# sample_cards = list(all_cards_by_name.keys())
formatted_cards = [format_card(all_cards_by_name[card]) for card in sample_cards]
postprocessed_formatted_cards = formatted_cards
print(all_cards_by_name[sample_cards[-10]])
print(postprocessed_formatted_cards[-10])

{'name': 'Curse of Thirst', 'mana_cost': '{4}{B}', 'cmc': 5.0, 'type': 'Enchantment — Aura Curse', 'text': "Enchant player\nAt the beginning of enchanted player's upkeep, this Aura deals damage to that player equal to the number of Curses attached to them.", 'power': None, 'toughness': None, 'loyalty': None, 'colors': ['B'], 'keywords': ['Enchant']}
The following is a card from the game Magic: The Gathering.

Name: Curse of Thirst
Mana cost: {4}{B}
Converted mana cost: 5.0
Type Line: <type> Enchantment — Aura Curse </type>
Colors: <colors> Black </colors>
Oracle text: <oracle_text> Enchant player
At the beginning of enchanted player's upkeep, this Aura deals damage to that player equal to the number of Curses attached to them. </oracle_text>
Keywords: <keywords> Enchant </keywords>


In [9]:
# print([postprocessed_formatted_cards[i] for i, name in enumerate(all_cards_by_name.keys()) if "Planeswalker" in postprocessed_formatted_cards[i]][0])

In [10]:
for prompt in formatted_cards[:2]:
    print(prompt)
    print("=" * 80)

The following is a card from the game Magic: The Gathering.

Name: Nissa, Worldsoul Speaker
Mana cost: {3}{G}
Converted mana cost: 4.0
Type Line: <type> Legendary Creature — Elf Druid </type>
Colors: <colors> Green </colors>
Oracle text: <oracle_text> Landfall — Whenever a land you control enters, you get {E}{E} (two energy counters).
You may pay eight {E} rather than pay the mana cost for permanent spells you cast. </oracle_text>
Power: 3
Toughness: 3
Keywords: <keywords> Landfall </keywords>
The following is a card from the game Magic: The Gathering.

Name: Static Orb
Mana cost: {3}
Converted mana cost: 3.0
Type Line: <type> Artifact </type>
Colors: <colors>  </colors>
Oracle text: <oracle_text> As long as this artifact is untapped, players can't untap more than two permanents during their untap steps. </oracle_text>


# [Optional][WIP] Use an LLM to expand the card descriptions

In [7]:
def prepare_postprocessing_prompts(card_descriptions):
    prompts = []
    for text in card_descriptions:
        prompt = f"""
## ROLE
You are an expert Magic: the Gathering rules analyst.

## TASK
Generate compact, retrieval-oriented annotations for the card below.  
Return ONLY the JSON object described in *Output schema* (inside a ```json block).  
Do **not** repeat the card's rules text or name.

## INPUT
{text}
""" + """
## OUTPUT SCHEMA
```json
{
  "mechanics": ["<up to 7 MTG keywords or shorthand, e.g. \"ETB\", \"dies trigger\", \"lifegain\" >"],
  "roles":    ["<card roles: ramp, removal, finisher, toolbox, etc.>"],
  "strategies":["<decks or archetypes it fits: aristocrats, blink, etc.>"],
  "synergies":["<key tribes, card types, or mechanics it combines with>"],
  "power_band":"<one of: low | medium | high>",
  "why_pick": "< brief sentence on if, or why players use it >"
}
```
"""
        prompts.append(prompt.strip())
    return prompts

postprocessing_prompts = prepare_postprocessing_prompts(formatted_cards)

## Using Qwen locally with vLLM

In [8]:
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.3, top_p=0.95, max_tokens=512)
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", quantization="fp8")
outputs = llm.chat(messages=[{"role": "user", "content": prompt} for prompt in postprocessing_prompts], sampling_params=sampling_params)

  from .autonotebook import tqdm as notebook_tqdm


INFO 04-23 15:51:07 [__init__.py:239] Automatically detected platform cuda.


2025-04-23 15:51:07,749	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 04-23 15:51:11 [config.py:689] This model supports multiple tasks: {'score', 'embed', 'generate', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 04-23 15:51:11 [config.py:1901] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 04-23 15:51:12 [core.py:61] Initializing a V1 LLM engine (v0.8.4) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoi

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:01,  2.85it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:00<00:00,  2.78it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:01<00:00,  2.66it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.63it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.67it/s]



INFO 04-23 15:51:15 [loader.py:458] Loading weights took 1.54 seconds
INFO 04-23 15:51:15 [gpu_model_runner.py:1291] Model loading took 8.1372 GiB and 1.889585 seconds
INFO 04-23 15:51:19 [backends.py:416] Using cache directory: /home/benchislett/.cache/vllm/torch_compile_cache/fdfad17ec6/rank_0_0 for vLLM's torch.compile
INFO 04-23 15:51:19 [backends.py:426] Dynamo bytecode transform time: 3.94 s
INFO 04-23 15:51:21 [backends.py:132] Cache the graph of shape None for later use
INFO 04-23 15:51:32 [backends.py:144] Compiling a graph for general shape takes 12.07 s
INFO 04-23 15:51:43 [monitor.py:33] torch.compile takes 16.00 s in total
INFO 04-23 15:51:44 [kv_cache_utils.py:634] GPU KV cache size: 50,512 tokens
INFO 04-23 15:51:44 [kv_cache_utils.py:637] Maximum concurrency for 32,768 tokens per request: 1.54x
INFO 04-23 15:52:21 [gpu_model_runner.py:1626] Graph capturing finished in 37 secs, took 0.49 GiB
INFO 04-23 15:52:21 [core.py:163] init engine (profile, create kv cache, warmup 

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.56s/it, est. speed input: 4342.67 toks/s, output: 27.37 toks/s]


In [16]:
outputs = llm.chat(messages=[[{"role": "user", "content": prompt}] for prompt in postprocessing_prompts], sampling_params=sampling_params)

Processed prompts: 100%|██████████| 37/37 [00:02<00:00, 13.17it/s, est. speed input: 4260.25 toks/s, output: 1113.38 toks/s]


In [36]:
def parse_output(model_output):
    """Parse the model output into a JSON object."""
    try:
        model_output = model_output.strip("```").strip("json").strip()
        return json.loads(model_output)
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON: {e}")
        return None

print(formatted_cards[2])
print('\n\n\n')
parse_output(outputs[2].outputs[0].text)

The following is a card from the game Magic: The Gathering.

Name: Daze
Mana cost: {1}{U}
Converted mana cost: 2.0
Type Line: Instant
Oracle text: You may return an Island you control to its owner's hand rather than pay this spell's mana cost.
Counter target spell unless its controller pays {1}.
Power: None
Toughness: None
Loyalty: None






{'mechanics': ['Instant', 'Counter', 'Return'],
 'roles': ['Removal', 'Ramp'],
 'strategies': ['Control', 'Blue Weenie'],
 'synergies': ['Counterspells', 'Islands', 'Blue Spells'],
 'power_band': 'medium',
 'why_pick': 'Flexible removal and mana ramp in a single card.'}

In [45]:
print(postprocessing_prompts[2])

## ROLE
You are an expert Magic: the Gathering rules analyst.

## TASK
Generate compact, retrieval-oriented annotations for the card below.  
Return ONLY the JSON object described in *Output schema* (inside a ```json block).  
Do **not** repeat the card's rules text or name.

## INPUT
The following is a card from the game Magic: The Gathering.

Name: Daze
Mana cost: {1}{U}
Converted mana cost: 2.0
Type Line: Instant
Oracle text: You may return an Island you control to its owner's hand rather than pay this spell's mana cost.
Counter target spell unless its controller pays {1}.
Power: None
Toughness: None
Loyalty: None

## OUTPUT SCHEMA
```json
{
  "mechanics": ["<up to 7 MTG keywords or shorthand, e.g. "ETB", "dies trigger", "lifegain" >"],
  "roles":    ["<card roles: ramp, removal, finisher, toolbox, etc.>"],
  "strategies":["<decks or archetypes it fits: aristocrats, blink, etc.>"],
  "synergies":["<key tribes, card types, or mechanics it combines with>"],
  "power_band":"<one of: lo

# Embed the card descriptions

In [11]:
len(formatted_cards)

31095

In [19]:
embedding_client = openai.Client(base_url="http://localhost:30000/v1", api_key="None")

In [None]:
all_embeddings = []

batch_size = 100
for i in tqdm.tqdm(range(0, len(formatted_cards), batch_size)):
    batch = formatted_cards[i:i + batch_size]
    response = embedding_client.embeddings.create(
        model="Alibaba-NLP/gte-Qwen2-7B-instruct",
        input=batch,
        user="user"
    )
    all_embeddings.extend(map(lambda s: s.embedding, response.data))

len(all_embeddings), len(all_embeddings[0])

100%|██████████| 311/311 [23:46<00:00,  4.59s/it]


(31095, 3584)

In [40]:
all_embeddings_np = np.array(all_embeddings)
all_embeddings_np.shape, all_embeddings_np.dtype

((31095, 3584), dtype('float64'))

In [41]:
np.save("formatted_cards_embeddings.npy", all_embeddings_np)

In [12]:
all_embeddings = np.load("formatted_cards_embeddings.npy")

In [13]:
import textwrap
import chromadb
import pandas as pd
from IPython.display import Markdown
from chromadb import Documents, EmbeddingFunction, Embeddings

In [14]:
chroma_client = chromadb.Client()
db = chroma_client.create_collection(
    "MTGCardsDatabase",
)

In [17]:
assert len(all_embeddings) == len(formatted_cards)
batch_size = 100
for batch in tqdm.tqdm(range(0, len(formatted_cards), batch_size)):
    batch_documents = formatted_cards[batch:batch + batch_size]
    batch_embeddings = all_embeddings[batch:batch + batch_size]
    batch_ids = sample_cards[batch:batch + batch_size]
    db.add(
        documents=batch_documents,
        embeddings=batch_embeddings,
        ids=batch_ids,
    )
# db.add(
#     documents=formatted_cards,
#     embeddings=all_embeddings,
#     ids=sample_cards,
# )

  0%|          | 0/311 [00:00<?, ?it/s]

100%|██████████| 311/311 [00:25<00:00, 12.12it/s]


In [40]:
test_query = "colours Black, Green elf draw a card"
query_embedding = response = embedding_client.embeddings.create(
    model="Alibaba-NLP/gte-Qwen2-7B-instruct",
    input=test_query,
    user="user"
)

In [41]:
query_embedding = np.array(query_embedding.data[0].embedding)
query_embedding

array([-0.01178741,  0.01060486,  0.01307678, ..., -0.02017212,
       -0.01441193, -0.00382042], shape=(3584,))

In [50]:
res = db.query(
    query_embeddings=[query_embedding],
    n_results=20,
)

In [51]:
res["documents"]

[['The following is a card from the game Magic: The Gathering.\n\nName: Young Necromancer\nMana cost: {4}{B}\nConverted mana cost: 5.0\nType Line: <type> Creature — Human Warlock </type>\nColors: <colors> Black </colors>\nOracle text: <oracle_text> When this creature enters, you may exile two cards from your graveyard. When you do, return target creature card from your graveyard to the battlefield. </oracle_text>\nPower: 2\nToughness: 3',
  'The following is a card from the game Magic: The Gathering.\n\nName: Skemfar Shadowsage\nMana cost: {3}{B}\nConverted mana cost: 4.0\nType Line: <type> Creature — Elf Cleric </type>\nColors: <colors> Black </colors>\nOracle text: <oracle_text> When this creature enters, choose one —\n• Each opponent loses X life, where X is the greatest number of creatures you control that have a creature type in common.\n• You gain X life, where X is the greatest number of creatures you control that have a creature type in common. </oracle_text>\nPower: 2\nToughne

In [52]:
# format the documents with the query for re-ranking
documents_string = ""
for i, doc in enumerate(res["documents"][0]):
    documents_string += f"[{i + 1}]: {doc}\n"

rerank_prompt = f"""
You are a language model responsible for re-ranking search findings in an application for finding Magic: The Gathering cards.
Your purpose is to rank documents based on their relevance to the user's query. Consider the query details, the content of the card descriptions, and the context of the game.
The following is a user query: <query> {test_query} </query>. 

Please output only a JSON object with the schema described below. Do not include any other text or explanations. Ensure that the output is valid JSON, and that all documents are included in the ranking exactly once.

Here is an example of the output format assuming there are 10 input documents and they are already in the correct order:
```json
{"{"}
  "ranking": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
{"}"}
```

Here are the documents:\n\n""" + documents_string

llm_client = openai.Client(base_url="http://localhost:30001/v1", api_key="None")
rerank_response = llm_client.chat.completions.create(
    model="google/gemma-3-27b-it",
    messages=[{"role": "user", "content": rerank_prompt}],
)


In [53]:
print(rerank_response)

ChatCompletion(id='8c579c06394545f48325f1bb4a17e7f6', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Okay, I understand. You want me to re-rank Magic: The Gathering card results based on a user\'s query. I will act as the re-ranking engine, focusing on relevance to the query, card content, and game context.\n\nHowever, you\'ve provided a very strange and fragmented set of "documents" and a "query." It looks like a lot of text has been chopped up and concatenated. I\'m going to assume you want me to still demonstrate how I would work, so I\'ll treat each of the sections delimited by "text" and "Colors" as a potential card or document. I will also try to infer the user query based on the overall content.\n\n**Inferred User Query:** "Find me cards that interact with or are related to \'life,\' \'sacrifice,\' \'control,\' \'draw,\' \'the,\' \'a,\' \'and,\' \'or,\' \'the,\' \'to,\' \'with,\' and \'this,\' as these words appear repeatedly

In [54]:
def extract_rerank_response(rerank_response):
    """Extract the rerank response from the model output."""
    try:
        rerank_response = rerank_response.choices[0].message.content
        start_index = rerank_response.index("```json") + len("```json") + 1
        # end index is at the ``` after the JSON
        end_index = start_index + rerank_response[start_index:].index("```")
        data = json.loads(rerank_response[start_index:end_index])
        ranking = data["ranking"]
        if not isinstance(ranking, list):
            raise ValueError("Ranking is not a list")
        # if len(ranking) != len(res["documents"][0]):
        #     raise ValueError("Ranking length does not match number of documents")
        # if not len(set(ranking)) == len(ranking):
        #     raise ValueError("Ranking contains duplicates")
        return [i - 1 for i in ranking]  # convert to 0-indexed
    except ValueError as e:
        print(f"Failed to extract JSON: {e}")
        return None

print(extract_rerank_response(rerank_response))

Failed to extract JSON: substring not found
None


In [55]:
[res["documents"][0][i] for i in extract_rerank_response(rerank_response)]

Failed to extract JSON: substring not found


TypeError: 'NoneType' object is not iterable