<a href="https://colab.research.google.com/github/doctorsmylie/mtg-draft-agent/blob/main/scrape/scryfall_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# prompt: link with drive
# Colab-only step: mounts Google Drive at /content/drive so the data folder
# configured further down (which lives under that mount point) is readable.

from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import zipfile
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# For managing paths
import pathlib

Loading data

In [None]:
# Folder containing all the data; every input/output path in this notebook is
# built from it with pathlib.Path(folder, ...).
folder = "/content/drive/MyDrive/Erdos25/MTGdraft"

# Expansion code: names the per-expansion sub-folder holding the game data and
# is the set code handed to filter_set (which lower-cases it before matching).
expansion = "DSK"
# Sub-folder of `folder` that holds the Scryfall bulk-data file
scryfall_foldername = "Scryfall-data"

In [None]:
# Read the Scryfall bulk export into a DataFrame and print a quick overview
# (first rows, shape, column names).
scryfall_filename = "default-cards-20250703091029.json"
scryfall_file = pathlib.Path(folder) / scryfall_foldername / scryfall_filename
scryfall_data = pd.read_json(scryfall_file)
for overview in (scryfall_data.head(), scryfall_data.shape, scryfall_data.columns):
    print(overview)

  object                                    id  \
0   card  0000419b-0bba-4488-8f7a-6194544ce91e   
1   card  0000579f-7b35-4ed3-b44c-db2a538066fe   
2   card  00006596-1166-4a79-8443-ca9f82e6db4e   
3   card  0000a54c-a511-4925-92dc-01b937f9afad   
4   card  0000cd57-91fe-411f-b798-646e965eec37   

                              oracle_id multiverse_ids   mtgo_id  arena_id  \
0  b34bb2dc-c1af-4d77-b0b3-a0fb342a5fc6       [668564]  129825.0   91829.0   
1  44623693-51d6-49ad-8cd7-140505caf02f       [109722]   25527.0       NaN   
2  8ae3562f-28b7-4462-96ed-be0cf7052ccc       [189637]   34586.0       NaN   
3  dc4e2134-f0c2-49aa-9ea3-ebf83af1445c             []       NaN       NaN   
4  9f0d82ae-38bf-45d8-8cda-982b6ead1d72       [435231]   65170.0   66119.0   

   tcgplayer_id  cardmarket_id           name lang  ... flavor_name  \
0      558404.0       777725.0         Forest   en  ...         NaN   
1       14240.0        13850.0    Fury Sliver   en  ...         NaN   
2       33347.0  

In [None]:
# Load the game-level data of the configured expansion (first 100,000 rows only).
# The file name is derived from `expansion` so it always agrees with the
# per-expansion folder it is read from.
gamefilename = f"game_data_public.{expansion}.PremierDraft.csv.gz"
game_file = pathlib.Path(folder, expansion, gamefilename)
gamedata = pd.read_csv(game_file, compression="gzip", nrows=100000)

# Rename the basic-land columns so they stand out from real card columns.
# Card columns look like "<prefix>_<card name>", so the pattern is anchored on
# the underscore right before the land name: this matches "deck_Forest" but not
# a card whose name merely ends in a basic-land word ("deck_Snow-Covered Forest").
basic_lands_columns = list(
    gamedata.filter(regex=r"_(?:Plains|Island|Swamp|Mountain|Forest)$").columns
)
# Append '_Basic_Land' to these columns
str_basic = "_Basic_Land"
basic_lands_dict = {key: key + str_basic for key in basic_lands_columns}
gamedata = gamedata.rename(columns=basic_lands_dict)
gamedata.head()

Unnamed: 0,expansion,event_type,draft_id,draft_time,game_time,build_index,match_number,game_number,rank,opp_rank,...,tutored_Withering Torment,deck_Withering Torment,sideboard_Withering Torment,"opening_hand_Zimone, All-Questioning","drawn_Zimone, All-Questioning","tutored_Zimone, All-Questioning","deck_Zimone, All-Questioning","sideboard_Zimone, All-Questioning",user_n_games_bucket,user_game_win_rate_bucket
0,DSK,PremierDraft,53401b113a4f425fa26e60edd314dd27,2024-09-24 18:48:56,2024-09-24 19:14:19,0,1,1,diamond,,...,0,0,0,0,0,0,0,0,500,0.58
1,DSK,PremierDraft,53401b113a4f425fa26e60edd314dd27,2024-09-24 18:48:56,2024-09-24 19:22:30,0,2,1,diamond,,...,0,0,0,0,0,0,0,0,500,0.58
2,DSK,PremierDraft,53401b113a4f425fa26e60edd314dd27,2024-09-24 18:48:56,2024-09-24 19:31:35,0,3,1,diamond,,...,0,0,0,0,0,0,0,0,500,0.58
3,DSK,PremierDraft,53401b113a4f425fa26e60edd314dd27,2024-09-24 18:48:56,2024-09-24 19:38:59,0,4,1,diamond,,...,0,0,0,0,0,0,0,0,500,0.58
4,DSK,PremierDraft,d26766b56fd14670ba60c2604bcd457b,2024-09-24 20:10:55,2024-09-24 20:50:44,0,1,1,diamond,,...,0,0,0,0,0,0,0,0,500,0.58


In [None]:
from sre_constants import error

# prompt: filter the scryfall_data of a given set


def filter_set(scryfall_data, set_code):
    """
    Select the main-set cards of one expansion from the Scryfall data and
    append the 'spg' (Special Guests) cards released on the same date.

    The main set is every card of ``set_code`` whose numeric collector number
    is lower than the lowest collector number of a basic land (Plains, Island,
    Swamp, Mountain, Forest) printed in that set. Collector numbers that are
    not purely numeric (e.g. '133p') are coerced to NaN and therefore dropped
    by that comparison. If the set has no numbered basic land, there is no
    cut-off and every numerically numbered card of the set is kept.

    Args:
      scryfall_data: The pandas DataFrame containing the scryfall data. Must
        have the columns 'set', 'name', 'collector_number' and 'released_at'.
      set_code: The 3-letter set code (e.g., 'woe'); matched case-insensitively.

    Returns:
      A pandas DataFrame (fresh 0..n-1 index) with the main-set cards sorted
      by collector number, followed by the matching 'spg' cards sorted by
      collector number. Main-set collector numbers are ints; 'spg' collector
      numbers are ints unless one of them is non-numeric. An empty DataFrame is
      returned (and a message printed) when no card qualifies.
    """
    basic_land_names = ["Plains", "Island", "Swamp", "Mountain", "Forest"]
    set_cards = scryfall_data[scryfall_data["set"] == set_code.lower()].copy()

    # Nothing to do for an unknown set code
    if set_cards.empty:
        print(f"No cards found for set code '{set_code}'.")
        return pd.DataFrame()

    # Collector numbers arrive as strings; non-numeric ones become NaN
    set_cards["collector_number"] = pd.to_numeric(
        set_cards["collector_number"], errors="coerce"
    )
    set_cards = set_cards.sort_values(by="collector_number")

    # The first basic land marks the end of the main set. min() is NaN when
    # the set has no (numbered) basic land; in that case use +inf so that no
    # card is cut off (a default of 0 would discard the whole set).
    basic_lands = set_cards[set_cards["name"].isin(basic_land_names)]
    first_basic_land_number = basic_lands["collector_number"].min()
    if pd.isna(first_basic_land_number):
        first_basic_land_number = float("inf")

    # NaN collector numbers compare False here and are dropped, so the int
    # cast below cannot fail.
    main_set = set_cards[
        set_cards["collector_number"] < first_basic_land_number
    ].copy()
    main_set["collector_number"] = main_set["collector_number"].astype(int)

    if main_set.empty:
        print(f"No cards found for set code '{set_code}'.")
        return pd.DataFrame()

    # Release date of the lowest-numbered main-set card (iloc: position, not label)
    date_special_guests = main_set.iloc[0]["released_at"]

    # 'spg' cards with the same release date
    special_guests = scryfall_data[
        (scryfall_data["set"] == "spg")
        & (scryfall_data["released_at"] == date_special_guests)
    ].copy()
    special_guests["collector_number"] = pd.to_numeric(
        special_guests["collector_number"], errors="coerce"
    )
    # Cast to int only when every number parsed; otherwise keep the float
    # column (with NaN) rather than hiding a failed cast.
    if special_guests["collector_number"].notna().all():
        special_guests["collector_number"] = special_guests[
            "collector_number"
        ].astype(int)
    special_guests = special_guests.sort_values(by="collector_number")

    return pd.concat([main_set, special_guests]).reset_index(drop=True)


# Example usage: build the card table for the expansion chosen in the
# configuration cell, then print its first rows, its shape and the dtype of
# the collector-number column.
filtered_data = filter_set(scryfall_data, expansion)
first_rows = filtered_data.head()
print(first_rows)
table_shape = filtered_data.shape
print(table_shape)
print(filtered_data["collector_number"].dtype)

  object                                    id  \
0   card  6f1a7590-3eee-4803-b192-d4fb771e6a86   
1   card  9c9b8fbe-8a5e-4b62-b53f-9ead8147bbbb   
2   card  8e2fae80-60af-44cf-95b4-177837435d1a   
3   card  c5ee6651-9946-4bae-b21e-6cf28fa77b13   
4   card  3ba16fbf-2d44-4337-87cb-6ff6b84a258a   

                              oracle_id multiverse_ids   mtgo_id  arena_id  \
0  d8ce3f18-26fe-47e8-be8b-0d89d7de7038       [673406]  130133.0   92055.0   
1  c51da69b-7fd7-43a3-be29-c44fc0c36130       [673407]  130135.0   92056.0   
2  c46a02db-13d6-477f-9da0-822599470168       [673408]  130137.0   92057.0   
3  e633412b-e36b-4ec5-b0d0-7cb24c7503f4       [673409]  130139.0   92060.0   
4  fed2127c-7174-4863-ad6a-1847c96bbd3f       [673410]  130141.0   92063.0   

   tcgplayer_id  cardmarket_id                                   name lang  \
0      575155.0       786367.0                  Acrobatic Cheerleader   en   
1      576488.0       786858.0                            Cult Healer   en

In [None]:
# Every column of the filtered card table whose name contains "set"
setcols = [column for column in filtered_data.columns if "set" in column]
filtered_data[setcols]

Unnamed: 0,set_id,set,set_name,set_type,set_uri,set_search_uri,scryfall_set_uri
99,a111d8a9-b647-48ec-afab-2b78f92173f5,dsk,Duskmourn: House of Horror,expansion,https://api.scryfall.com/sets/a111d8a9-b647-48...,https://api.scryfall.com/cards/search?order=se...,https://scryfall.com/sets/dsk?utm_source=api
217,a111d8a9-b647-48ec-afab-2b78f92173f5,dsk,Duskmourn: House of Horror,expansion,https://api.scryfall.com/sets/a111d8a9-b647-48...,https://api.scryfall.com/cards/search?order=se...,https://scryfall.com/sets/dsk?utm_source=api
222,a111d8a9-b647-48ec-afab-2b78f92173f5,dsk,Duskmourn: House of Horror,expansion,https://api.scryfall.com/sets/a111d8a9-b647-48...,https://api.scryfall.com/cards/search?order=se...,https://scryfall.com/sets/dsk?utm_source=api
338,a111d8a9-b647-48ec-afab-2b78f92173f5,dsk,Duskmourn: House of Horror,expansion,https://api.scryfall.com/sets/a111d8a9-b647-48...,https://api.scryfall.com/cards/search?order=se...,https://scryfall.com/sets/dsk?utm_source=api
390,a111d8a9-b647-48ec-afab-2b78f92173f5,dsk,Duskmourn: House of Horror,expansion,https://api.scryfall.com/sets/a111d8a9-b647-48...,https://api.scryfall.com/cards/search?order=se...,https://scryfall.com/sets/dsk?utm_source=api
...,...,...,...,...,...,...,...
34312,a111d8a9-b647-48ec-afab-2b78f92173f5,dsk,Duskmourn: House of Horror,expansion,https://api.scryfall.com/sets/a111d8a9-b647-48...,https://api.scryfall.com/cards/search?order=se...,https://scryfall.com/sets/dsk?utm_source=api
34541,a111d8a9-b647-48ec-afab-2b78f92173f5,dsk,Duskmourn: House of Horror,expansion,https://api.scryfall.com/sets/a111d8a9-b647-48...,https://api.scryfall.com/cards/search?order=se...,https://scryfall.com/sets/dsk?utm_source=api
34642,a111d8a9-b647-48ec-afab-2b78f92173f5,dsk,Duskmourn: House of Horror,expansion,https://api.scryfall.com/sets/a111d8a9-b647-48...,https://api.scryfall.com/cards/search?order=se...,https://scryfall.com/sets/dsk?utm_source=api
34832,a111d8a9-b647-48ec-afab-2b78f92173f5,dsk,Duskmourn: House of Horror,expansion,https://api.scryfall.com/sets/a111d8a9-b647-48...,https://api.scryfall.com/cards/search?order=se...,https://scryfall.com/sets/dsk?utm_source=api


In [None]:
# Sorted, de-duplicated card names found in the filtered Scryfall table
scryflist = np.unique(filtered_data["name"].tolist())
print(scryflist)
len(scryflist)

['Abandoned Campground' 'Abhorrent Oculus' 'Acrobatic Cheerleader'
 'Altanak, the Thrice-Called' 'Anthropede' 'Appendage Amalgam'
 'Arabella, Abandoned Doll' 'Attack-in-the-Box' 'Balemurk Leech'
 'Balustrade Wurm' 'Baseball Bat' 'Bashful Beastie' 'Bear Trap'
 'Beastie Beatdown' 'Bedhead Beastie' "Betrayer's Bargain"
 'Blazemire Verge' 'Bleeding Woods' 'Boilerbilges Ripper'
 'Bottomless Pool // Locker Room' 'Break Down the Door' 'Broodspinner'
 'Cackling Slasher' 'Cathartic Parting' 'Cautious Survivor'
 'Central Elevator // Promising Stairs' 'Chainsaw'
 'Charred Foyer // Warped Space' 'Clammy Prowler'
 'Clockwork Percussionist' 'Collected Company' 'Come Back Wrong'
 'Commune with Evil' 'Conductive Machete' 'Coordinated Clobbering'
 'Cracked Skull' 'Creeping Peeper' 'Cryptid Inspector' 'Cult Healer'
 'Cursed Recording' 'Cursed Windbreaker' 'Cynical Loner'
 'Daggermaw Megalodon' 'Damnation' 'Dashing Bloodsucker'
 'Dazzling Theater // Prop Room' 'Defiant Survivor'
 'Defiled Crypt // Cadave

281

In [None]:
# Recover the card names from the "deck_<card name>" columns of the game data,
# skipping the basic-land columns (tagged with the "_Basic_Land" suffix).
# The same prefix is used for both the filter and the strip, and only the
# leading prefix is removed, so a later "deck_" inside a name is left alone.
deck_prefix = "deck_"
col_list = list(gamedata.columns)
card_name_list = [
    column[len(deck_prefix):]
    for column in col_list
    if column.startswith(deck_prefix) and not column.endswith("_Basic_Land")
]
print(card_name_list)
len(card_name_list)

['Abandoned Campground', 'Abhorrent Oculus', 'Acrobatic Cheerleader', 'Altanak, the Thrice-Called', 'Anthropede', 'Appendage Amalgam', 'Arabella, Abandoned Doll', 'Attack-in-the-Box', 'Balemurk Leech', 'Balustrade Wurm', 'Baseball Bat', 'Bashful Beastie', 'Bear Trap', 'Beastie Beatdown', 'Bedhead Beastie', "Betrayer's Bargain", 'Blazemire Verge', 'Bleeding Woods', 'Boilerbilges Ripper', 'Bottomless Pool // Locker Room', 'Break Down the Door', 'Broodspinner', 'Cackling Slasher', 'Cathartic Parting', 'Cautious Survivor', 'Central Elevator // Promising Stairs', 'Chainsaw', 'Charred Foyer // Warped Space', 'Clammy Prowler', 'Clockwork Percussionist', 'Collected Company', 'Come Back Wrong', 'Commune with Evil', 'Conductive Machete', 'Coordinated Clobbering', 'Cracked Skull', 'Creeping Peeper', 'Cryptid Inspector', 'Cult Healer', 'Cursed Recording', 'Cursed Windbreaker', 'Cynical Loner', 'Daggermaw Megalodon', 'Damnation', 'Dashing Bloodsucker', 'Dazzling Theater // Prop Room', 'Defiant Surv

281

In [None]:
# Name / collector-number view of the filtered cards, with its column dtypes
testtable = filtered_data.loc[:, ["name", "collector_number"]]
print(testtable.dtypes)
testtable

name                 object
collector_number    float64
dtype: object


Unnamed: 0,name,collector_number
0,Acrobatic Cheerleader,1.0
1,Cult Healer,2.0
2,Dazzling Theater // Prop Room,3.0
3,Dollmaker's Shop // Porcelain Gallery,4.0
4,Emerge from the Cocoon,5.0
...,...,...
62997,Maddening Hex,70.0
66261,Phantasmal Image,67.0
79555,Soul Warden,65.0
89669,Expropriate,66.0


In [None]:
# Show only the name and collector-number columns of the filtered cards
filtered_data.loc[:, ["name", "collector_number"]]

Unnamed: 0,name,collector_number
0,Acrobatic Cheerleader,1
1,Cult Healer,2
2,Dazzling Theater // Prop Room,3
3,Dollmaker's Shop // Porcelain Gallery,4
4,Emerge from the Cocoon,5
...,...,...
276,Sacrifice,69
277,Maddening Hex,70
278,Unholy Heat,71
279,Collected Company,72


In [None]:
# All printings of "Enduring Courage" in the raw data, with their collector numbers
scryfall_data.loc[
    scryfall_data["name"] == "Enduring Courage", ["name", "collector_number"]
]

Unnamed: 0,name,collector_number
54984,Enduring Courage,392
64073,Enduring Courage,133p
64314,Enduring Courage,378
71726,Enduring Courage,133s
74487,Enduring Courage,402
102932,Enduring Courage,133


### SAVE JSONS WITH NEW EXPANSIONS

In [None]:
# Set codes of the expansions to export, each mapped to its filtered card table
expansion_list = ["woe", "lci", "mkm", "otj", "blb", "dsk", "dft", "tdm", "fin"]
df_dict = dict(
    (set_code, filter_set(scryfall_data, set_code)) for set_code in expansion_list
)

In [None]:
# prompt: save each dataframe in df_dict to a json
# Writes one "<set code>_cards.json" file (a list of records) per expansion
# into the "card-sets" sub-folder of the Scryfall data folder.

import json
import os

output_dir = pathlib.Path(folder, scryfall_foldername, "card-sets")
# Create the directory if it doesn't exist; exist_ok guards against the
# directory appearing between the check and the call.
if not output_dir.exists():
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created directory: {output_dir}")

# The loop variables are deliberately NOT named `expansion`: a top-level for
# loop rebinds notebook-level names, which would overwrite the `expansion`
# configuration value and make earlier cells target another set on re-run.
for set_code, set_cards in df_dict.items():
    output_filename = pathlib.Path(output_dir, f"{set_code}_cards.json")
    set_cards.to_json(output_filename, orient="records", indent=2)
    print(f"Saved {set_code} data to {output_filename}")

Created directory: /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets
Saved woe data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/woe_cards.json
Saved lci data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/lci_cards.json
Saved mkm data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/mkm_cards.json
Saved otj data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/otj_cards.json
Saved blb data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/blb_cards.json
Saved dsk data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/dsk_cards.json
Saved dft data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/dft_cards.json
Saved tdm data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/tdm_cards.json
Saved fin data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/fin_cards.json
