<a href="https://colab.research.google.com/github/doctorsmylie/mtg-draft-agent/blob/main/scrape/scryfall_scrape_polars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the data folder used by this notebook
# lives under that mount point (Colab-only step).

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import zipfile
import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt

# For managing paths
import pathlib

### Loading data

In [3]:
# Folder containing all the data (located on the mounted Google Drive)
folder='/content/drive/MyDrive/Erdos25/MTGdraft'

# Expansion code
expansion = 'DSK'
# Name of the sub-folder of `folder` that holds the Scryfall files
scryfall_foldername='Scryfall-data'


In [11]:
# # load ALL Scryfall data
# scryfall_filename='default-cards-20250703091029.json'
# scryfall_file = pathlib.Path(folder, scryfall_foldername, scryfall_filename)
# all_scryfall_data_df=pd.read_json(scryfall_file)

In [20]:
# #  Save to parquet all cards released after February 2021
# filtered_scryfall_data = all_scryfall_data_df[all_scryfall_data_df['released_at'] > '2021-02-01']
# saved_filename = 'scryfall_cards_after_feb_2021.parquet'
# saved_filepath = pathlib.Path(folder, scryfall_foldername, saved_filename)
# filtered_scryfall_data.to_parquet(saved_filepath)

In [19]:
# all_scryfall_data_df.dtypes.to_frame('dtypes').T

Unnamed: 0,object,id,oracle_id,multiverse_ids,mtgo_id,arena_id,tcgplayer_id,cardmarket_id,name,lang,...,flavor_name,attraction_lights,color_indicator,printed_type_line,printed_text,variation_of,life_modifier,hand_modifier,content_warning,defense
dtypes,object,object,object,object,float64,float64,float64,float64,object,object,...,object,object,object,object,object,object,float64,float64,float64,float64


In [22]:
# Read the parquet file of Scryfall cards released after February 2021
# into a polars DataFrame.
scryfall_filename = 'scryfall_cards_after_feb_2021.parquet'
scryfall_file = pathlib.Path(folder) / scryfall_foldername / scryfall_filename
scryfall_data = pl.read_parquet(scryfall_file)


In [33]:
# Preview the collector numbers after a non-strict cast to Int16
# (strict=False: values that cannot be converted do not raise).
int16_collector_number = pl.col('collector_number').cast(pl.Int16, strict=False)
col_num_series = scryfall_data.with_columns(int16_collector_number)
col_num_series.get_column('collector_number')

collector_number
i16
280
""
57
63
18
…
12
91
121
14


In [105]:
from sre_constants import error
# prompt: filter the scryfall_data of a given set
def filter_base_set(scryfall_data, set_code):
  """
  Returns the cards of a set that are numbered before its first 'Plains'.

  The collector number is cast to Int16 (non-strict, so values that cannot
  be converted become null) and the first 'Plains' of the set is used as
  the cut-off: only cards with a strictly smaller collector number are kept.
  Rows whose collector number is null are dropped by that comparison.

  Args:
    scryfall_data: The polars DataFrame containing the scryfall data. Must
      have at least the columns 'set', 'name' and 'collector_number'.
    set_code: The set code (e.g., 'woe'); matched case-insensitively
      against the lower-case 'set' column.
  Returns:
    A polars DataFrame sorted by collector number with 'name' as the first
    column, or an empty polars DataFrame (no columns) if the set code
    matches no card.
  Raises:
    ValueError: if the set contains no 'Plains' with a numeric collector
      number, so the cut-off cannot be determined.
  """
  set_code = set_code.lower()
  first_basic_land_name = 'Plains'

  cards_in_set = scryfall_data.filter(pl.col('set') == set_code)
  if cards_in_set.height == 0:
    print(f"No cards found for set code '{set_code}'.")
    return pl.DataFrame() # Return empty DataFrame if no cards in the main set

  # Non-strict cast: collector numbers that are not plain integers become null.
  cards_in_set = cards_in_set.with_columns(
    pl.col('collector_number').cast(pl.Int16, strict=False)
  ).sort('collector_number')

  # Use the smallest *numeric* collector number among the 'Plains' printings,
  # so a null (non-numeric) printing can never become the cut-off.
  plains_numbers = cards_in_set.filter(
    pl.col('name') == first_basic_land_name
  ).get_column('collector_number').drop_nulls()
  if plains_numbers.len() == 0:
    raise ValueError(
      f"Set '{set_code}' has no '{first_basic_land_name}' with a numeric "
      "collector number; cannot locate the end of the base set."
    )
  first_land_collector_number = plains_numbers.min()

  filtered_without_basic_lands = cards_in_set.filter(
    pl.col('collector_number') < first_land_collector_number
  )

  # Make 'name' the first column. Select the plain column here: using
  # pl.col('name').first() would aggregate to a single value that polars
  # broadcasts, giving every row the name of the first card.
  return filtered_without_basic_lands.select(
    pl.col('name'),
    pl.exclude('name')
  )

def filter_extra_set(scryfall_data, set_code, date=None):
  """
  Returns one row per card name for a given (extra) set.

  Args:
    scryfall_data: The polars DataFrame containing the scryfall data. Must
      have at least the columns 'set', 'name', 'collector_number' and,
      when `date` is given, 'released_at'.
    set_code (of the extra set): The set code (e.g., 'spg'); matched
      case-insensitively against the lower-case 'set' column.
    date: Optional release date. When not None, only rows whose
      'released_at' equals this value are kept.
  Returns:
    A polars DataFrame with 'name' as the first column, one row per
    distinct name, sorted by collector number (cast to Int16).
  """
  set_code = set_code.lower()
  extra_cards = scryfall_data.filter(pl.col('set') == set_code)
  if date is not None:
    extra_cards = extra_cards.filter(pl.col('released_at') == date)

  # Non-strict cast: collector numbers that are not plain integers become null.
  extra_cards = extra_cards.with_columns(
    pl.col('collector_number').cast(pl.Int16, strict=False)
  ).sort('collector_number')

  # Rows keep their relative order inside each group, so .first() takes the
  # first printing of each name in the sorted frame.
  # NOTE(review): nulls sort first by default, so a printing with a
  # non-numeric collector number would be picked over a numeric one - confirm
  # this is acceptable.
  unique_name_data = extra_cards.group_by('name').first()

  # group_by does not guarantee group order, so sort again.
  return unique_name_data.sort('collector_number')


def filter_set(scryfall_data, set_code):
  """
  Returns the base cards of a set plus the cards of the companion sets
  released on the same date.

  A companion set is any other set whose 'released_at' equals the release
  date of `set_code` and whose 'set_type' is 'masterpiece' or 'expansion'.
  The base set is filtered with filter_base_set and each companion set with
  filter_extra_set; the results are concatenated in that order, companion
  sets in alphabetical order of their code.

  Args:
    scryfall_data: The polars DataFrame containing the scryfall data.
    set_code: The set code (e.g., 'woe'); matched case-insensitively.

  Returns:
    A polars DataFrame containing cards from the specified set and its
    companion sets.

  Raises:
    ValueError: if no card in scryfall_data has this set code.
  """
  set_code = set_code.lower()

  release_dates = scryfall_data.filter(
    pl.col('set') == set_code
  ).get_column('released_at')
  if release_dates.len() == 0:
    raise ValueError(f"No cards found for set code '{set_code}'.")
  # The release date of the set is read from its first matching row.
  release_date = release_dates[0]

  set_type_list = ['masterpiece', 'expansion']
  is_companion_set = (
    (pl.col('released_at') == release_date)
    & pl.col('set_type').is_in(set_type_list)
    & (pl.col('set') != set_code)
  )
  # unique() gives no ordering guarantee; sort so the output row order is
  # the same on every run.
  extra_sets_list = scryfall_data.filter(
    is_companion_set
  ).get_column('set').unique().sort()

  df_list = [filter_base_set(scryfall_data, set_code)]
  for extra_set in extra_sets_list:
    df_list.append(filter_extra_set(scryfall_data, extra_set, date=release_date))
  return pl.concat(df_list)

# Example usage:
# Replace 'woe' with the desired set code
filtered_data = filter_set(scryfall_data, 'woe')
print(filtered_data.head())
print(filtered_data.shape)

shape: (5, 88)
┌────────────┬────────┬────────────┬────────────┬───┬────────────┬───────────┬─────────┬───────────┐
│ name       ┆ object ┆ id         ┆ oracle_id  ┆ … ┆ hand_modif ┆ content_w ┆ defense ┆ __index_l │
│ ---        ┆ ---    ┆ ---        ┆ ---        ┆   ┆ ier        ┆ arning    ┆ ---     ┆ evel_0__  │
│ str        ┆ str    ┆ str        ┆ str        ┆   ┆ ---        ┆ ---       ┆ f64     ┆ ---       │
│            ┆        ┆            ┆            ┆   ┆ f64        ┆ f64       ┆         ┆ i64       │
╞════════════╪════════╪════════════╪════════════╪═══╪════════════╪═══════════╪═════════╪═══════════╡
│ Archon of  ┆ card   ┆ 00174be7-0 ┆ 180efbd4-0 ┆ … ┆ null       ┆ null      ┆ null    ┆ 59        │
│ the Wild   ┆        ┆ dc8-43b9-8 ┆ f77-41cc-8 ┆   ┆            ┆           ┆         ┆           │
│ Rose       ┆        ┆ 1b6-f25a8c ┆ ae2-2e6ccf ┆   ┆            ┆           ┆         ┆           │
│            ┆        ┆ …          ┆ …          ┆   ┆            ┆          

### Save parquets with new expansions

In [108]:
# Set codes to export; df_dict maps each code to its filtered card table.
expansion_list = [
    'woe', 'lci', 'mkm', 'otj', 'blb',
    'dsk', 'dft', 'tdm', 'fin',
]
df_dict = {code: filter_set(scryfall_data, code) for code in expansion_list}

In [109]:
# Save one parquet file per expansion into the 'card-sets' sub-folder
output_dir = pathlib.Path(folder, scryfall_foldername,'card-sets')
# Create the directory (and any missing parents) if it doesn't exist.
# pathlib's own mkdir is used so the cell does not depend on the `os`
# module, which the import cell of this notebook does not provide.
if not output_dir.exists():
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Created directory: {output_dir}")

# The loop variable is named `set_code` so it does not overwrite the
# notebook-level `expansion` setting.
for set_code, df in df_dict.items():
  output_filename = pathlib.Path(output_dir, f'{set_code}_cards.parquet')
  df.write_parquet(output_filename)
  print(f"Saved {set_code} data to {output_filename}")

Saved woe data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/woe_cards.parquet
Saved lci data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/lci_cards.parquet
Saved mkm data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/mkm_cards.parquet
Saved otj data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/otj_cards.parquet
Saved blb data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/blb_cards.parquet
Saved dsk data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/dsk_cards.parquet
Saved dft data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/dft_cards.parquet
Saved tdm data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/tdm_cards.parquet
Saved fin data to /content/drive/MyDrive/Erdos25/MTGdraft/Scryfall-data/card-sets/fin_cards.parquet
