## Set Up

Choose the MTG set and the draft format (Premier or Traditional) to analyze.

In [None]:
# Choose the MTG set. This is the 3 letter code for a set.
# It is interpolated into the data file names and used to filter the card list.
EXPANSION = "LCI"

# Draft format identifiers; the selected one is interpolated into the data file names.
FORMAT_PREMIER_DRAFT = "PremierDraft"
FORMAT_TRADITIONAL_DRAFT = "TradDraft"

# Choose the format to analyze (one of the FORMAT_* constants above)
FORMAT = FORMAT_PREMIER_DRAFT

### Download the draft dataset

Download the datasets. The datasets are from the [17 Lands Public Data Sets](https://www.17lands.com/public_datasets).

In [None]:
GAME_DATA_FILE = f"game_data_public.{EXPANSION}.{FORMAT}.csv.gz"
GAME_DATA_REMOTE_URL = f"https://17lands-public.s3.amazonaws.com/analysis_data/game_data/{GAME_DATA_FILE}"
DRAFT_DATA_FILE = f"draft_data_public.{EXPANSION}.{FORMAT}.csv.gz"
DRAFT_DATA_REMOTE_URL = f"https://17lands-public.s3.amazonaws.com/analysis_data/draft_data/{DRAFT_DATA_FILE}"

!wget {GAME_DATA_REMOTE_URL}
!wget {DRAFT_DATA_REMOTE_URL}
!wget https://17lands-public.s3.amazonaws.com/analysis_data/cards/cards.csv

### Import Pandas and Set Useful Options

Pandas is what we'll use to analyze the data. For more info on Pandas, see the [docs](https://pandas.pydata.org/docs/user_guide/10min.html).

In [None]:
import gzip
import pandas as pd

# Keep wide frames on a single line instead of wrapping them across several blocks.
pd.options.display.expand_frame_repr = False
# The data files have one column per card, so allow many columns to be shown.
pd.options.display.max_columns = 1600

### Explore The Data Set

There are 3 key datasets to explore. The main ones are the draft and game datasets for an MTG set. Another dataset that is often useful is the list of all cards on Arena.

In [None]:
# Game Data
# Read only the first 100 rows for a quick look. nrows gives the same frame as
# taking the first 100-row chunk, without leaving a chunk iterator open.
df = pd.read_csv(GAME_DATA_FILE, nrows=100)
df.head(25)

In [None]:
# Draft Data
# Read only the first 100 rows for a quick look. nrows gives the same frame as
# taking the first 100-row chunk, without leaving a chunk iterator open.
df = pd.read_csv(DRAFT_DATA_FILE, nrows=100)
df.head(55)

In [None]:
# Card Data: the full card list, then only the rows for the chosen set
cards_table = pd.read_csv("cards.csv")
set_cards = cards_table.loc[cards_table["expansion"].eq(EXPANSION)]
set_cards.head(25)

## Analyze Data

### Basic Helper Functions

Some useful helper functions for common ways of using the datasets.

In [None]:
def get_all_drafts(draft_data_file=None, chunksize=100000):
  """
  Returns a table with one row per draft

  The draft data file is streamed in chunks and only the draft-level columns
  listed in `cols` are loaded. For each draft_id the first row in file order
  is kept, so the result does not depend on where chunk boundaries fall.
  Assumes the selected columns hold the same values on every row of a draft
  (TODO confirm against the data set documentation).

  Args:
    draft_data_file: Path to a draft data CSV (may be compressed). Defaults to DRAFT_DATA_FILE.
    chunksize: Number of rows to read per chunk. Only affects memory use.

  Returns:
    A DataFrame with one row per distinct draft_id. The index holds the
    original row numbers from the file.
  """
  if draft_data_file is None:
    draft_data_file = DRAFT_DATA_FILE

  cols = ["expansion", "event_type", "draft_id", "draft_time", "rank", "event_match_wins", "event_match_losses"]
  chunks = list()
  for draft_data in pd.read_csv(
      draft_data_file,
      chunksize=chunksize,
      usecols=cols
      ):
    # Shrink each chunk to one row per draft before holding on to it
    chunks.append(draft_data.drop_duplicates(subset=["draft_id"]))

  if not chunks:
    # No data rows at all: pd.concat would raise on an empty list
    return pd.DataFrame(columns=cols)

  all_drafts = pd.concat(chunks)

  # Remove duplicates in case of drafts that show up in multiple chunks.
  # Uses the same keep-first rule as the per-chunk pass above.
  all_drafts = all_drafts.drop_duplicates(subset=["draft_id"])
  return all_drafts

In [None]:
def get_game_data_cols(game_data_file=None):
  """
  Returns the columns in the game data file that includes metadata and which cards were in the deck. Filters out other columns to reduce size of the dataset

  Only the header row is parsed, so no data rows are loaded.

  Args:
    game_data_file: Path to a game data CSV (may be compressed). Defaults to GAME_DATA_FILE.

  Returns:
    A list of column names: the fixed metadata columns, followed by every
    column whose name starts with "deck_", in file order.
  """
  if game_data_file is None:
    game_data_file = GAME_DATA_FILE

  # nrows=0 reads just the header; column names are all that is needed here
  col_names = list(pd.read_csv(game_data_file, nrows=0).columns)
  gd_card_cols = [x for x in col_names if x.startswith("deck_")]

  gd_base_cols = ['draft_id', 'main_colors', 'splash_colors', 'user_n_games_bucket', 'user_game_win_rate_bucket']
  gd_all_cols = gd_base_cols + gd_card_cols
  return gd_all_cols

def get_all_decks(game_data_file=None, chunksize=100000):
  """
  Returns the last deck that was used in each draft.

  The game data file is streamed in chunks, loading only the columns from
  get_game_data_cols(). For each draft_id the last row in file order is kept.
  "Last" means last in the file; whether that is the chronologically last
  game depends on how the file is ordered (TODO confirm). The "deck_" prefix
  is stripped from the card columns of the result.

  Args:
    game_data_file: Path to a game data CSV (may be compressed). Defaults to GAME_DATA_FILE.
    chunksize: Number of rows to read per chunk. Only affects memory use.

  Returns:
    A DataFrame with one row per distinct draft_id: the metadata columns plus
    one column per card, named without the "deck_" prefix.
  """
  if game_data_file is None:
    game_data_file = GAME_DATA_FILE

  gd_all_cols = get_game_data_cols(game_data_file)
  chunks = list()
  for game_data in pd.read_csv(
      game_data_file,
      chunksize=chunksize,
      usecols=gd_all_cols
      ):
    # Drop duplicates on draft id, keep the last
    game_data_no_dups = game_data.drop_duplicates(subset=["draft_id"], keep="last")
    chunks.append(game_data_no_dups)

  # pd.concat raises on an empty list, so fall back to an empty frame when the file has no data rows
  all_games = pd.concat(chunks) if chunks else pd.DataFrame(columns=gd_all_cols)

  # A draft can appear in several chunks; keep only its last row overall
  all_games_no_dups = all_games.drop_duplicates(subset=["draft_id"], keep="last")
  all_games_names_fixed = all_games_no_dups.rename(columns=lambda x: x[len("deck_"):] if x.startswith("deck_") else x)
  return all_games_names_fixed

In [None]:
# This step may take a few minutes: each helper reads its entire data file in chunks
# One row per draft_id from the game data, card columns named without the "deck_" prefix
all_decks = get_all_decks()
# One row per draft_id from the draft data
draft_metadata = get_all_drafts()
all_decks.head(25)