In [2]:
import pandas as pd
import numpy as np
from typing import Dict
import os
import re
# from google.colab import files
import time


## Download card-level data from 17lands

In [2]:
!wget https://17lands-public.s3.amazonaws.com/analysis_data/cards/cards.csv

--2024-05-22 10:47:22--  https://17lands-public.s3.amazonaws.com/analysis_data/cards/cards.csv
Resolving 17lands-public.s3.amazonaws.com (17lands-public.s3.amazonaws.com)... 54.231.233.41, 3.5.16.146, 52.216.28.100, ...
Connecting to 17lands-public.s3.amazonaws.com (17lands-public.s3.amazonaws.com)|54.231.233.41|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1039892 (1016K) [text/csv]
Saving to: ‘cards.csv’


2024-05-22 10:47:23 (8.30 MB/s) - ‘cards.csv’ saved [1039892/1039892]



## Get dtypes for draft data

In [4]:
# Helper function provided by 17lands.com to set proper datatypes for draft data files

def get_dtypes(filename: str, print_missing: bool = False) -> Dict[str, str]:
    dtypes: Dict[str, str] = {}
    for column in pd.read_csv(filename, nrows=0).columns:
        for regex, column_type in COLUMN_TYPES:
            if regex.match(column):
                dtypes[column] = column_type
                break
        else:
            if print_missing:
                print(f"Could not find an appropriate type for {column}")
    return dtypes

COLUMN_TYPES = (
    # Metadata
    (re.compile(r"^user_n_games_bucket$"), "int16"),
    (re.compile(r"^user_game_win_rate_bucket$"), "float"),
    (re.compile(r"^expansion$"), "str"),
    (re.compile(r"^event_type$"), "str"),
    (re.compile(r"^draft_id$"), "str"),
    (re.compile(r"^draft_time$"), "str"),
    (re.compile(r"^rank$"), "str"),
    # Draft
    (re.compile(r"^event_match_wins$"), "int8"),
    (re.compile(r"^event_match_losses$"), "int8"),
    (re.compile(r"^pack_number$"), "int8"),
    (re.compile(r"^pick_number$"), "int8"),
    (re.compile(r"^pick$"), "str"),
    (re.compile(r"^pick_maindeck_rate$"), "float"),
    (re.compile(r"^pick_sideboard_in_rate$"), "float"),
    (re.compile(r"^pool_.*"), "int8"),
    (re.compile(r"^pack_card_.*"), "int8"),
    # Game + Replay
    (re.compile(r"^game_time$"), "str"),
    (re.compile(r"^build_index$"), "int8"),
    (re.compile(r"^match_number$"), "int8"),
    (re.compile(r"^game_number$"), "int8"),
    (re.compile(r"^opp_rank$"), "str"),
    (re.compile(r"^main_colors$"), "str"),
    (re.compile(r"^splash_colors$"), "str"),
    (re.compile(r"^on_play$"), "bool"),
    (re.compile(r"^num_mulligans$"), "int8"),
    (re.compile(r"^opp_num_mulligans$"), "int8"),
    (re.compile(r"^opp_colors$"), "str"),
    (re.compile(r"^num_turns$"), "int8"),
    (re.compile(r"^won$"), "bool"),
    (re.compile(r"^deck_.*"), "int8"),
    (re.compile(r"^sideboard_.*"), "int8"),
    # Game
    (re.compile(r"^drawn_.*"), "int8"),
    (re.compile(r"^tutored_.*"), "int8"),
    (re.compile(r"^opening_hand_.*"), "int8"),
    # Replay
    (re.compile(r"^candidate_hand_\d$"), "str"),
    (re.compile(r"^opening_hand$"), "str"),
    (re.compile(r"^user_turn_\d+_cards_drawn$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_cards_discarded$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_lands_played$"), "str"),
    (re.compile(r"^user_turn_\d+_cards_foretold$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_creatures_cast$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_non_creatures_cast$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_((user)|(oppo))_instants_sorceries_cast$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_((user)|(oppo))_abilities$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_((user)|(oppo))_cards_learned$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_creatures_attacked$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_creatures_blocked$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_creatures_unblocked$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_creatures_blocking$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_creatures_blitzed$"), "int8"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_player_combat_damage_dealt$"), "str"),  # DEPRECATED
    (re.compile(r"^((user)|(oppo))_turn_\d+_((user)|(oppo))_combat_damage_taken$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_((user)|(oppo))_creatures_killed_combat$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_((user)|(oppo))_creatures_killed_non_combat$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_((user)|(oppo))_mana_spent$"), "float"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_eot_user_cards_in_hand$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_eot_oppo_cards_in_hand$"), "float"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_eot_((user)|(oppo))_lands_in_play$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_eot_((user)|(oppo))_creatures_in_play$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_eot_((user)|(oppo))_non_creatures_in_play$"), "str"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_eot_((user)|(oppo))_life$"), "float"),
    (re.compile(r"^((user)|(oppo))_turn_\d+_eot_((user)|(oppo))_poison_counters$"), "float"),
    (re.compile(r"^user_turn_\d+_cards_tutored$"), "str"),
    (re.compile(r"^oppo_turn_\d+_cards_tutored$"), "int8"),
    (re.compile(r"^oppo_turn_\d+_cards_drawn_or_tutored$"), "int8"),
    (re.compile(r"^oppo_turn_\d+_cards_drawn$"), "int8"),
    (re.compile(r"^oppo_turn_\d+_cards_foretold$"), "int8"),
    (re.compile(r"^((user)|(oppo))_total_cards_drawn$"), "int8"),
    (re.compile(r"^((user)|(oppo))_total_cards_discarded$"), "int8"),
    (re.compile(r"^((user)|(oppo))_total_lands_played$"), "int8"),
    (re.compile(r"^((user)|(oppo))_total_cards_foretold$"), "int8"),
    (re.compile(r"^((user)|(oppo))_total_creatures_cast$"), "int8"),
    (re.compile(r"^((user)|(oppo))_total_creatures_blitzed$"), "int8"),
    (re.compile(r"^((user)|(oppo))_total_non_creatures_cast$"), "int8"),
    (re.compile(r"^((user)|(oppo))_total_instants_sorceries_cast$"), "int8"),
    (re.compile(r"^((user)|(oppo))_total_cards_learned$"), "int8"),
    (re.compile(r"^((user)|(oppo))_total_mana_spent$"), "int16"),
    (re.compile(r"^oppo_total_cards_drawn_or_tutored$"), "int8"),
)


## Function to validate Wins/Losses

In [36]:
# Function to validate Wins/Losses in Draft Data with accurate values from Gameplay Data

def get_game_wins(set_name):
    # Load gameplay results data
    filename = f'game_data_public.{set_name}.PremierDraft.csv.gz'

    if not os.path.exists(filename):
        webpath = f'https://17lands-public.s3.amazonaws.com/analysis_data/game_data/game_data_public.{set_name}.PremierDraft.csv.gz'
        !wget {webpath}
        print(f"Downloaded {filename}.")
    else:
        print(f"{filename} already exists, no need to download.")

    datatypes = get_dtypes(filename)
    game_df = pd.read_csv(filename, usecols=[2,16,17], dtype=datatypes, compression='gzip')

    # Get a count of number of games played
    game_df_2 = game_df.groupby('draft_id').count().reset_index()
    # Get the sum of games won per draft_id
    game_wins = game_df.groupby('draft_id').sum().reset_index()['won']

    # Populate dataframe with corrected values for wins and losses
    game_df_2.rename(columns={'won':'games'}, inplace=True)
    game_df_2['wins'] = game_wins
    game_df_2['wins'] = game_df_2['wins'].clip(upper=7)  # Some MKM drafts logged 8 wins

    # Create dictionary of draft ID and corrected value of wins
    game_wins_dict = game_df_2.set_index('draft_id')['wins'].to_dict()
    return game_wins_dict


def map_game_wins(df, game_wins_dict):
    # Map corrected event wins to draft dataframe
    df['wins'] = df['draft_id'].map(game_wins_dict)
    # Drop rows where no games were ever recorded
    drop_rows = df.loc[df['wins'].isna()].index
    df.drop(drop_rows, inplace=True)
    df.reset_index(drop=True, inplace=True)
    df['wins'] = df['wins'].astype('int')
    return df


## Define Class to handle data pre-processing

In [31]:

class DataPreprocess:

    def __init__(self, df, set_name):
        self.set_name = set_name
        self.df = df
        self.ordered_data = None
        self.max_pack_length = df['pick_number'].max() + 1
        self.pack_cards_indices = []
        self.pool_cards_indices = []
        self.set_cards = None
        self.vocab_size = None

    def initialize(self):
        self.calculate_indices()
        self.load_set_cards()

    def calculate_indices(self):
        for i, col in enumerate(self.df.columns):
            if col.startswith('pack_card_'):
                self.pack_cards_indices.append(i)
            elif col.startswith('pool_'):
                self.pool_cards_indices.append(i)

    def load_set_cards(self):
        cards_df = pd.read_csv('cards_list.csv')
        df_pack_subset = self.df.iloc[:, self.pack_cards_indices]
        df_pack_subset.columns = df_pack_subset.columns.str.replace('pack_card_', '')
        card_names = df_pack_subset.columns.tolist()
        cards_df = cards_df[cards_df['name'].isin(card_names)]
        cards_df = cards_df.drop_duplicates(subset='name').sort_values(by='id').reset_index(drop=True)
        cards_df['token_id'] = cards_df.index + 2
        self.set_cards = cards_df[['name', 'token_id', 'id']]
        self.vocab_size = self.set_cards['token_id'].max()

    def process_data(self):
        print('Preprocessing Data...')
        self.ordered_data = self.order_draft_data()
        self.handle_pick_sequences()
        self.handle_pack_cards()
        # self.apply_padding()
        self.finalize_data()
        self.collate_data()
        print('Preprocessing Complete')

    def order_draft_data(self):
        card_mapper = {row.name: row.token_id for row in self.set_cards.itertuples()}
        complete_drafts = self.df.groupby('draft_id').count()['event_match_wins'] == (self.max_pack_length * 3)
        ordered_data = self.df[self.df['draft_id'].isin(complete_drafts.index)].sort_values(by=['draft_id', 'pack_number', 'pick_number'])
        ordered_data['pick_state'] = ordered_data['pack_number'] * self.max_pack_length + ordered_data['pick_number']
        ordered_data['pick_token'] = ordered_data['pick'].map(card_mapper)
        return ordered_data

    def handle_pick_sequences(self):
        full_sequence = {id: group['pick_token'].to_numpy() for id, group in self.ordered_data.groupby('draft_id')}
        self.ordered_data['pick_sequence'] = self.ordered_data['draft_id'].map(full_sequence)
        self.ordered_data['current_pool'] = self.ordered_data.apply(lambda row: row['pick_sequence'][:row['pick_state']], axis=1)

    def handle_pack_cards(self):
        card_name_to_id = {row['name']: row['token_id'] for _, row in self.set_cards.iterrows()}
        def extract_cards(row):
            cards = [name.replace('pack_card_', '') for name, value in row[self.df.columns[self.pack_cards_indices]].items() if value != 0]
            return [card_name_to_id.get(card, 0) for card in cards]
        self.ordered_data['available_cards'] = self.ordered_data.apply(extract_cards, axis=1)

    def apply_padding(self):
        max_length = self.ordered_data['current_pool'].apply(len).max()
        def pad_sequence(sequence):
            sequence = list(sequence)
            if len(sequence) == 0:
                return [1] + [0] * (max_length - 1)
            else:
                return sequence + [0] * (max_length - len(sequence))
        self.ordered_data['padded_pool'] = self.ordered_data['current_pool'].apply(pad_sequence)

    def insert_win_token(self, sequence, wins):
        a_len = sequence.shape[0]
        new_a = np.full(a_len + 1, (self.vocab_size + 1 + wins))
        new_a[1:] = sequence
        return new_a

    def finalize_data(self):
        self.ordered_data = self.ordered_data[self.ordered_data['available_cards'].apply(len) > 1]
        self.ordered_data['event_match_wins'] = self.ordered_data['event_match_wins'].astype('int')
        self.ordered_data['current_pool_with_wins'] = self.ordered_data.apply(lambda x: self.insert_win_token(x['current_pool'], x['event_match_wins']), axis=1)
        self.ordered_data.reset_index(drop=False, inplace=True)

    # def collate_data(self):
    #     self.df_cleaned = self.ordered_data[['draft_id','padded_pool', 'available_cards', 'pick_token', 'event_match_wins']].rename(columns={
    #         'padded_pool': 'cards_in_pool',
    #         'available_cards': 'cards_in_pack',
    #         'pick_token': 'card_selected',
    #         'event_match_wins': 'wins'
    #     })
    def collate_data(self):
        self.df_cleaned = self.ordered_data[['index','draft_id','current_pool','current_pool_with_wins', 'available_cards', 'pick_token', 'event_match_wins']].rename(columns={
            'current_pool': 'cards_in_pool',
            'available_cards': 'cards_in_pack',
            'pick_token': 'card_selected',
            'event_match_wins': 'wins'
        })


## Load raw draft data, or download directly from 17lands

In [162]:
# Specify the 3 letter set acronym

SET_NAME = 'MKM'

In [7]:

filename = f'draft_data_public.{SET_NAME}.PremierDraft.csv.gz'
gameplay_filename = f'game_data_public.{SET_NAME}.PremierDraft.csv.gz'

# Check if the file already exists, otherwise download it
if not os.path.exists(filename):
    webpath = f'https://17lands-public.s3.amazonaws.com/analysis_data/draft_data/draft_data_public.{SET_NAME}.PremierDraft.csv.gz'
    !wget {webpath}
    print(f"Downloaded {filename}.")
else:
    print(f"{filename} already exists, no need to download.")

if not os.path.exists(gameplay_filename):
    webpath = f'https://17lands-public.s3.amazonaws.com/analysis_data/game_data/game_data_public.{SET_NAME}.PremierDraft.csv.gz'
    !wget {webpath}
    print(f"Downloaded {gameplay_filename}.")
else:
    print(f"{gameplay_filename} already exists, no need to download.")


--2024-05-22 10:47:38--  https://17lands-public.s3.amazonaws.com/analysis_data/draft_data/draft_data_public.MKM.PremierDraft.csv.gz
Resolving 17lands-public.s3.amazonaws.com (17lands-public.s3.amazonaws.com)... 54.231.165.137, 52.217.167.153, 16.182.38.177, ...
Connecting to 17lands-public.s3.amazonaws.com (17lands-public.s3.amazonaws.com)|54.231.165.137|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 218519611 (208M) [text/csv]
Saving to: ‘draft_data_public.MKM.PremierDraft.csv.gz’


2024-05-22 10:47:45 (34.6 MB/s) - ‘draft_data_public.MKM.PremierDraft.csv.gz’ saved [218519611/218519611]

Downloaded draft_data_public.MKM.PremierDraft.csv.gz.
--2024-05-22 10:47:45--  https://17lands-public.s3.amazonaws.com/analysis_data/game_data/game_data_public.MKM.PremierDraft.csv.gz
Resolving 17lands-public.s3.amazonaws.com (17lands-public.s3.amazonaws.com)... 52.217.165.49, 16.182.97.41, 3.5.29.146, ...
Connecting to 17lands-public.s3.amazonaws.com (17lands-public.s3.amaz

## Prepare Card-Level data

In [7]:
SET_NAME = 'SIR'

In [10]:
# Import prepared card feature and oracle text files
card_features = pd.read_parquet('card_features_reduced.parquet')
oracle_texts = pd.read_parquet('oracle_text.parquet')

# Extract Set card list from DataPreprocessing Object
# filename = f'draft_data_public.{SET_NAME}.PremierDraft.csv.gz'
filename = 'draft_data_SIR\SIR_premier_draft_1.csv'
datatypes = get_dtypes(filename)
data_header = pd.read_csv(filename, dtype=datatypes, nrows=0)
card_head = DataPreprocess(data_header, set_name=SET_NAME)
card_head.initialize()
cards_df = card_head.set_cards

# Merge Set card list with imported features
cards_df = cards_df.merge(oracle_texts, left_on='name', right_on='name').drop(columns=['arena_id'], axis=1)
cards_df = cards_df.merge(card_features, left_on='name', right_on='name')

# # Export combined dataset to file
# out_filename = f'{SET_NAME}_cards.parquet' 
# cards_df.to_parquet(out_filename)

## Pre-process raw draft data in chunks, and save into smaller files

In [None]:
filename = f'draft_data_public.{SET_NAME}.PremierDraft.csv.gz'
datatypes = get_dtypes(filename)

# Get the length of full dataset
dataset_length = len(pd.read_csv(filename, dtype=datatypes, compression='gzip', index_col=False, usecols=[0]))

In [None]:
# Number of rows to per chunk
num_rows = 250000
num_chunks = dataset_length // num_rows

for chunk in range(num_chunks):
    t0 = time.time()
    # Load batch of rows from raw data file
    end_idx = (chunk * num_rows)
    data_chunk = pd.read_csv(filename, dtype=datatypes, compression='gzip',
                             nrows=num_rows, header=0, skiprows=range(1, end_idx+1))
    # Preprocess the batch
    processed = DataPreprocess(data_chunk, set_name=SET_NAME)
    processed.initialize()
    processed.process_data()

    # Save batch to file
    f_name = f'{SET_NAME}_preprocessed_{chunk}.parquet.gz'
    processed.df_cleaned.to_parquet(f_name, compression='gzip')
    t1 = round(time.time() - t0)
    print(f'Chunk {chunk} complete in {t1} seconds.')
    files.download(f_name)


Preprocessing Data...
Preprocessing Complete
Chunk 0 complete in 179 seconds.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Re-combine smaller files into final pre-processed dataset

In [None]:
# Load first file
pq_file = f'{SET_NAME}_preprocessed_0.parquet.gz'
pq_0 = pd.read_parquet(pq_file)

# Then iteratively append the rest of the files
for chunk in range(1 ,num_chunks):
  pq_next_file = f'{SET_NAME}_preprocessed_{chunk}.parquet.gz'
  pq_next = pd.read_parquet(pq_next_file)
  pq_0 = pd.concat([pq_0,pq_next])

# Save reconstructed dataset
pq_file_out = f'{SET_NAME}_all.parquet.gz'
pq_0.to_parquet(pq_file_out, compression='gzip')

* LTR draft length: 6,751,107 rows
* SIR draft length: 2,174,413 rows (all processed)
* MKM draft length: 6,180,337 rows (all processed)
* NEO draft length: 4,732,717 rows (all processed)

## Correct wins totals if necessary

In [67]:
filename = f'{SET_NAME}_all.parquet.gz'
out_file = f'{SET_NAME}_corrected.parquet.gz'

df = pd.read_parquet(filename)
game_wins = get_game_wins(SET_NAME)
df = map_game_wins(df, game_wins)
df.to_parquet(out_file)

game_data_public.MKM.PremierDraft.csv.gz already exists, no need to download.


## Validate Process

In [6]:
SET_NAME = 'SIR'

In [32]:
# Import prepared card feature and oracle text files
card_features = pd.read_parquet('card_features_reduced.parquet')
oracle_texts = pd.read_parquet('oracle_text.parquet')

# Extract Set card list from DataPreprocessing Object
# filename = f'draft_data_public.{SET_NAME}.PremierDraft.csv.gz'
filename = 'draft_data_SIR\SIR_premier_draft_1.csv'
datatypes = get_dtypes(filename)
data = pd.read_csv(filename, dtype=datatypes)


data_header = pd.read_csv(filename, dtype=datatypes, nrows=0)
card_head = DataPreprocess(data_header, set_name=SET_NAME)
card_head.initialize()
cards_df = card_head.set_cards

# Merge Set card list with imported features
cards_df = cards_df.merge(oracle_texts, left_on='name', right_on='name').drop(columns=['arena_id'], axis=1)
cards_df = cards_df.merge(card_features, left_on='name', right_on='name')

processed = DataPreprocess(data, set_name=SET_NAME)
processed.initialize()
processed.process_data()


Preprocessing Data...
Preprocessing Complete


In [60]:
processed.vocab_size
processed.set_cards[processed.set_cards['token_id']==358]

Unnamed: 0,name,token_id,id
356,Travel Preparations,358,86670


Unnamed: 0,name,token_id,id
38,Rattlechains,40,72228


In [94]:
processed = DataPreprocess(data, set_name=SET_NAME)
processed.initialize()
processed.process_data()


Preprocessing Data...
Preprocessing Complete


In [1]:
processed.df_cleaned.to_parquet('SIR_no_pad.parquet')

In [73]:
# df['pool_with_wins'] 

vocab_size = processed.set_cards['token_id'].max()

a = df['cards_in_pool'][3]
a_len = df['cards_in_pool'][3].shape[0]

np.full_like(df['cards_in_pool'][3], df['wins'][3])

new_a = np.full(a_len + 1, (vocab_size + 1 + df['wins'][3]))
new_a[1:] = a
new_a

array([362,  40,  96,  39], dtype=int64)

In [89]:
df['cards_in_pack'].to_numpy()

array([list([317, 212, 276, 83, 286, 36, 86, 22, 225, 132, 24, 198, 40, 173, 242]),
       list([321, 264, 186, 296, 154, 188, 127, 71, 24, 229, 96, 6, 203, 53]),
       list([214, 119, 21, 191, 196, 62, 356, 165, 39, 33, 102, 239, 142]),
       ..., list([157, 160, 267, 26]), list([46, 239, 274]),
       list([29, 102])], dtype=object)