In [None]:
# Mount Google Drive into the Colab runtime so files stored on the drive can be
# read through the local filesystem under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

# Data folder on the mounted drive.
# NOTE(review): later cells read from relative "Data\..." paths instead of this
# variable, so the notebook appears to mix Colab and local runs — confirm which
# environment is the current one.
directory = '/content/drive/MyDrive/Colab Notebooks/data'

Mounted at /content/drive/


In [1]:
import pandas as pd
import numpy as np
import regex as re
import gzip
import csv
import json
import torch
import tensorflow as tf
import pickle
import requests
import re
pd.set_option('display.max_rows', 500)

In [None]:
# only a single epoch of 50 of those datasets was used

# Load a very small subset of the CSV.
# For each pick (one CSV line) we need to create a (number of cards in pack) x 3 torch.Tensor.
# TODO: ask how to make the model train directly on the data instead of creating a file for it (efficiency/storage issues).

# 0. Create new columns: anchors, positives, negatives.
# 1. Anchors: already available (pool_card), repeated over the number of cards in the pack;
#    we just need to create a new column holding its one-hot tensor.
# 2. Positives: already available (pick); handled the same way as the anchors.
# 3. Negatives: a representation of each possible WRONG pick.
# Take inspiration from their code.
# Ultimately, each pick/line should result in anywhere between 2 and 13 lines (training examples).

In [None]:
def create_card_dict(df, pathout):
  """Build the card-name -> index lookup from the pack columns and pickle it.

  Every column of ``df`` whose name starts with ``pack_card`` contributes one
  entry. The leading ``pack_card_`` prefix is stripped to obtain the key, and
  indices are assigned in column order starting at 0.

  Args:
    df: DataFrame whose columns include the ``pack_card_<name>`` columns.
    pathout: Path of the pickle file to write.

  Returns:
    The dictionary that was written to ``pathout``.
  """
  prefix = 'pack_card_'
  pack_columns = df.columns[df.columns.str.startswith('pack_card')]
  card_dict = {}
  for index, column in enumerate(pack_columns):
    # Strip only the leading prefix so the rest of the name is never altered.
    name = column[len(prefix):] if column.startswith(prefix) else column
    card_dict[name] = index
  # The context manager closes the file; no explicit close() is needed.
  with open(pathout, 'wb') as f:
    pickle.dump(card_dict, f)
  return card_dict

In [None]:
#create_card_dict(df, "/content/drive/MyDrive/Colab Notebooks/data/card_dict.pt")

In [2]:
#loading card_dict
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
# The with-block guarantees the file handle is closed once the dict is loaded.
with open(r"Data\card_dictDMU.pt", "rb") as card_dict_file:
    card_dict = pickle.load(card_dict_file)

In [3]:
#defining data_types per column and load data function
# Column-name patterns and the dtype each matching column is read with.
# load_data requires a pattern to match the WHOLE column name, so 'pick' does
# not also claim 'pick_number' or 'pick_maindeck_rate'.
COLUMN_REGEXES = {
    re.compile(r'user_game_win_rate_bucket'): 'float16',
    re.compile(r'user_n_games_bucket'): 'int8',
    re.compile(r'draft_id'): 'str',
    re.compile(r'draft_time'): 'str',
    re.compile(r'expansion'): 'str',
    re.compile(r'event_type'): 'str',
    re.compile(r'event_match_wins'): 'int8',
    re.compile(r'event_match_losses'): 'int8',
    re.compile(r'pack_number'): 'int8',
    re.compile(r'pick_number'): 'int8',
    re.compile(r'pick'): 'str',
    re.compile(r'pick_maindeck_rate'): 'float16',
    re.compile(r'pick_sideboard_in_rate'): 'float16',

    re.compile(r'pool_.*'): 'int8',
    re.compile(r'pack_card_.*'): 'int8',
}

def load_data(filename, chunksize=100000):
    """Open a draft CSV for chunked reading with compact dtypes.

    The header is read once to work out a dtype for every column via
    COLUMN_REGEXES; a fixed set of columns is excluded from the parse.

    Args:
        filename: Path of the CSV file to read.
        chunksize: Number of rows per chunk handed to ``pd.read_csv``. With
            the default (an integer) the result is an iterator of DataFrame
            chunks; passing ``None`` makes pandas return one DataFrame.

    Returns:
        Whatever ``pd.read_csv`` returns for the given ``chunksize`` — by
        default an iterable reader yielding DataFrame chunks.
    """
    # Columns excluded from the parsed frame ('pick_number' and 'pack_number'
    # are intentionally kept).
    skipcols = {
        'draft_time',
        'event_type',
        'expansion',
        'event_match_wins',
        'event_match_losses',
        'user_n_games_bucket',
        'user_game_win_rate_bucket',
        'pick_maindeck_rate',
        'pick_sideboard_in_rate',
        'draft_id',
        'rank',
    }

    col_names = pd.read_csv(filename, nrows=0).columns
    data_types = {}
    for c in col_names:
        if c in skipcols:
            continue
        for (r, t) in COLUMN_REGEXES.items():
            # fullmatch + break: a prefix match without break let the 'pick'
            # pattern override 'pick_number' and read it as text.
            if r.fullmatch(c):
                data_types[c] = t
                break

    return pd.read_csv(
        filename,
        dtype=data_types,
        chunksize=chunksize,
        usecols=lambda x: x not in skipcols,
    )

In [16]:
#loading and sorting dataframe
# Read the training CSV chunk by chunk (load_data is iterated, one chunk per
# step) and concatenate the chunks into a single DataFrame.
data_chunks = []
for chunk in load_data(r"Data\finaltrain.csv"):
  data_chunks.append(chunk)
df = pd.concat(data_chunks, axis=0)
# Keep the first column in front and sort the remaining column names.
# NOTE(review): the name pick_col assumes the first column is 'pick' — confirm
# against the CSV layout.
pick_col = np.array(df.columns[0])
sorted_cols = np.sort(df.columns[1:].tolist())
sorted_cols = np.insert(sorted_cols, 0, pick_col, axis=0)
df = df.loc[:, sorted_cols]

#creating anchor, positive and negative columns
# (plain Python lists / strings here, not tensors)
# anchors: one list per row holding that row's pool_* values.
anchor_cols = df.columns[df.columns.str.startswith('pool_')]
df['anchors'] = df[anchor_cols].apply(lambda x: x.tolist(), axis=1)
# positives: card_dict index of the picked card, stored as a string.
# negatives: one list per row holding all pack_card* values (the picked card
# is still included at this point).
pack_cols = df.columns[df.columns.str.startswith('pack_card')]
df['positives'] = df['pick'].apply(lambda x: str(card_dict[x]))
df["negatives"] = df[pack_cols].apply(lambda x: x.tolist(), axis=1)

#creating training dataframe (columns = positives, negatives, anchors)
# After this line df no longer contains the pick / pool_* / pack_card* columns.
df = df[["positives","negatives","anchors"]]

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5957238 entries, 0 to 5957237
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   positives  object
 1   negatives  object
 2   anchors    object
dtypes: object(3)
memory usage: 136.4+ MB


Creating data files to feed into mtg_dataset builder

In [18]:
df[0:]

Unnamed: 0,positives,negatives,anchors
0,171,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,94,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,207,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,206,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,200,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
5957233,176,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ..."
5957234,81,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ..."
5957235,73,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ..."
5957236,176,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ..."


In [19]:
def preprocess_data(df, card_dict, pathout, examples_per_file=997920, write_remainder=False):
  """Expand every pick into training lines and pickle them in fixed-size files.

  For each row, one line ``"<positive>;<negative>;<anchors>"`` is produced per
  card present in the pack other than the picked one. ``anchors`` is a
  comma-separated list of pool card indices, each index repeated as many times
  as its count in the row's ``anchors`` list.

  Lines are written to ``pathout + 'train_data<N>.pt'`` as pickled lists of
  exactly ``examples_per_file`` strings.

  Args:
    df: DataFrame with columns ``positives`` (index of the picked card),
      ``negatives`` (per-card values for the pack) and ``anchors`` (per-card
      counts for the pool).
    card_dict: Unused; kept so existing calls keep working.
    pathout: Path prefix for the output files (e.g. ``"training_data/"``).
    examples_per_file: Number of lines stored in each output file.
    write_remainder: When False (default, the historical behaviour) the
      trailing lines that do not fill a whole file are discarded. When True
      they are written to one final, shorter file.

  Returns:
    The number of files written.

  Raises:
    ValueError: If ``examples_per_file`` is not positive.
  """
  import os  # local import: not provided by the notebook's import cell

  if examples_per_file <= 0:
    raise ValueError("examples_per_file must be a positive integer")

  # Make sure the target folder exists so the first dump does not fail.
  out_dir = os.path.dirname(pathout)
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)

  def _write(batch, number):
    # The context manager closes the file; no explicit close() is needed.
    with open(pathout+'train_data'+str(number)+'.pt','wb') as w:
      pickle.dump(batch, w)

  file_num = 0
  file_data = []
  # Iterating the three columns directly avoids building a Series per row.
  for positives, negatives_col, anchors_col in zip(df['positives'], df['negatives'], df['anchors']):
    positive = str(positives)

    # Compare whole indices: a substring test ("1" in "171") would wrongly
    # drop other cards whose index happens to be contained in the pick's index.
    negatives = [str(i) for i, val in enumerate(negatives_col) if val > 0 and str(i) != positive]

    anchors = []
    for i, val in enumerate(anchors_col):
      if val > 0:
        # One entry per copy of the card in the pool.
        anchors.extend([str(i)] * int(val))
    anchors = ",".join(anchors)

    file_data.extend(f"{positive};{neg};{anchors}" for neg in negatives)

    while len(file_data) >= examples_per_file:
      _write(file_data[:examples_per_file], file_num)
      file_num += 1
      file_data = file_data[examples_per_file:]

  if write_remainder and file_data:
    _write(file_data, file_num)
    file_num += 1
  return file_num

In [20]:
preprocess_data(df, card_dict, "training_data/")

Extra Stuff

In [None]:
#visualizing pack contents
# Show which pack_card* columns are set to 1 for a single row.
# NOTE: df must still contain the pack_card* columns; the cell that reduces df
# to positives/negatives/anchors drops them.
row = 0
pack_card_cols = df.columns[df.columns.str.startswith('pack_card')]
pack_row = df.loc[row, pack_card_cols]
pack = pack_row[pack_row == 1]
pack

In [None]:
#creating anchor, positive and negative tensor columns
# NOTE: expects the wide frame that still has the pick, pool_* and pack_card*
# columns.
# anchors: one tensor per row built from the pool_* values.
anchor_cols = df.columns[df.columns.str.startswith('pool_')]
df['anchors'] = df[anchor_cols].apply(lambda x: torch.tensor(x.values.tolist()), axis=1)
pack_cols = df.columns[df.columns.str.startswith('pack_card')]
# positives: the pack value at the picked card's column, 0 everywhere else.
# The column is matched by its exact name; a substring test would also flag any
# card whose column name merely contains the picked card's name.
df['positives'] = df.apply(lambda row: torch.tensor([int(row[col] if col == 'pack_card_' + row['pick'] else 0) for col in pack_cols]), axis=1)
# negatives: the pack tensor with the picked card subtracted out.
df["negatives"] = df[pack_cols].apply(lambda x: torch.tensor(x.values.tolist()), axis=1)
df["negatives"] = df["negatives"] - df["positives"]

In [None]:
# API request form taken from Ryan Saxe
def get_card_rating_data(expansion, endpoint=None, start="2022-09-01", end="2023-05-24", colors=None):
    """Download card ratings from 17lands and return rarity/color per card.

    Args:
        expansion: Set code; upper-cased before being put in the URL.
        endpoint: Full URL to query. When None, the PremierDraft card-ratings
            URL is built from the other arguments.
        start: Optional ``start_date`` query value (ignored if ``endpoint``
            is given).
        end: Optional ``end_date`` query value (ignored if ``endpoint`` is
            given).
        colors: Optional ``colors`` query value (ignored if ``endpoint`` is
            given).

    Returns:
        DataFrame indexed by card ``name`` with the columns ``rarity`` and
        ``color``; missing values in the response are filled with 0.0.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if endpoint is None:
        endpoint = f"https://www.17lands.com/card_ratings/data?expansion={expansion.upper()}&format=PremierDraft"
        if start is not None:
            endpoint += f"&start_date={start}"
        if end is not None:
            endpoint += f"&end_date={end}"
        if colors is not None:
            endpoint += f"&colors={colors}"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(endpoint, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    card_df = pd.DataFrame(response.json()).fillna(0.0)
    card_df = card_df.set_index("name")
    return card_df[["rarity", "color"]]

In [None]:
def replace_colors(match):
    """Spell out a run of colour letters, e.g. a match on 'WU' -> 'White, Blue'.

    Meant to be used as the replacement callback of a regex substitution.
    Each character of the matched text is looked up individually; characters
    without a known colour name are passed through unchanged.

    Args:
        match: Regex match object whose full matched text is translated.

    Returns:
        The translated names joined with ', '.
    """
    full_names = {'W': 'White', 'U': 'Blue', 'B': 'Black', 'R': 'Red', 'G': 'Green'}
    return ', '.join(full_names.get(letter, letter) for letter in match.group(0))

def add_stats(rate_df, stats_df):
  """Join pick-rate figures with readable rarity and colour information.

  The five basic lands are appended to ``stats_df`` (rarity 'C', no colour),
  rarity names are shortened to single letters, colour letters are spelled
  out via ``replace_colors`` and an empty colour becomes 'Colourless'. The
  result is placed side by side with ``rate_df``.

  Args:
    rate_df: Frame with the pick-rate columns.
    stats_df: Frame with the columns ``rarity`` and ``color``.

  Returns:
    ``rate_df`` and the enriched stats concatenated along the columns.
  """
  # Basic lands get rarity 'C' and an empty colour.
  land_names = ['Plains', 'Island', 'Swamp', 'Mountain', 'Forest']
  basic_lands = pd.DataFrame(
    {'rarity': ['C'] * len(land_names), 'color': [''] * len(land_names)},
    index=land_names)
  with_lands = pd.concat([stats_df, basic_lands])

  # Long rarity names -> single-letter codes.
  rarities = {'uncommon': 'U', 'rare': 'R', 'common': 'C', 'mythic': 'M', 'basic':'C'}
  with_lands["rarity"] = with_lands['rarity'].replace(rarities)

  # Colour letters -> full names; an empty colour is labelled explicitly.
  spelled_out = with_lands['color'].str.replace(r'[GWRBU]+', replace_colors, regex=True)
  with_lands['color'] = spelled_out.replace('', 'Colourless')

  return pd.concat([rate_df, with_lands], axis=1)

In [None]:
# Creating the pick rate file

def get_pickrate(df, path, expansion="DMU"):
  """Compute per-card pick rates and save them, with rarity/colour, as CSV.

  The pick rate of a card is the number of times it was picked divided by the
  sum of its ``pack_card_<name>`` column. Rarity and colour are fetched with
  ``get_card_rating_data`` (a network request) and merged in by ``add_stats``.

  Args:
    df: Frame with a ``pick`` column and ``pack_card_<name>`` columns.
    path: Destination CSV path; written with the index and without a header.
    expansion: Set code passed to ``get_card_rating_data``.
  """
  prefix = 'pack_card_'

  # Step 1: Count the number of times each card was picked
  pick_counts = df['pick'].value_counts().sort_index()

  # Step 2: Count the number of times each card was possible to be picked
  pack_cols = df.columns[df.columns.str.startswith(prefix)]
  possible_counts = df[pack_cols].sum()
  # Every selected column starts with the prefix, so drop it by position
  # rather than replacing the text wherever it occurs in the name.
  possible_counts.index = possible_counts.index.str[len(prefix):]

  # Step 3: Calculate the pick rate (aligned on card name)
  pick_rate = pick_counts / possible_counts

  # Step 4: Create pickrate table; cards without a value get 0
  rate_df = pd.concat([pick_rate.rename('pick_rate'), pick_counts.rename('pick_count')], axis=1)
  rate_df['pick_rate'] = rate_df['pick_rate'].fillna(0)
  rate_df['pick_count'] = rate_df['pick_count'].fillna(0)

  # Step 5: Get rarity and color for graphics/tables
  stats_df = get_card_rating_data(expansion)
  final_df = add_stats(rate_df, stats_df)

  # Step 6: Save to csv
  final_df.to_csv(path, index=True, header=False)

In [None]:
def compute_pick_chance(dataset, pathout='/content/drive/MyDrive/Colab Notebooks/data/pickratefull.csv'):
  """Compute pick rates over every pick in ``dataset`` and write them to CSV.

  Args:
    dataset: Path of the draft CSV to read with ``load_data``.
    pathout: Destination CSV path handed to ``get_pickrate``.
  """
  # load_data reads in chunks and returns an iterable reader, not a DataFrame,
  # so the chunks are stitched together before any column access.
  df = pd.concat(list(load_data(dataset)), axis=0)
  # Keep the first three columns in front and sort the remaining ones by name.
  ordered_cols = list(df.columns[:3]) + sorted(df.columns[3:])
  df = df.loc[:, ordered_cols]
  get_pickrate(df, pathout)

In [None]:
# Create first pick rate file

def compute_firstpick_chance(dataset, pathout='/content/drive/MyDrive/Colab Notebooks/data/firstpickrate.csv'):
  """Compute pick rates over first picks only and write them to CSV.

  A first pick is a row where both ``pack_number`` and ``pick_number`` are 0.

  Args:
    dataset: Path of the draft CSV to read with ``load_data``.
    pathout: Destination CSV path handed to ``get_pickrate``.
  """
  # load_data reads in chunks and returns an iterable reader, not a DataFrame,
  # so the chunks are stitched together before any column access.
  df = pd.concat(list(load_data(dataset)), axis=0)
  # Keep the first three columns in front and sort the remaining ones by name.
  ordered_cols = list(df.columns[:3]) + sorted(df.columns[3:])
  df = df.loc[:, ordered_cols]
  # Guard against pick_number having been parsed as text before comparing to 0.
  df["pick_number"] = df["pick_number"].astype("int8")

  # Step 1: Filter df to only get first picks (pack_number AND pick_number are both 0)
  filtered_df = df[(df['pack_number'] == 0) & (df['pick_number'] == 0)]
  # Step 2: Get first pick rates
  get_pickrate(filtered_df, pathout)

In [None]:
compute_pick_chance('/content/drive/MyDrive/Colab Notebooks/data/draft_data_public.DMU.PremierDraft.csv')

In [None]:
compute_firstpick_chance('/content/drive/MyDrive/Colab Notebooks/data/draft_data_public.DMU.PremierDraft.csv')