In [1]:
import numpy as np
import pandas as pd
import mplcyberpunk as mcb # if this doesn't work, pip install mplcyberpunk
import matplotlib.pyplot as plt

plt.style.use('cyberpunk') # if this fails, check that the mplcyberpunk import above succeeded (it presumably registers this style)

# Import the Data

In [73]:
# Load the card data, using the first CSV column as the row index,
# then preview the first five rows.
df = pd.read_csv(filepath_or_buffer="mtg_data.csv", index_col=0)
df.head(n=5)

Unnamed: 0,id,name,cmc,layout,reserved,type_line,artist,booster,border_color,frame,...,has_kw_Vigilance,has_kw_Transform,has_kw_Cycling,has_kw_Haste,has_kw_Trample,has_kw_Mill,has_kw_Flash,has_kw_Scry,number_keywords,class_price
0,0000579f-7b35-4ed3-b44c-db2a538066fe,Fury Sliver,6.0,normal,False,Creature — Sliver,Paolo Parente,True,black,2003,...,0,0,0,0,0,0,0,0,0,0
1,0000579f-7b35-4ed3-b44c-db2a538066fe,Fury Sliver,6.0,normal,False,Creature — Sliver,Paolo Parente,True,black,2003,...,0,0,0,0,0,0,0,0,0,1
2,00006596-1166-4a79-8443-ca9f82e6db4e,Kor Outfitter,2.0,normal,False,Creature — Kor Soldier,Kieran Yanner,True,black,2003,...,0,0,0,0,0,0,0,0,0,0
3,00006596-1166-4a79-8443-ca9f82e6db4e,Kor Outfitter,2.0,normal,False,Creature — Kor Soldier,Kieran Yanner,True,black,2003,...,0,0,0,0,0,0,0,0,0,2
4,0000cd57-91fe-411f-b798-646e965eec37,Siren Lookout,3.0,normal,False,Creature — Siren Pirate,Chris Rallis,True,black,2015,...,0,0,0,0,0,0,0,0,2,0


In [3]:
# Show the dtype pandas assigned to each column of the loaded frame.
df.dtypes

id                          object
name                        object
cmc                        float64
keywords                    object
layout                      object
reserved                      bool
type_line                   object
artist                      object
booster                       bool
border_color                object
frame                       object
full_art                      bool
promo                         bool
rarity                     float64
released_at                 object
reprint                       bool
set                         object
story_spotlight               bool
textless                      bool
variation                     bool
power_num                  float64
toughness_num              float64
is_foil                      int64
price                      float64
has_white                    int64
has_blue                     int64
has_black                    int64
has_red                      int64
has_green           

In [4]:

# Average number of rows per distinct artist value
# (dropna=False so a missing artist still counts as one distinct value).
len(df) / df['artist'].nunique(dropna=False)

66.88113542282673

In [5]:
# Report the number of rows, then list every column name on its own
# tab-indented line.
n_cards = df.shape[0]
print(f"Number of Cards: {n_cards}")
print("Columns:")
for column_name in df.columns:
    print(f"\t{column_name}")

Number of Cards: 113096
Columns:
	id
	name
	cmc
	keywords
	layout
	reserved
	type_line
	artist
	booster
	border_color
	frame
	full_art
	promo
	rarity
	released_at
	reprint
	set
	story_spotlight
	textless
	variation
	power_num
	toughness_num
	is_foil
	price
	has_white
	has_blue
	has_black
	has_red
	has_green
	total_num_colors
	len_oracle_text
	len_mana_types_produced
	has_frame_effect
	has_flavor_text
	class_price


# Analysis

In [6]:
# Fraction of rows that fall into each distinct `class_price` value.
uq, counts = np.unique(df.class_price, return_counts=True)
perc = counts / len(df)

# One "<class> : <fraction>" line per class, in ascending class order.
for price_class, fraction in zip(uq, perc):
    print(f"{price_class} : {fraction}")

0 : 0.5524421730211502
1 : 0.1826147697531301
2 : 0.17309188653886964
3 : 0.08382259319516164
4 : 0.008028577491688477


Looking at the printout above, we see that the majority class (class 0) makes up about 55% of the data. A model that always predicts the majority class would therefore already reach roughly 55% accuracy, so our model needs to achieve an accuracy greater than 55% for us to have meaningful success.

## Looking at Keywords

In [20]:
import re

In [54]:
# Tally how often each keyword occurs across all rows.
#
# Every `keywords` entry is split on commas; each piece then has every
# character other than letters, digits and spaces removed and its
# surrounding whitespace trimmed. Empty pieces are ignored.
#
# Results:
#   counts : dict mapping keyword -> number of occurrences
#   all_k  : sorted list of the distinct keywords
all_k = set()
counts = dict()
# Iterate the column directly; building a full row with df.iloc[i] on every
# iteration is far slower and only this one column is needed.
for raw_keywords in df['keywords']:
    # Missing values are not strings and have no .split(); skip them.
    if not isinstance(raw_keywords, str):
        continue

    for k in raw_keywords.split(','):
        # Strip punctuation first, then any leading/trailing spaces, so that
        # a piece consisting only of spaces is treated as empty.
        kw = re.sub(r'[^A-Za-z0-9 ]+', '', k).strip()
        if kw == '':
            continue

        all_k.add(kw)

        # A keyword's first occurrence must count as 1, not 0.
        counts[kw] = counts.get(kw, 0) + 1

# Sorted so the list order is stable from run to run.
all_k = sorted(all_k)
print(len(all_k))

543


In [72]:
# Rank keyword counts from most to least frequent.
kw_counts = np.array(list(counts.values()))
kw_sorted = np.flip(np.sort(kw_counts))

# Number of keywords whose count is strictly more than 1% of the row count.
share_pct = kw_sorted / len(df) * 100
num_keep = np.sum(share_pct > 1.0)

mxkw = kw_sorted.max()

# Print the keywords whose count matches one of the retained top counts.
kept_counts = kw_sorted[:num_keep]
frequent_keywords = [name for name, n in counts.items() if n in kept_counts]
print(frequent_keywords)

['Flying', 'Enchant', 'First strike', 'Equip', 'Vigilance', 'Transform', 'Cycling', 'Haste', 'Trample', 'Mill', 'Flash', 'Scry']


The above list shows all keywords that appear in more than 1% of the dataset. For now, we will keep these.