# Clustering

**This workbook conducts a clustering analysis on a specified set of card-vectors**

In [None]:
# Set code for this analysis; it selects the model sub-directory, the Scryfall
# cache location and the metadata request further down.
set_abbreviation = "one"

In [None]:
from sklearn.manifold import TSNE
from gensim.models import Word2Vec, KeyedVectors
import altair as alt
import pandas as pd
import SetTools
import json
import os

### Load the model containing the card vectors of interest

In [None]:
# File name of the saved model, looked up under models/<set_abbreviation>/.
model_name = "ONE.model"

In [None]:
# Models live at <working dir>/models/<set>/<model file>.
cur_dir = os.getcwd()
model_path = '/'.join([cur_dir, 'models', set_abbreviation, model_name])
model = Word2Vec.load(model_path)

In [None]:
# Keep just the KeyedVectors holding the card embeddings; the rest of the
# trained model is not needed from here on, so drop the reference to it.
card_vectors_keyed = model.wv
del model

# Look up every key (card name) to get the embedding vectors themselves.
card_vectors = card_vectors_keyed[card_vectors_keyed.index_to_key]

### Load card metadata from the Scryfall API

In [None]:
# Fetch card details for the configured set through the project-local SetTools helper.
# NOTE(review): the result is JSON-dumped and fed to pd.DataFrame(...).transpose() below,
# so it is presumably a dict keyed by card name — confirm against SetTools.
cards = SetTools.scryfall_card_details(set_abbreviation)

In [None]:
# Cache the fetched card metadata at <working dir>/scryfall/<set>/<set>.json
scry_dir = os.getcwd() + '/scryfall/' + set_abbreviation

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(scry_dir, exist_ok=True)

# Explicit encoding so the cache file does not depend on the platform default.
with open(f'{scry_dir}/{set_abbreviation}.json', 'w', encoding='utf-8') as f:
    json.dump(cards, f)

In [None]:
# Transpose so the outer keys of `cards` become the row index, and expose the
# 'url' field under the name 'image'.
cards_df = pd.DataFrame(cards).T.rename(columns={'url': 'image'})

### Calculate t-SNE outputs

In [None]:
#Instantiate t-SNE model and run it on card_vectors
tsne_model = TSNE(n_components=2)
tsne_out = tsne_model.fit_transform(card_vectors)

In [None]:
# Create a DataFrame from t-SNE outputs and join with our card metadata
tsne_df = pd.DataFrame(tsne_out, index=card_vectors_keyed.index_to_key, columns = ['tsne_1','tsne_2']) # arbitrary names for t-SNE components
tsne_df['card'] = card_vectors_keyed.index_to_key

cards_df = cards_df.merge(right=tsne_df, how='inner', left_index=True, right_index=True)

### Visualize t-SNE clusters
We see that cards are primarily clustered by colour

In [None]:
# Interactive scatter of the two t-SNE components; hovering shows the 'image' field.
(
    alt.Chart(cards_df)
    .mark_point()
    .encode(x='tsne_1', y='tsne_2', tooltip=['image'])
    .interactive()
)

## Rough Work

In [None]:
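# example operations (`model` was deleted above, so query the KeyedVectors directly)
# card_vectors_keyed.most_similar('Black Market Tycoon', topn=5)
# vect = card_vectors_keyed['Call In a Professional'] - card_vectors_keyed['Mountain'] + card_vectors_keyed['Swamp']
# card_vectors_keyed.similar_by_vector(vect)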

### What do clusters look like if we subtract basic land embeddings from mono-colour cards?

In [None]:
# Show only the columns whose name matches "type".
cards_df.filter(regex='type')

In [None]:
# One row per card name, one column per embedding dimension.
embeddings_df = pd.DataFrame(
    data=card_vectors,
    index=card_vectors_keyed.index_to_key,
)

In [None]:
# Boolean mask over cards_df: True where the type line mentions "Land" / " land".
# na=False maps a missing type_line to False instead of NaN, so the mask stays
# usable for boolean indexing.
lands = cards_df['type_line'].str.contains('Land| land', na=False)

In [None]:
# Colour identity of every land, selected in a single .loc lookup.
cards_df.loc[lands, 'color_identity']

In [None]:
# Type line of every land, selected in a single .loc lookup.
cards_df.loc[lands, 'type_line']

In [None]:
# Independent working copy of the embeddings, so edits leave embeddings_df intact.
no_colour = embeddings_df.copy(deep=True)

In [None]:
# Spot-check the colour identity of a single card.
# NOTE(review): this raises KeyError when the card is not in the loaded set.
cards_df.loc['Cemetery Tampering', 'color_identity']

### Steps:
* find nearest land to each card
    * create lands_df which is filtered cards_df to just lands
    * create a dataframe with one row per card name in the embeddings, holding the land vector to subtract from that card
    * subtract
    * but for colourless cards, keep the original vector (there is no land colour to subtract)
* t-SNE and plot
* filter to mono-colour cards and test again
    * could make color_identity into dummies to simplify

In [None]:
# Metadata rows for lands only, copied so later edits do not touch cards_df.
# na=False treats a missing type_line as "not a land" instead of producing a
# NaN mask entry that boolean indexing rejects.
lands_df = cards_df.loc[cards_df['type_line'].str.contains('Land| land', na=False)].copy()

In [None]:
# KeyedVectors is indexed by key (or a list of keys), not by a boolean mask, so
# pass the land card names to get the shape of the land embeddings.
card_vectors_keyed[lands_df.index.tolist()].shape

In [None]:
# NOTE(review): scratch cell. This calls the constructor with no arguments; recent gensim
# releases appear to require a `vector_size` argument, so this likely raises TypeError —
# confirm, then replace or remove.
KeyedVectors()

In [None]:
# `lands` is a mask over cards_df, whose index need not match embeddings_df
# (cards_df only keeps cards found in both sources). Select by card name instead
# of relying on implicit reindexing of the mask; row order of embeddings_df is kept.
embeddings_df[embeddings_df.index.isin(lands[lands].index)]

In [None]:
# Indexing a DataFrame with another DataFrame is treated as an element-wise
# mask, not a row selection. Pick the land rows by their card-name labels.
embeddings_df.loc[lands_df.index]

In [None]:
for row in embeddings_df:
    