In [1]:
import csv
import random
from itertools import islice

from gensim import corpora
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from tqdm import tqdm

In [2]:
class Corpus:
    def __iter__(self):
        with open("../../backend/collections.csv") as f:
            for record in csv.reader(f):
                random.shuffle(record)
                yield record

In [3]:
corpus = Corpus()
for record in islice(corpus, 1):
    print(record)

['Elvish Mystic', 'Venser, Shaper Savant', 'Deceiver Exarch', 'Worldly Tutor', 'Aluren', 'Gitaxian Probe', 'Sea Gate Oracle', 'Eternal Witness', 'Utopia Sprawl', 'Slithermuse', 'Pestermite', 'Purphoros, God of the Forge', 'Wood Elves', 'Wild Cantor', 'Fyndhorn Elves', 'Elvish Visionary', 'Sylvan Ranger', 'Trinket Mage', 'Dream Stalker', 'Chord of Calling', 'Kozilek, Butcher of Truth', 'Genesis Hydra', 'Prophet of Kruphix', 'Sol Ring', 'Peregrine Drake', 'Phantasmal Image', 'Daze', "Man-o'-War", 'Cloud of Faeries', 'Xenagos, the Reveler', 'Gurmag Drowner', 'Ulamog, the Infinite Gyre', 'Arbor Elf', 'Clone', 'Quickling', 'Fierce Empath', 'Domri Rade', 'Bloodbraid Elf', "Gaea's Herald", 'Coiling Oracle', 'Wild Growth', 'Birds of Paradise', 'Raven Familiar', 'Spellskite', 'Zealous Conscripts', 'Heartwood Storyteller', 'Fauna Shaman', 'Cloudstone Curio', 'Duplicant', 'Reclamation Sage', 'Glen Elendra Archmage', 'Mulldrifter', 'Animar, Soul of Elements', 'Birthing Pod', 'Glimpse of Nature', '

In [4]:
sum(1 for _ in corpus)

160108

In [5]:
dictionary = corpora.Dictionary(corpus)

In [7]:
class Callback(CallbackAny2Vec):
    def __init__(self, epochs):
        self.pbar = tqdm(total=epochs)

    def on_epoch_end(self, model):
        self.pbar.update()

    def close(self):
        self.pbar.close()


epochs = 100
callback = Callback(epochs)
model = Word2Vec(
    sentences=corpus,
    vector_size=128,
    window=60,
    min_count=0,
    workers=-1,
    epochs=100,
    shrink_windows=False,
    callbacks=[callback],
)
callback.close()

100%|█████████████████████████████████████████| 100/100 [01:51<00:00,  1.11s/it]


In [10]:
model.wv.most_similar("Island")

[('Mask of Griselbrand', 0.377899169921875),
 ('Hoodwink', 0.37130051851272583),
 ('Silent-Blade Oni', 0.35765111446380615),
 ('Cowed by Wisdom', 0.3400702476501465),
 ('Mist Dragon', 0.3340943157672882),
 ('Jwar Isle Avenger', 0.32883667945861816),
 ('Repel the Darkness', 0.3247365951538086),
 ('Aetherflux Reservoir\n\n\n#1_Threats', 0.3201090097427368),
 ('Kor Aeronaut', 0.3076937794685364),
 ("Ogre's Cleaver", 0.3044450581073761)]