In [1]:
import numpy as np

import gensim

from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.models.doc2vec import Doc2Vec

from gensim.models.word2vec import LineSentence
from gensim.test.utils import datapath
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.doc2vec import TaggedLineDocument
from gensim import utils

import time
import os

import pyfastx

from ipywidgets import IntProgress
from IPython.display import display

import multiprocessing

import smart_open
import time

import sentencepiece as spm

In [2]:
#Given a sequence, returns its k-mers as one space-separated string.
def process_seq(seq, k):
    """Tokenize ``seq`` into overlapping k-mers separated by single spaces.

    A window of width ``k`` slides over ``seq`` one position at a time, so a
    sequence of length ``n >= k`` yields ``n - k + 1`` words.

    Args:
        seq: The sequence to tokenize (a string).
        k: Window width, i.e. the length of each k-mer.

    Returns:
        A single string with the k-mers joined by ``" "`` (not a list).
        If ``seq`` is shorter than ``k`` the whole sequence is returned as one
        word; an empty ``seq`` gives an empty string.
    """
    # Last window start; clamped to 0 so a too-short sequence still produces
    # exactly one (short) window, matching the historical behaviour.
    last_start = max(len(seq) - k, 0)
    # A single join avoids rebuilding the output string once per k-mer.
    return " ".join([seq[start:start + k] for start in range(last_start + 1)])
        

In [3]:
#Reports the start, end and duration of every epoch of a gensim model.
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints per-epoch progress and timing.

    Attributes are plain instance state:
      * ``epoch`` - index of the epoch currently being reported (starts at 0).
      * ``time``  - ``time.time()`` stamp taken when that epoch began, or
        ``None`` before the first epoch has started.
    """

    def __init__(self):
        self.time = None
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Announce the epoch and remember when it started."""
        print("Epoch #{} start".format(self.epoch))
        self.time = time.time()

    def on_epoch_end(self, model):
        """Print how long the epoch took (in seconds), then advance the counter."""
        finished_at = time.time()
        duration = finished_at - self.time
        print("Epoch #{0} end in {1}".format(self.epoch, duration))
        self.epoch = self.epoch + 1
        

In [6]:
#Cell for making "document" files from dna seqs.
# Document files are one-sequence-per-line text files, where sequences are white-space separated lists of words.
# Words may be k-mers or BPE tokens.

### MACROS ###
write_kmers = False  #These booleans are here as a stop gap to avoid accidentally overwriting files later.
write_bpe = False
K = [6,8]

SP_FILE = '{}mer_compare_bpe_dna_wordsize_256.model'
FASTA_FILE = 'silva_nr_ref_unambiguous_as_dna.fasta'
##############


def _write_documents(out_path, tokenize):
    """Write every record of FASTA_FILE to ``out_path``, one document per line.

    Args:
        out_path: Text file to create; an existing file is overwritten.
        tokenize: Callable that maps a sequence string to its
            whitespace-separated word string.
    """
    # A fresh Fasta handle is opened for every output file, as before.
    fa = pyfastx.Fasta(FASTA_FILE)
    prog = IntProgress(min=0, max=len(fa))
    display(prog)

    with open(out_path, 'w') as outfile:
        for record in fa:
            # Newline-terminate each document. Writing "\n" *before* each record
            # (the previous behaviour) left an empty first line, which
            # line-numbered readers see as an empty document 0 and which shifts
            # every document index by one relative to the FASTA record order.
            # NOTE(review): files generated before this fix still carry that
            # leading blank line - regenerate them or account for the offset.
            outfile.write("{}\n".format(tokenize(record.seq)))
            prog.value += 1


for k in K:
    # Loaded even when write_bpe is False so that `sp` remains defined after
    # this cell, as it was before - later cells may rely on it (TODO confirm).
    sp = spm.SentencePieceProcessor()
    sp.Load(SP_FILE.format(k))

    if write_kmers:
        _write_documents('all_16s_as_{0}mer_documents_no_embedding.txt'.format(k),
                         lambda seq: process_seq(seq, k))

    if write_bpe:
        _write_documents('all_16s_as_{0}mer_bpe_ws256_documents.txt'.format(k),
                         lambda seq: " ".join(sp.encode_as_pieces(seq)))

In [7]:
# Train one Doc2Vec model per (k, tokenization) document file and save it to disk.

# Hyper-parameters are identical for every trial, so they are built once.
args = {'dm':0,
        'vector_size': 128,
        'epochs':20,
        'min_count':1,
        'sample': 0.0001,
        'workers':multiprocessing.cpu_count()}

for k in K:
    # 4 ** k is the number of possible k-mers over a 4-letter alphabet, i.e. a
    # vocabulary-size bound. It is NOT the corpus word count and is therefore
    # no longer passed to train(); the name is kept in case later cells use it.
    n_words = 4 ** k

    # (input document file, output model file) pairs for this k.
    trials = [('all_16s_as_{0}mer_documents_no_embedding.txt'.format(k), 'doc2vec_{}mers_128dim.model'.format(k)),
              ('all_16s_as_{0}mer_bpe_ws256_documents.txt'.format(k), 'doc2vec_{}bpe_128dim.model'.format(k))]

    for corpus_file, outfile in trials:
        corpus = TaggedLineDocument(corpus_file)

        model = Doc2Vec(**args)

        print("Building Vocab....")
        s = time.time()
        model.build_vocab(corpus_file=corpus_file)
        e = time.time() - s
        print("Vocab built in {}".format(e))

        print("Training...")
        logger = EpochLogger()
        model.train(documents=corpus,
                    # Reuse the epoch count the model was constructed with
                    # instead of repeating the literal.
                    epochs=model.epochs,
                    total_examples=model.corpus_count,
                    # Raw word count gathered by build_vocab(); the previous
                    # value (4 ** k) was a vocabulary-size bound and unrelated
                    # to the BPE corpora.
                    total_words=model.corpus_total_words,
                    callbacks=[logger])
        model.save(outfile)
        print("done")


Building Vocab....
Vocab built in 140.50938034057617
Training...
Epoch #0 start
Epoch #0 end in 250.5376753807068
Epoch #1 start
Epoch #1 end in 245.91381812095642
Epoch #2 start
Epoch #2 end in 248.51491808891296
Epoch #3 start
Epoch #3 end in 254.4101481437683
Epoch #4 start
Epoch #4 end in 253.2412929534912
Epoch #5 start
Epoch #5 end in 253.78625392913818
Epoch #6 start
Epoch #6 end in 258.1486859321594
Epoch #7 start
Epoch #7 end in 248.08066749572754
Epoch #8 start
Epoch #8 end in 255.18641805648804
Epoch #9 start
Epoch #9 end in 252.50441336631775
Epoch #10 start
Epoch #10 end in 247.39678764343262
Epoch #11 start
Epoch #11 end in 248.78888750076294
Epoch #12 start
Epoch #12 end in 251.53982615470886
Epoch #13 start
Epoch #13 end in 254.1322901248932
Epoch #14 start
Epoch #14 end in 251.07536005973816
Epoch #15 start
Epoch #15 end in 253.61028170585632
Epoch #16 start
Epoch #16 end in 248.05555963516235
Epoch #17 start
Epoch #17 end in 254.22837257385254
Epoch #18 start
Epoch #1