In [None]:
import torch
import numpy as np

from inference import infer, load_for_inference
from dataloader import load_sequences

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

sequence, label = load_sequences('trainingdata')

# Collect, for every distinct label, the positions at which it occurs.
label_index = {}
for name in set(label):
    label_index[name] = []
for position, name in enumerate(label):
    label_index[name].append(position)

# Replace each position list by 100000 positions drawn from it.
# np.random.choice samples with replacement by default, so every label
# contributes the same number of examples (repeats are possible).
for name in label_index:
    label_index[name] = np.random.choice(label_index[name], 100000)

sampled_index = np.concatenate([label_index[name] for name in label_index])

# Keep only the sampled examples, in the order they were drawn.
sequence = [sequence[pos] for pos in sampled_index]
label = [label[pos] for pos in sampled_index]

# pickle is only imported further down in this file, after the label dump
# below, so import it here to avoid a NameError when run top to bottom.
import pickle

# Load the model, tokenizer and label dictionary from the asset files.
model, tokenizer, label_dict = load_for_inference(
    'assets/model.pth',
    'assets/gene_tokenizer.json',
    'assets/label_dict.json',
    skip_gram=True,
)

model = model.to(device)

batch_size = 1000
# Ceiling division so that a trailing partial batch is still processed.
num_batches = (len(sequence) + batch_size - 1) // batch_size

# Embed the sampled sequences batch by batch using the model's embedding.
all_embeddings = []
for i in range(num_batches):
    start = i * batch_size
    emb = infer(sequence[start:start + batch_size], tokenizer, model.embedding, device=device)
    all_embeddings.append(emb)

# Tensor.numpy() only works on CPU tensors that do not require grad;
# detach() and cpu() leave such tensors unchanged, so this is safe whether
# infer returns CPU or GPU tensors (NOTE(review): infer's return device is
# not visible here — confirm).
all_embeddings = torch.cat(all_embeddings, 0).detach().cpu().numpy()

# Persist the embeddings together with the labels of the sampled examples.
np.save('assets/virus_embeddings.npy', all_embeddings)
with open('assets/label.pkl', 'wb') as f:
    pickle.dump(label, f)

# PCA

from sklearn.decomposition import PCA
import pickle

# Fit a 3-component PCA on the embeddings and project them to 3D.
pca = PCA(n_components=3)
pca.fit(all_embeddings)
embeddings_3d = pca.transform(all_embeddings)

np.save('assets/virus_embeddings_3d.npy', embeddings_3d)

# Persist the fitted PCA object so the same projection can be reused later.
with open('assets/virus_pca.pkl', 'wb') as f:
    # pickle.dump takes the object first and the open file second.
    pickle.dump(pca, f)