In [1]:
# Optional: uncomment if the `tokenizers` package is missing from the environment.
#!pip install tokenizers

In [2]:
# Optional (Colab only): uncomment to mount Google Drive before setting use_google_drive = True.
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
!git clone https://github.com/cgjeong23/Viral-genomic-classification.git virus

## Imports

In [4]:
from virus.ML.model import RnnModel, RnnModelForClassification
from virus.ML.train import train, evaluate
from virus.ML.dataloader import load_sequences, sample_data, get_3_splits, SequenceDataset
from torch import nn
from torch.utils.data import DataLoader

import numpy as np
import os

%load_ext autoreload
%autoreload 2

# Do Training

In [5]:
# Runtime environment switches; they select the input locations and the
# output directory further down. With both set to False, paths relative to
# the current working directory are used.
use_kaggle = True          # running inside a Kaggle kernel
use_google_drive = False   # running on Colab with Google Drive mounted

In [6]:
google_drive_path = '/content/drive/MyDrive'

# Resolve the training-data directory and the tokenizer file for the
# selected environment.
if use_google_drive:
    base_path, tokenizer_file = (f'{google_drive_path}/trainingdata',
                                 f'{google_drive_path}/gene_tokenizer.json')
elif use_kaggle:
    base_path, tokenizer_file = ('../input/pacific-sra/trainingdata',
                                 '../input/pacific-sra/gene_tokenizer.json')
else:
    base_path, tokenizer_file = ('trainingdata', 'gene_tokenizer.json')

# Read every sequence with its label, then split into train/valid/test.
sequences, labels = load_sequences(base_path)
splits = get_3_splits(sequences, labels)
train_seq, valid_seq, test_seq, train_label, valid_label, test_label = splits

# Map each distinct label to an integer index; np.unique returns the labels
# sorted, so the index order is deterministic.
unique_labels = np.unique(labels)
label_dict = dict(zip(unique_labels, range(len(unique_labels))))

# All three datasets share the same tokenizer file and label mapping.
dataset_kwargs = dict(tokenizer_file=tokenizer_file, label_dict=label_dict)
train_dataset = SequenceDataset(train_seq, train_label, **dataset_kwargs)
valid_dataset = SequenceDataset(valid_seq, valid_label, **dataset_kwargs)
test_dataset = SequenceDataset(test_seq, test_label, **dataset_kwargs)

In [7]:
from sklearn.utils.class_weight import compute_class_weight

# Balanced per-class weights computed from the training split only.
#
# The class order is taken from `label_dict` (the mapping the datasets use to
# encode labels) rather than from np.unique(train_label), so that
# class_weight[i] always belongs to the class encoded as index i. If the
# training split happens to miss a class, compute_class_weight raises a
# ValueError here instead of yielding a weight vector that is too short.
weight_classes = np.array(list(label_dict))
class_weight = compute_class_weight('balanced', classes=weight_classes, y=train_label)

In [8]:
from collections import Counter

# Number of samples per label over the full (unsplit) data set.
label_counts = Counter(labels)
label_counts

In [9]:
# --- Optimisation settings (passed to DataLoader / train) ---
lr = 1e-3
batch_size = 5000
num_epochs = 3

# --- Model dimensions ---
embedding_dim = 256
hidden_dim = 512
num_layers = 1

# --- Values read from the training dataset's tokenizer ---
tokenizer = train_dataset.tokenizer
vocab_size = tokenizer.get_vocab_size()
pad_id = tokenizer.padding['pad_id']

# --- Pretrained embedding ---
# Checkpoint file looked up when loading the embedding; `freeze` is forwarded
# to the classification model together with the loaded weights.
pretrained_emb_path = 'emb.pt'
freeze = True

In [10]:
# Load the pretrained embedding matrix, if a checkpoint file is available.
import torch
from virus.ML.model import SkipGramEmbeddingModel

if pretrained_emb_path is not None and os.path.isfile(pretrained_emb_path):
    # NOTE(review): the 4th positional argument (2) has to match whatever the
    # checkpoint was trained with; its meaning is not visible in this notebook.
    emb_model = SkipGramEmbeddingModel(vocab_size, embedding_dim, pad_id, 2)
    # map_location='cpu' lets a checkpoint saved from GPU tensors load on a
    # machine without CUDA; the weights follow the classifier to its device
    # when the model is moved later.
    # SECURITY: torch.load unpickles the file, so only load checkpoints from a
    # trusted source (consider weights_only=True on torch versions that have it).
    state_dict = torch.load(pretrained_emb_path, map_location='cpu')
    emb_model.load_state_dict(state_dict)
    pretrained_emb = emb_model.embedding.weight
else:
    # No usable checkpoint: the classifier is built without pretrained weights.
    print(f"We could not load from {pretrained_emb_path}.")
    pretrained_emb = None

In [11]:
# Alternative model kept for reference:
#model = RnnModel(vocab_size, embedding_dim, pad_id, hidden_dim, num_layers)

# Use the GPU when one is visible, otherwise fall back to the CPU instead of
# crashing on a hard-coded 'cuda'.
# NOTE(review): train() must also place each batch on the model's device for
# the CPU fallback to work end-to-end — confirm in virus.ML.train.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Classifier with one output per entry of the dataset's label mapping.
model = RnnModelForClassification(
    vocab_size, embedding_dim, pad_id, hidden_dim, num_layers,
    len(train_dataset.label_dict), pretrained_emb=pretrained_emb, freeze=freeze,
)
model = model.to(device)

# Only the training loader is shuffled; each loader uses its own dataset's
# collate function.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          collate_fn=train_dataset.collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size,
                          collate_fn=valid_dataset.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size,
                         collate_fn=test_dataset.collate_fn)

# Class-weighted cross entropy; the weights are converted to float32 and
# created on the same device as the model.
loss_function = nn.CrossEntropyLoss(
    weight=torch.tensor(class_weight, dtype=torch.float32, device=device)
)

In [12]:
kaggle_path = '/kaggle/working'

# Choose the output directory with the same precedence that is used when
# selecting the input paths (Google Drive first, then Kaggle, then the current
# directory). Previously Kaggle was checked first here, so enabling both flags
# read from Drive but wrote to the Kaggle working directory.
if use_google_drive:
    save_path = google_drive_path
elif use_kaggle:
    save_path = kaggle_path
else:
    save_path = '.'

# Run training; `save_path` is handed to train() as `base_path`
# (presumably where it writes its outputs — confirm in virus.ML.train).
# The returned history is indexed by 'train' / 'valid' / 'test' below.
acc_history = train(model, train_loader, loss_function, lr, num_epochs,
                    valid_loader=valid_loader, test_loader=test_loader,
                    base_path=save_path)

In [13]:
# Display the accuracy history returned by train(); its 'train', 'valid' and
# 'test' entries are plotted in the following cell.
acc_history

In [14]:
import matplotlib.pyplot as plt

# One accuracy curve per data split, drawn on a single set of axes.
fig, ax = plt.subplots()
for split in ('train', 'valid', 'test'):
    ax.plot(acc_history[split], label=split)
ax.set_xlabel('epoch')
ax.set_ylabel('accuracy')
ax.legend()