In [16]:
!pip install tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:
# Mount Google Drive at /content/drive; the data, tokenizer and checkpoint
# paths used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## DataLoader

In [18]:
import os
import math

from tokenizers import Tokenizer, models

# Root directory of the training data. load_sequences() treats each
# sub-folder name as the class label of the files inside it.
base_path = '/content/drive/MyDrive/trainingdata'

In [19]:
import numpy as np

def load_sequences(base_path):
    """Read every sequence line under ``base_path`` and label it by its folder.

    The expected layout is ``base_path/<label>/<file>``. Every non-empty line
    of every file that does not start with ``'>'`` becomes one sequence whose
    label is the name of the folder containing the file.

    NOTE(review): each line is treated as an independent sequence, so a record
    wrapped over several lines yields several sequences - confirm the input
    files hold one sequence per line.

    Args:
        base_path: Directory containing one sub-directory per label.

    Returns:
        ``(sequences, labels)``: two parallel lists of ``str``. Sequences have
        surrounding whitespace (including the trailing newline) removed.
    """
    sequences = []
    labels = []
    # os.listdir() order is arbitrary; sorting makes the output reproducible.
    for folder in sorted(os.listdir(base_path)):
        folder_path = os.path.join(base_path, folder)
        if not os.path.isdir(folder_path):
            continue
        for file_name in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file_name)
            # Nested directories cannot be opened as files; skip them.
            if not os.path.isfile(file_path):
                continue
            with open(file_path) as f:
                for line in f:
                    if line.startswith('>'):
                        continue
                    # Drop the trailing newline so it is never tokenized, and
                    # ignore blank lines so they do not become empty samples.
                    sequence = line.strip()
                    if not sequence:
                        continue
                    sequences.append(sequence)
                    labels.append(folder)
    return sequences, labels

def sample_data(sequences, labels, fraction=0.5):
    """Draw a random subset of aligned (sequence, label) pairs.

    Indices are drawn without replacement, so no pair appears twice in the
    result and the i-th returned sequence always belongs to the i-th returned
    label.

    Args:
        sequences: List of sequences.
        labels: List of labels, parallel to ``sequences``.
        fraction: Share of the data to keep, in ``[0, 1]``. Defaults to 0.5.

    Returns:
        ``(sampled_sequences, sampled_labels)``: two parallel lists with
        ``int(len(labels) * fraction)`` entries each.

    Raises:
        ValueError: If ``fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f'fraction must be in [0, 1], got {fraction}')

    n_samples = int(len(labels) * fraction)
    if n_samples == 0:
        return [], []

    idx = np.random.choice(len(labels), n_samples, replace=False)

    sampled_sequences = [sequences[i] for i in idx]
    sampled_labels = [labels[i] for i in idx]

    return sampled_sequences, sampled_labels


In [20]:
from sklearn.model_selection import train_test_split

def get_3_splits(sequences, labels):
    """Randomly split the data into train, validation and test subsets.

    20% of the samples are first set aside as a hold-out pool, which is then
    divided in half, giving a roughly 80/10/10 train/validation/test split.

    Args:
        sequences: List of sequences.
        labels: List of labels, parallel to ``sequences``.

    Returns:
        A 6-tuple ``(train_sequence, valid_sequence, test_sequence,
        train_label, valid_label, test_label)``.
    """
    # Stage 1: training set versus a 20% hold-out pool.
    train_sequence, holdout_sequence, train_label, holdout_label = train_test_split(
        sequences, labels, test_size=0.2)

    # Stage 2: halve the hold-out pool into validation and test sets.
    valid_sequence, test_sequence, valid_label, test_label = train_test_split(
        holdout_sequence, holdout_label, test_size=0.5)

    return (train_sequence, valid_sequence, test_sequence,
            train_label, valid_label, test_label)


In [28]:
from torch.utils.data import Dataset, DataLoader
import torch
from torch.nn.utils.rnn import pad_sequence

class SequenceDataset(Dataset):
    """Dataset of (token-id tensor, integer label) pairs built from raw strings.

    Sequences are tokenized lazily in ``__getitem__`` with a tokenizer loaded
    from ``tokenizer_file``; labels are mapped to integer ids once, up front.
    """

    def __init__(self, sequence, labels, tokenizer_file='gene_tokenizer.json',
                 label_dict=None):
        """Store the data, encode the labels and load the tokenizer.

        Args:
            sequence: List of str, e.g. ``["ACTG...", "GTCA...", ...]``.
            labels: List of raw labels, parallel to ``sequence``.
            tokenizer_file: Path to a serialized tokenizer JSON file.
            label_dict: Optional mapping from raw label to integer id. When
                omitted it is derived from ``labels``; pass the training
                set's mapping for validation/test sets so ids agree.

        Raises:
            ValueError: If ``sequence`` and ``labels`` differ in length.
            KeyError: If a label is missing from ``label_dict``.
        """
        if len(sequence) != len(labels):
            raise ValueError(
                f'sequence and labels must have the same length, '
                f'got {len(sequence)} and {len(labels)}')

        self.sequence = sequence
        if label_dict is None:
            self.label_dict = self.get_label_dict(labels)
        else:
            self.label_dict = label_dict
        self.labels = self.encode_labels(labels)

        # from_file builds the tokenizer itself; no placeholder instance is needed.
        self.tokenizer = Tokenizer.from_file(tokenizer_file)
        # NOTE(review): padding is enabled with the library defaults - confirm
        # the default pad id/token match the vocabulary in tokenizer_file.
        self.tokenizer.enable_padding()

    def get_label_dict(self, labels):
        """Map each distinct label to an integer id.

        Labels are sorted before numbering so the mapping is identical on
        every run (set iteration order is not), which keeps saved checkpoints
        consistent with the label ids.
        """
        return {label: i for i, label in enumerate(sorted(set(labels)))}

    def encode_labels(self, labels):
        """Translate raw labels to integer ids using ``self.label_dict``.

        Raises:
            KeyError: If a label is not present in ``self.label_dict``.
        """
        return [self.label_dict[y] for y in labels]

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.sequence)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for sample ``idx``.

        ``token_ids`` is a 1-D ``torch.LongTensor``; ``label`` is an int.
        """
        seq = self.sequence[idx]
        label = self.labels[idx]
        encoded_seq = self.tokenizer.encode(seq)
        return torch.tensor(encoded_seq.ids, dtype=torch.long), label

    def collate_fn(self, batch):
        """Pad a list of samples into a batch.

        Args:
            batch: list of ``(torch.LongTensor, int)``.

        Returns:
            ``(sequence, labels)``: a ``[B, L]`` LongTensor right-padded with
            the tokenizer's pad id, and a ``[B]`` LongTensor of label ids.
        """
        sequences = [item[0] for item in batch]
        labels = [item[1] for item in batch]

        sequence = pad_sequence(sequences, batch_first=True,
                                padding_value=self.tokenizer.padding['pad_id'])
        labels = torch.LongTensor(labels)

        return sequence, labels


## Model

In [29]:
from torch import nn

class RnnModel(nn.Module):
    """Embedding -> LSTM -> linear head scoring every vocabulary entry per position."""

    def __init__(self, vocab_size, embedding_dim, pad_id, hidden_dim, num_layers):
        """Build the three layers.

        Args:
            vocab_size: Number of token ids; also the size of the output layer.
            embedding_dim: Width of the token embeddings.
            pad_id: Token id passed to the embedding as ``padding_idx``.
            hidden_dim: LSTM hidden size.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=pad_id,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.out_layer = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, ids):
        """Map token ids ``[B, L]`` to per-position vocabulary logits ``[B, L, V]``."""
        token_vectors = self.embedding(ids)          # [B, L, E]
        per_step_states = self.rnn(token_vectors)[0]  # [B, L, H]
        logits = self.out_layer(per_step_states)      # [B, L, V]
        return logits

class RnnModelForClassification(nn.Module):
    """Embedding -> LSTM -> linear classifier over the last real token's state."""

    def __init__(self, vocab_size, embedding_dim, pad_id, hidden_dim, num_layers, output_size):
        """Build the three layers.

        Args:
            vocab_size: Number of token ids.
            embedding_dim: Width of the token embeddings.
            pad_id: Padding token id. Used as the embedding's ``padding_idx``
                and to locate the last non-padding position of each sequence.
            hidden_dim: LSTM hidden size.
            num_layers: Number of stacked LSTM layers.
            output_size: Number of classes.
        """
        super().__init__()
        # Plain attribute (not a parameter/buffer), so state_dict keys are unchanged.
        self.pad_id = pad_id
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_id)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.out_layer = nn.Linear(hidden_dim, output_size)

    def forward(self, ids):
        """Classify each sequence in the batch.

        Args:
            ids: LongTensor ``[B, L]`` of token ids, right-padded with ``pad_id``.

        Returns:
            Class logits of shape ``[B, C]``.
        """
        # ids: [batch size, max sequence length] = [B, L]
        embedded = self.embedding(ids)  # [B, L, E]
        rnn_out, _ = self.rnn(embedded)  # [B, L, H]

        if self.pad_id is None:
            # No padding id to look for: use the final time step.
            hidden_state = rnn_out[:, -1, :]  # [B, H]
        else:
            # Taking rnn_out[:, -1] would read the state after the LSTM has
            # consumed the padding of every sequence shorter than L. Pick the
            # output at each sequence's last non-padding position instead
            # (position 0 for an all-padding row).
            positions = torch.arange(ids.size(1), device=ids.device)  # [L]
            last_idx = ((ids != self.pad_id) * positions).max(dim=1).values  # [B]
            batch_idx = torch.arange(ids.size(0), device=ids.device)  # [B]
            hidden_state = rnn_out[batch_idx, last_idx]  # [B, H]

        return self.out_layer(hidden_state)  # [B, C]


## Training

In [30]:
from tqdm.notebook import tqdm
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

def train(model, dataloader, loss_function, lr, num_epochs, valid_loader=None,
          test_loader=None, save_dir='/content/drive/MyDrive/GeneModels'):
    """Train ``model`` with Adam and record per-epoch accuracy.

    After every epoch a classification report for the training predictions is
    printed, the optional validation/test loaders are evaluated, and the
    model's ``state_dict`` is saved to ``<save_dir>/<epoch>.pth``.

    Args:
        model: Module mapping a ``[B, L]`` id tensor to ``[B, C]`` logits.
            Batches are moved to whatever device the model's parameters are on.
        dataloader: Iterable of ``(batch_sequences, y)`` training batches.
        loss_function: Callable ``(logits, y) -> scalar loss``.
        lr: Learning rate for Adam.
        num_epochs: Number of passes over ``dataloader``.
        valid_loader: Optional loader evaluated after each epoch.
        test_loader: Optional loader evaluated after each epoch.
        save_dir: Directory for per-epoch checkpoints (created if missing).

    Returns:
        Dict with keys ``'train'``, ``'valid'`` and ``'test'``, each a list of
        per-epoch accuracies (empty when the matching loader was not given).
    """
    os.makedirs(save_dir, exist_ok=True)
    # Follow the model's device instead of assuming a GPU is present.
    device = next(model.parameters()).device
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    history = {'train': [], 'valid': [], 'test': []}
    for epoch in range(num_epochs):
        model.train()
        pbar = tqdm(dataloader)

        all_preds = []
        all_labels = []
        for batch_sequences, y in pbar:
            x = batch_sequences.to(device)
            y = y.to(device)

            h = model(x)  # [B, C]
            j = loss_function(h, y)

            # do gradient descent
            optimizer.zero_grad()  # clear gradients left over from the last step
            j.backward()   # calculate gradient from current batch outputs
            optimizer.step()  # update the weights using the gradients

            all_preds.append(h.argmax(-1).detach().cpu())
            all_labels.append(y.cpu())

        all_preds = torch.cat(all_preds).numpy()
        all_labels = torch.cat(all_labels).numpy()

        print(classification_report(all_labels, all_preds, digits=4))
        accuracy = accuracy_score(all_labels, all_preds)

        # Evaluate the model being trained, not whatever global happens to be named `model`.
        if valid_loader is not None:
            val_accuracy = evaluate(valid_loader, model)
            history['valid'].append(val_accuracy)
        if test_loader is not None:
            test_accuracy = evaluate(test_loader, model)
            history['test'].append(test_accuracy)

        history['train'].append(accuracy)

        torch.save(model.state_dict(), os.path.join(save_dir, f'{epoch}.pth'))

    return history


def evaluate(valid_loader, model=None):
    """Print a classification report for ``model`` on ``valid_loader`` and return accuracy.

    Args:
        valid_loader: Iterable of ``(batch_sequences, y)`` batches.
        model: Module to evaluate. When omitted, the module-level variable
            named ``model`` is used, which keeps one-argument calls working.

    Returns:
        Accuracy over all batches, as computed by ``accuracy_score``.

    Raises:
        NameError: If ``model`` is omitted and no global ``model`` exists.
    """
    if model is None:
        model = globals().get('model')
        if model is None:
            raise NameError("name 'model' is not defined")

    device = next(model.parameters()).device
    was_training = model.training
    model.eval()

    valid_preds = []
    valid_labels = []
    try:
        # no_grad avoids building autograd graphs for evaluation batches.
        with torch.no_grad():
            for batch_sequences, y in valid_loader:
                h = model(batch_sequences.to(device))  # [B, C]
                valid_preds.append(h.argmax(-1).cpu())
                valid_labels.append(y.cpu())
    finally:
        # Restore whichever mode the caller had the model in.
        model.train(was_training)

    valid_preds = torch.cat(valid_preds).numpy()
    valid_labels = torch.cat(valid_labels).numpy()

    print(classification_report(valid_labels, valid_preds, digits=4))
    accuracy = accuracy_score(valid_labels, valid_preds)
    return accuracy

## Do Training

In [31]:
# Path of the serialized tokenizer handed to every SequenceDataset below.
tokenizer_file = '/content/drive/MyDrive/gene_tokenizer.json'

# Read all sequences under base_path, each labelled with its folder name.
sequences, labels = load_sequences(base_path)

# Random train / validation / test split.
(train_seq, valid_seq, test_seq,
 train_label, valid_label, test_label) = get_3_splits(sequences, labels)

# The training set derives the label -> id mapping; the validation and test
# sets reuse it so the same label gets the same id in all three datasets.
train_dataset = SequenceDataset(train_seq, train_label, tokenizer_file=tokenizer_file)
valid_dataset = SequenceDataset(valid_seq, valid_label, tokenizer_file=tokenizer_file,
                                label_dict=train_dataset.label_dict)
test_dataset = SequenceDataset(test_seq, test_label, tokenizer_file=tokenizer_file,
                               label_dict=train_dataset.label_dict)

In [32]:
# Optimisation settings consumed by train() and the DataLoaders.
lr = 1e-4
batch_size = 5000
num_epochs = 1
# Vocabulary size and padding id are read from the training set's tokenizer
# so the model's embedding matches the tokenization.
vocab_size = train_dataset.tokenizer.get_vocab_size()
pad_id = train_dataset.tokenizer.padding['pad_id']
# Model dimensions.
embedding_dim = 256
hidden_dim = 512
num_layers = 1

In [33]:
#model = RnnModel(vocab_size, embedding_dim, pad_id, hidden_dim, num_layers)
# Classifier with one output per entry in the training label mapping.
model = RnnModelForClassification(vocab_size, embedding_dim, pad_id, hidden_dim, num_layers, len(train_dataset.label_dict))
# NOTE(review): 'cuda' is hard-coded, so this cell requires a GPU runtime.
model = model.to('cuda')
# Only the training loader shuffles; each loader pads batches with its own dataset's collate_fn.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=valid_dataset.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=test_dataset.collate_fn)
loss_function = nn.CrossEntropyLoss()

In [None]:
# Run training; train() returns the per-epoch accuracy history dict.
acc_history = train(model, train_loader, loss_function, lr, num_epochs, 
                    valid_loader=valid_loader, test_loader=test_loader)

In [None]:
# Per-epoch accuracy lists keyed by 'train', 'valid' and 'test'.
acc_history

In [None]:
import matplotlib.pyplot as plt

# acc_history is a dict of per-epoch accuracy lists, so draw one labelled
# line per split instead of passing the dict itself to plt.plot.
for split_name, accuracies in acc_history.items():
    if accuracies:  # a split stays empty when its loader was not passed to train()
        # Markers keep a single-epoch run visible as a point.
        plt.plot(range(1, len(accuracies) + 1), accuracies, marker='o', label=split_name)
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend();