https://towardsdatascience.com/multiclass-text-classification-using-lstm-in-pytorch-eac56baed8df

https://www.kaggle.com/deshwalmahesh/nlp-beginner-1-rnn-lstm-gru-embeddings-glove

In [1]:
from tqdm import tqdm, tqdm_notebook
import numpy as np
import os

from pathlib import Path

import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from google_drive_downloader import GoogleDriveDownloader as gdd
from sklearn.feature_extraction.text import CountVectorizer
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook

random_state = 4
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import json
import time

from utils import *
from models import *
from training import *

In [2]:
path = 'data/16S-SG-reads.fa/16S-reads.fa'
#Preprocessing
# Loading, Cleaning and Labelling 
sg, tax_dict = preprocessing(path, size = 8)
# Encoding
sg_encoded = encoder(sg)

# Dataset

#### Train and Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(np.array(list(sg_encoded['encoded'])), 
                                                    np.array(list(sg_encoded['labels'])), 
                                                    test_size = 0.10, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state=42)

In [5]:
len(X_train), len(X_val), len(X_test)

(25401, 1411, 1412)

In [9]:
#Dataset
train = Dataset(X_train, y_train)
val = Dataset(X_val, y_val)
test = Dataset(X_test, y_test)

#DataLoader
train_loader = DataLoader(train, batch_size = 64, shuffle=True)
val_loader = DataLoader(val, batch_size = 256, shuffle=True)
test_loader = DataLoader(test, batch_size = 256, shuffle=True)

## Modeling

In [24]:
with open('vocab/word2idx.json') as json_file:
    word2idx = json.load(json_file)
hidden_size = 256
in_neuron = len(word2idx)
padding_idx = word2idx['pad']
loss_fn = nn.CrossEntropyLoss()

cuda = torch.cuda.is_available()
lr = 0.001

In [25]:
model_class = Network(in_neuron, padding_idx, hidden_size = hidden_size, 
                      out_neuron = tax_dict['Class'][-1], num_layers = 2,
                     bidirectional = True)

In [26]:
model_class

Network(
  (embedding): Embedding(74320, 128)
  (lstm): LSTM(128, 256, num_layers=2, bidirectional=True)
  (dropout): Dropout(p=0.13, inplace=False)
  (dense): Linear(in_features=512, out_features=3, bias=True)
)

In [27]:
if cuda:
    model_class.to('cuda')
optimizer = torch.optim.Adam(model_class.parameters(),lr=lr)

EPOCH = 20
for epoch in range(EPOCH):
    train_loss, train_acc = train_network(model_class, train_loader, optimizer, loss_fn, epoch+1, tax = 'Class')
    val_loss,val_acc = evaluate_network(model_class, test_loader, optimizer,loss_fn, tax = 'Class')
    tqdm.write(f'''End of Epoch: {epoch+1}  |  Train Loss: {train_loss:.3f}  |  Val Loss: {val_loss:.3f}  |  Train Acc: {train_acc*100:.2f}%  |  Val Acc: {val_acc*100:.2f}%''')

Epoch: 1:   1%|▎                                                                       | 2/397 [00:00<00:46,  8.43it/s]

Batch : 0/397   Loss : 1.095173954963684   Accuracy : 0.5


Epoch: 1:  13%|█████████▎                                                             | 52/397 [00:05<00:37,  9.20it/s]

Batch : 50/397   Loss : 0.884985625743866   Accuracy : 0.5


Epoch: 1:  26%|█████████████████▉                                                    | 102/397 [00:11<00:33,  8.91it/s]

Batch : 100/397   Loss : 0.9168680310249329   Accuracy : 0.71875


Epoch: 1:  38%|██████████████████████████▊                                           | 152/397 [00:16<00:28,  8.69it/s]

Batch : 150/397   Loss : 0.3723166584968567   Accuracy : 0.890625


Epoch: 1:  51%|███████████████████████████████████▌                                  | 202/397 [00:22<00:21,  9.04it/s]

Batch : 200/397   Loss : 0.2491542100906372   Accuracy : 0.890625


Epoch: 1:  63%|████████████████████████████████████████████▍                         | 252/397 [00:28<00:15,  9.19it/s]

Batch : 250/397   Loss : 0.17124351859092712   Accuracy : 0.9375


Epoch: 1:  76%|█████████████████████████████████████████████████████▏                | 302/397 [00:33<00:10,  9.20it/s]

Batch : 300/397   Loss : 0.21080374717712402   Accuracy : 0.890625


Epoch: 1:  89%|██████████████████████████████████████████████████████████████        | 352/397 [00:38<00:04,  9.21it/s]

Batch : 350/397   Loss : 0.19441968202590942   Accuracy : 0.921875


Epoch: 1: 100%|██████████████████████████████████████████████████████████████████████| 397/397 [00:43<00:00,  9.06it/s]
Epoch: 2:   0%|▏                                                                       | 1/397 [00:00<00:43,  9.01it/s]

End of Epoch: 1  |  Train Loss: 0.422  |  Val Loss: 0.118  |  Train Acc: 81.32%  |  Val Acc: 94.30%
Batch : 0/397   Loss : 0.0191377904266119   Accuracy : 1.0


Epoch: 2:  13%|█████████▎                                                             | 52/397 [00:05<00:37,  9.13it/s]

Batch : 50/397   Loss : 0.06191684678196907   Accuracy : 1.0


Epoch: 2:  26%|█████████████████▉                                                    | 102/397 [00:11<00:33,  8.70it/s]

Batch : 100/397   Loss : 0.042649295181035995   Accuracy : 0.984375


Epoch: 2:  38%|██████████████████████████▊                                           | 152/397 [00:17<00:26,  9.10it/s]

Batch : 150/397   Loss : 0.041009869426488876   Accuracy : 1.0


Epoch: 2:  51%|███████████████████████████████████▌                                  | 202/397 [00:22<00:22,  8.79it/s]

Batch : 200/397   Loss : 0.08266737312078476   Accuracy : 0.984375


Epoch: 2:  63%|████████████████████████████████████████████▍                         | 252/397 [00:28<00:16,  8.78it/s]

Batch : 250/397   Loss : 0.011071768589317799   Accuracy : 1.0


Epoch: 2:  76%|█████████████████████████████████████████████████████▏                | 302/397 [00:33<00:10,  8.78it/s]

Batch : 300/397   Loss : 0.011874629184603691   Accuracy : 1.0


Epoch: 2:  89%|██████████████████████████████████████████████████████████████        | 352/397 [00:39<00:04,  9.18it/s]

Batch : 350/397   Loss : 0.04901648685336113   Accuracy : 0.984375


Epoch: 2: 100%|██████████████████████████████████████████████████████████████████████| 397/397 [00:44<00:00,  8.93it/s]
Epoch: 3:   0%|▏                                                                       | 1/397 [00:00<00:43,  9.01it/s]

End of Epoch: 2  |  Train Loss: 0.046  |  Val Loss: 0.041  |  Train Acc: 98.83%  |  Val Acc: 98.31%
Batch : 0/397   Loss : 0.014889195561408997   Accuracy : 1.0


Epoch: 3:  13%|█████████▎                                                             | 52/397 [00:05<00:38,  8.95it/s]

Batch : 50/397   Loss : 0.013180038891732693   Accuracy : 1.0


Epoch: 3:  15%|██████████▌                                                            | 59/397 [00:06<00:38,  8.80it/s]


KeyboardInterrupt: 

In [None]:

out_class = 4
model_class = Network(in_neuron, padding_idx, hidden_size = hidden_size, out_neuron = out_class)

In [None]:
model_ = Network(in_neuron, padding_idx, hidden_size = hidden_size, out_neuron = out_class)

In [None]:
model_.load_state_dict(model_class.state_dict())

In [None]:
out_order = 16
model_.dense = torch.nn.Linear(hidden_size * 2,out_order)

In [None]:
model_