In [1]:
from utils.neural_net_tools import *
from utils.leaky_lstm import *
import torch
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
# Load the small English spaCy pipeline.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook —
# confirm it is still needed before keeping the (slow) load.
nlp = spacy.load('en_core_web_sm')


# Reload imported modules automatically before each cell executes, so edits to
# the project's utils modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# --- Data source --------------------------------------------------------------
path_to_data = "dataset/text_molecule.csv"   # file handed to TabularDataset
format_ = "csv"                              # TabularDataset format string

# --- Split, vocabulary and batching -------------------------------------------
split_ratio = 5 / 6       # relative size of the training split
min_freq_words = 1        # min_freq passed to TEXT.build_vocab
batch_size = 128

# --- Reproducibility and training length --------------------------------------
seed = 1997
n_epochs = 30

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Label 1 — Normal (Gaussian) target, trained with MSE loss

In [3]:
%%time

TEXT = data.Field(batch_first=True,
                  include_lengths=True)

LABEL = data.Field(sequential=False, dtype=torch.float, batch_first=True,
                   use_vocab=False, preprocessing=float)

fields = [('text', TEXT), ('label', LABEL),
          (None, None), (None, None), (None, None)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.seed(seed))

TEXT.build_vocab(training_data, min_freq=min_freq_words)


# Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size=batch_size,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch=True,
                                                            device=device)

Wall time: 973 ms


In [4]:
%%time

# Set seed.
torch.manual_seed(1997)
torch.backends.cudnn.deterministic = True

# instantiate the model
l1_model_bilstm = leaky_lstm(num_embeddings=len(TEXT.vocab),
                             embedding_dim=300,
                             hidden_size=300,
                             n_classes=1,
                             dropout=0,
                             bidirectional=True)

print(l1_model_bilstm, "\n")
print(f'The model has {count_parameters(l1_model_bilstm):,} trainable parameters.', "\n")

# Let us use Adam.
OPT = optim.Adam(l1_model_bilstm.parameters(), lr=0.001)

# Specify criterion.
CRIT = nn.MSELoss()

train(l1_model_bilstm,
      train_iterator,
      valid_iterator,
      n_epochs,
      OPT,
      CRIT,
      accuracy_function=None,
      save=True,
      saving_path="model_bck/l1_model_bilstm.pt")

leaky_lstm(
  (embedding): Embedding(15, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0, inplace=False)
  (linear1): Linear(in_features=600, out_features=600, bias=True)
  (linear2): Linear(in_features=600, out_features=600, bias=True)
  (linear3): Linear(in_features=600, out_features=1, bias=True)
) 

The model has 2,171,101 trainable parameters. 

	No. epoch: 1/30                      
	   Train Loss: 0.053
	    Val. Loss: 0.034


	No. epoch: 2/30                      
	   Train Loss: 0.012
	    Val. Loss: 0.007


	No. epoch: 3/30                      
	   Train Loss: 0.008
	    Val. Loss: 0.005


	No. epoch: 4/30                      
	   Train Loss: 0.005
	    Val. Loss: 0.004


	No. epoch: 5/30                      
	   Train Loss: 0.004
	    Val. Loss: 0.005


	No. epoch: 6/30                      
	   Train Loss: 0.004
	    Val. Loss: 0.002


	No. epoch: 7/30                      
	   Train Loss: 0.003
	    Val. Loss: 0.002


	No. ep

In [5]:
# Spot-check the label-1 model on one space-separated molecule string.
molecule_query = "N = C ( N C = O ) C = O"
predict(l1_model_bilstm, molecule_query, TEXT)

-1.073487639427185

# Label 2 — Poisson target, trained with Poisson negative log-likelihood loss

In [6]:
%%time

# Text field: batches are (batch, seq) and, because include_lengths=True, each
# batch also carries the per-example sequence lengths.
TEXT = data.Field(batch_first=True,
                  include_lengths=True)

# Numeric target: not a sequence, no vocabulary, parsed from the CSV with float().
LABEL = data.Field(sequential=False, dtype=torch.float, batch_first=True,
                   use_vocab=False, preprocessing=float)

# One entry per CSV column, in file order. (None, None) skips a column; here
# the third column is the target.
fields = [('text', TEXT), (None, None),
          ('label', LABEL), (None, None), (None, None)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# random.seed() returns None, so it must not be passed as `random_state`.
# Seed the global RNG first, then hand split() the state tuple it expects.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# NOTE(review): the vocabulary is built from the full dataset (train + valid),
# not from train_data only — confirm this is intended.
TEXT.build_vocab(training_data, min_freq=min_freq_words)


# Bucketed iterators: examples of similar length are batched together and each
# batch is sorted by text length.
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size=batch_size,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch=True,
                                                            device=device)

Wall time: 1.21 s


In [7]:
%%time

# Set seed.
torch.manual_seed(1997)
torch.backends.cudnn.deterministic = True

# instantiate the model
l2_model_bilstm = leaky_lstm(num_embeddings=len(TEXT.vocab),
                             embedding_dim=300,
                             hidden_size=300,
                             n_classes=1,
                             dropout=0,
                             bidirectional=True)

print(l2_model_bilstm, "\n")
print(f'The model has {count_parameters(l2_model_bilstm):,} trainable parameters.', "\n")

# Let us use Adam.
OPT = optim.Adam(l2_model_bilstm.parameters(), lr=0.001)

# Specify criterion.
CRIT = nn.PoissonNLLLoss()

train(l2_model_bilstm,
      train_iterator,
      valid_iterator,
      n_epochs,
      OPT,
      CRIT,
      save=True,
      accuracy_function=poisson_accuracy,
      saving_path="model_bck/l2_model_bilstm.pt")

leaky_lstm(
  (embedding): Embedding(15, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0, inplace=False)
  (linear1): Linear(in_features=600, out_features=600, bias=True)
  (linear2): Linear(in_features=600, out_features=600, bias=True)
  (linear3): Linear(in_features=600, out_features=1, bias=True)
) 

The model has 2,171,101 trainable parameters. 

	No. epoch: 1/30                      
	   Train Loss: 0.362 | Train Acc: 87.419%
	    Val. Loss: 0.299 |  Val. Acc: 95.312%


	No. epoch: 2/30                      
	   Train Loss: 0.295 | Train Acc: 95.797%
	    Val. Loss: 0.293 |  Val. Acc: 97.701%


	No. epoch: 3/30                      
	   Train Loss: 0.292 | Train Acc: 96.602%
	    Val. Loss: 0.291 |  Val. Acc: 98.263%


	No. epoch: 4/30                      
	   Train Loss: 0.288 | Train Acc: 98.118%
	    Val. Loss: 0.289 |  Val. Acc: 99.251%


	No. epoch: 5/30                      
	   Train Loss: 0.289 | Train Acc: 97.813%
	    Val. Lo

In [8]:
# The label-2 model was trained with nn.PoissonNLLLoss (log_input=True by
# default), so the prediction is exponentiated to read it as a rate.
# Assumes predict() returns the raw network output — TODO confirm.
molecule_query = "N = C ( N C = O ) C = O"
log_rate = predict(l2_model_bilstm, molecule_query, TEXT)
np.exp(log_rate)

1.9738332742871547

# Label 3 — Normal (Gaussian) target, trained with MSE loss

In [9]:
%%time

# Text field: batches are (batch, seq) and, because include_lengths=True, each
# batch also carries the per-example sequence lengths.
TEXT = data.Field(batch_first=True,
                  include_lengths=True)

# Numeric target: not a sequence, no vocabulary, parsed from the CSV with float().
LABEL = data.Field(sequential=False, dtype=torch.float, batch_first=True,
                   use_vocab=False, preprocessing=float)

# One entry per CSV column, in file order. (None, None) skips a column; here
# the fourth column is the regression target.
fields = [('text', TEXT), (None, None), (None, None),
          ('label', LABEL), (None, None)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# random.seed() returns None, so it must not be passed as `random_state`.
# Seed the global RNG first, then hand split() the state tuple it expects.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# NOTE(review): the vocabulary is built from the full dataset (train + valid),
# not from train_data only — confirm this is intended.
TEXT.build_vocab(training_data, min_freq=min_freq_words)


# Bucketed iterators: examples of similar length are batched together and each
# batch is sorted by text length.
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size=batch_size,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch=True,
                                                            device=device)

Wall time: 1.23 s


In [10]:
%%time

# Set seed.
torch.manual_seed(1997)
torch.backends.cudnn.deterministic = True

# instantiate the model
l3_model_bilstm = leaky_lstm(num_embeddings=len(TEXT.vocab),
                             embedding_dim=300,
                             hidden_size=300,
                             n_classes=1,
                             dropout=0,
                             bidirectional=True)

print(l3_model_bilstm, "\n")
print(
    f'The model has {count_parameters(l3_model_bilstm):,} trainable parameters.', "\n")

# Let us use Adam.
OPT = optim.Adam(l3_model_bilstm.parameters(), lr=0.001)

# Specify criterion.
CRIT = nn.MSELoss()

train(l3_model_bilstm,
      train_iterator,
      valid_iterator,
      n_epochs,
      OPT,
      CRIT,
      accuracy_function=None,
      save=True,
      saving_path="model_bck/l3_model_bilstm.pt")

leaky_lstm(
  (embedding): Embedding(15, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0, inplace=False)
  (linear1): Linear(in_features=600, out_features=600, bias=True)
  (linear2): Linear(in_features=600, out_features=600, bias=True)
  (linear3): Linear(in_features=600, out_features=1, bias=True)
) 

The model has 2,171,101 trainable parameters. 

	No. epoch: 1/30                      
	   Train Loss: 318.058
	    Val. Loss: 63.227


	No. epoch: 2/30                      
	   Train Loss: 94.958
	    Val. Loss: 140.299


	No. epoch: 3/30                      
	   Train Loss: 73.459
	    Val. Loss: 140.996


	No. epoch: 4/30                      
	   Train Loss: 50.505
	    Val. Loss: 40.928


	No. epoch: 5/30                      
	   Train Loss: 7.371
	    Val. Loss: 1.517


	No. epoch: 6/30                      
	   Train Loss: 1.319
	    Val. Loss: 1.357


	No. epoch: 7/30                      
	   Train Loss: 4.601
	    Val. Loss: 1.47

In [11]:
# Spot-check the label-3 model on one space-separated molecule string.
molecule_query = "N = C ( N C = O ) C = O"
predict(l3_model_bilstm, molecule_query, TEXT)

99.77494049072266

# Label 4 — Poisson target, trained with Poisson negative log-likelihood loss

In [12]:
%%time

# Text field: batches are (batch, seq) and, because include_lengths=True, each
# batch also carries the per-example sequence lengths.
TEXT = data.Field(batch_first=True,
                  include_lengths=True)

# Numeric target: not a sequence, no vocabulary, parsed from the CSV with float().
LABEL = data.Field(sequential=False, dtype=torch.float, batch_first=True,
                   use_vocab=False, preprocessing=float)

# One entry per CSV column, in file order. (None, None) skips a column; here
# the fifth (last) column is the target.
fields = [('text', TEXT), (None, None), (None, None),
          (None, None), ('label', LABEL)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# random.seed() returns None, so it must not be passed as `random_state`.
# Seed the global RNG first, then hand split() the state tuple it expects.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# NOTE(review): the vocabulary is built from the full dataset (train + valid),
# not from train_data only — confirm this is intended.
TEXT.build_vocab(training_data, min_freq=min_freq_words)


# Bucketed iterators: examples of similar length are batched together and each
# batch is sorted by text length.
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size=batch_size,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch=True,
                                                            device=device)

Wall time: 1.01 s


In [13]:
%%time

# Set seed.
torch.manual_seed(1997)
torch.backends.cudnn.deterministic = True

# instantiate the model
l4_model_bilstm = leaky_lstm(num_embeddings=len(TEXT.vocab),
                             embedding_dim=300,
                             hidden_size=300,
                             n_classes=1,
                             dropout=0,
                             bidirectional=True)

print(l4_model_bilstm, "\n")
print(f'The model has {count_parameters(l4_model_bilstm):,} trainable parameters.', "\n")

# Let us use Adam.
OPT = optim.Adam(l4_model_bilstm.parameters(), lr=0.001)

# Specify criterion.
CRIT = nn.PoissonNLLLoss()

train(l4_model_bilstm,
      train_iterator,
      valid_iterator,
      n_epochs,
      OPT,
      CRIT,
      save=True,
      accuracy_function=poisson_accuracy,
      saving_path="model_bck/l4_model_bilstm.pt")

leaky_lstm(
  (embedding): Embedding(15, 300)
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0, inplace=False)
  (linear1): Linear(in_features=600, out_features=600, bias=True)
  (linear2): Linear(in_features=600, out_features=600, bias=True)
  (linear3): Linear(in_features=600, out_features=1, bias=True)
) 

The model has 2,171,101 trainable parameters. 

	No. epoch: 1/30                      
	   Train Loss: 0.269 | Train Acc: 82.561%
	    Val. Loss: 0.213 |  Val. Acc: 87.339%


	No. epoch: 2/30                      
	   Train Loss: 0.244 | Train Acc: 90.381%
	    Val. Loss: 0.204 |  Val. Acc: 93.543%


	No. epoch: 3/30                      
	   Train Loss: 0.247 | Train Acc: 90.96%
	    Val. Loss: 0.206 |  Val. Acc: 92.066%


	No. epoch: 4/30                      
	   Train Loss: 0.237 | Train Acc: 95.039%
	    Val. Loss: 0.203 |  Val. Acc: 94.182%


	No. epoch: 5/30                      
	   Train Loss: 0.234 | Train Acc: 96.557%
	    Val. Los

In [14]:
# The label-4 model was trained with nn.PoissonNLLLoss (log_input=True by
# default), so the prediction is exponentiated to read it as a rate.
# Assumes predict() returns the raw network output — TODO confirm.
molecule_query = "N = C ( N C = O ) C = O"
log_rate = predict(l4_model_bilstm, molecule_query, TEXT)
np.exp(log_rate)

4.1618937648514803e-11

In [15]:
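# Optional: reload a saved checkpoint instead of retraining.
# NOTE(review): assumes train(..., save=True) stores the whole model object
# rather than a state_dict — confirm before uncommenting.
# model = torch.load("model_bck/l4_model_bilstm.pt")
# model.eval()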