In [None]:
from utils.neural_net_tools import *
from utils.leaky_lstm import *
import torch
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
nlp = spacy.load('en_core_web_sm')


%load_ext autoreload
%autoreload 2

In [None]:
# --- Run configuration (shared by every section below) ---

# Reproducibility and training schedule.
seed = 1997
n_epochs = 10
batch_size = 128

# Dataset location and on-disk format.
path_to_data = "dataset/text_molecule.csv"
format_ = "csv"

# Fraction of the examples assigned to the training split; the remainder
# becomes the validation split.
split_ratio = 5 / 6

# Minimum token frequency handed to the vocabulary builder.
min_freq_words = 1

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Label 1 — Normal (Gaussian) target, trained with MSE loss

In [None]:
%%time

# Text column: batches are batch-first and, because of include_lengths, each
# batch carries the sequence lengths alongside the padded tensor.
TEXT = data.Field(batch_first=True,
                  include_lengths=True)

# Numeric target: each CSV cell is converted with float(); no vocabulary.
LABEL = data.Field(sequential=False, use_vocab=False,
                   preprocessing=float, dtype=torch.float,
                   batch_first=True)

# One entry per CSV column, in file order. (None, None) drops a column, so
# this section keeps the text and the first of the four label columns.
fields = [('text', TEXT),
          ('label', LABEL),
          (None, None),
          (None, None),
          (None, None)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# random.seed() returns None, so it must not be passed as random_state.
# Seed first, then hand split() the resulting generator state so the
# train/validation partition is reproducible.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# Build the vocabulary from the training split only, so tokens that occur
# solely in validation examples do not leak into the vocabulary.
TEXT.build_vocab(train_data, min_freq=min_freq_words)

# Batch examples of similar length together; each batch is sorted by length.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=batch_size,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    device=device)

In [None]:
%%time

# Seed PyTorch from the shared `seed` constant (rather than a repeated
# literal) and request deterministic cuDNN kernels so runs are repeatable.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Bidirectional model sized to the current vocabulary.
# n_classes=1: presumably a single scalar output -- confirm in utils.leaky_lstm.
l1_model_bilstm = leaky_lstm(num_embeddings=len(TEXT.vocab),
                             embedding_dim=300,
                             hidden_size=300,
                             n_classes=1,
                             dropout=0,
                             bidirectional=True)

print(l1_model_bilstm, "\n")
print(f'The model has {count_parameters(l1_model_bilstm):,} trainable parameters.', "\n")

# Adam optimiser over all model parameters.
OPT = optim.Adam(l1_model_bilstm.parameters(), lr=0.001)

# Mean squared error between the prediction and label 1.
CRIT = nn.MSELoss()

# NOTE(review): the iterators place batches on `device`, but the model is not
# moved there in this cell -- confirm that train() takes care of it.
# No accuracy function is supplied for the MSE objective.
train(l1_model_bilstm,
      train_iterator,
      valid_iterator,
      n_epochs,
      OPT,
      CRIT,
      accuracy_function=None,
      save=True,
      saving_path="model_bck/l1_model_bilstm.bin")

In [None]:
# Spot-check the label-1 model on a single space-separated token string.
predict(
    l1_model_bilstm,
    "N = C ( N C = O ) C = O",
    TEXT,
)

# Label 2 — Poisson target, trained with Poisson NLL loss

In [None]:
%%time

# Text column: batches are batch-first and, because of include_lengths, each
# batch carries the sequence lengths alongside the padded tensor.
TEXT = data.Field(batch_first=True,
                  include_lengths=True)

# Numeric target: each CSV cell is converted with float(); no vocabulary.
LABEL = data.Field(sequential=False, use_vocab=False,
                   preprocessing=float, dtype=torch.float,
                   batch_first=True)

# One entry per CSV column, in file order. (None, None) drops a column, so
# this section keeps the text and the second of the four label columns.
fields = [('text', TEXT),
          (None, None),
          ('label', LABEL),
          (None, None),
          (None, None)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# random.seed() returns None, so it must not be passed as random_state.
# Seed first, then hand split() the resulting generator state so the
# train/validation partition is reproducible.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# Build the vocabulary from the training split only, so tokens that occur
# solely in validation examples do not leak into the vocabulary.
TEXT.build_vocab(train_data, min_freq=min_freq_words)

# Batch examples of similar length together; each batch is sorted by length.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=batch_size,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    device=device)

In [None]:
%%time

# Seed PyTorch from the shared `seed` constant (rather than a repeated
# literal) and request deterministic cuDNN kernels so runs are repeatable.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Bidirectional model sized to the current vocabulary.
# n_classes=1: presumably a single scalar output -- confirm in utils.leaky_lstm.
l2_model_bilstm = leaky_lstm(num_embeddings=len(TEXT.vocab),
                             embedding_dim=300,
                             hidden_size=300,
                             n_classes=1,
                             dropout=0,
                             bidirectional=True)

print(l2_model_bilstm, "\n")
print(f'The model has {count_parameters(l2_model_bilstm):,} trainable parameters.', "\n")

# Adam optimiser over all model parameters.
OPT = optim.Adam(l2_model_bilstm.parameters(), lr=0.001)

# Poisson negative log-likelihood. With the default log_input=True the model
# output is interpreted as the logarithm of the Poisson rate.
CRIT = nn.PoissonNLLLoss()

# NOTE(review): the iterators place batches on `device`, but the model is not
# moved there in this cell -- confirm that train() takes care of it.
train(l2_model_bilstm,
      train_iterator,
      valid_iterator,
      n_epochs,
      OPT,
      CRIT,
      accuracy_function=poisson_accuracy,
      save=True,
      saving_path="model_bck/l2_model_bilstm.bin")

In [None]:
# Spot-check the label-2 model on a single space-separated token string.
# The result of predict() is exponentiated before being displayed.
np.exp(
    predict(
        l2_model_bilstm,
        "N = C ( N C = O ) C = O",
        TEXT,
    )
)

# Label 3 — Normal (Gaussian) target, trained with MSE loss

In [None]:
%%time

# Text column: batches are batch-first and, because of include_lengths, each
# batch carries the sequence lengths alongside the padded tensor.
TEXT = data.Field(batch_first=True,
                  include_lengths=True)

# Numeric target: each CSV cell is converted with float(); no vocabulary.
LABEL = data.Field(sequential=False, use_vocab=False,
                   preprocessing=float, dtype=torch.float,
                   batch_first=True)

# One entry per CSV column, in file order. (None, None) drops a column, so
# this section keeps the text and the third of the four label columns.
fields = [('text', TEXT),
          (None, None),
          (None, None),
          ('label', LABEL),
          (None, None)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# random.seed() returns None, so it must not be passed as random_state.
# Seed first, then hand split() the resulting generator state so the
# train/validation partition is reproducible.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# Build the vocabulary from the training split only, so tokens that occur
# solely in validation examples do not leak into the vocabulary.
TEXT.build_vocab(train_data, min_freq=min_freq_words)

# Batch examples of similar length together; each batch is sorted by length.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=batch_size,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    device=device)

In [None]:
%%time

# Seed PyTorch from the shared `seed` constant (rather than a repeated
# literal) and request deterministic cuDNN kernels so runs are repeatable.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Bidirectional model sized to the current vocabulary.
# n_classes=1: presumably a single scalar output -- confirm in utils.leaky_lstm.
l3_model_bilstm = leaky_lstm(num_embeddings=len(TEXT.vocab),
                             embedding_dim=300,
                             hidden_size=300,
                             n_classes=1,
                             dropout=0,
                             bidirectional=True)

print(l3_model_bilstm, "\n")
print(f'The model has {count_parameters(l3_model_bilstm):,} trainable parameters.', "\n")

# Adam optimiser over all model parameters.
OPT = optim.Adam(l3_model_bilstm.parameters(), lr=0.001)

# Mean squared error between the prediction and label 3.
CRIT = nn.MSELoss()

# NOTE(review): the iterators place batches on `device`, but the model is not
# moved there in this cell -- confirm that train() takes care of it.
# No accuracy function is supplied for the MSE objective.
train(l3_model_bilstm,
      train_iterator,
      valid_iterator,
      n_epochs,
      OPT,
      CRIT,
      accuracy_function=None,
      save=True,
      saving_path="model_bck/l3_model_bilstm.bin")

In [None]:
# Spot-check the label-3 model on a single space-separated token string.
predict(
    l3_model_bilstm,
    "N = C ( N C = O ) C = O",
    TEXT,
)

# Label 4 — Poisson target, trained with Poisson NLL loss

In [None]:
%%time

# Text column: batches are batch-first and, because of include_lengths, each
# batch carries the sequence lengths alongside the padded tensor.
TEXT = data.Field(batch_first=True,
                  include_lengths=True)

# Numeric target: each CSV cell is converted with float(); no vocabulary.
LABEL = data.Field(sequential=False, use_vocab=False,
                   preprocessing=float, dtype=torch.float,
                   batch_first=True)

# One entry per CSV column, in file order. (None, None) drops a column, so
# this section keeps the text and the last of the four label columns.
fields = [('text', TEXT),
          (None, None),
          (None, None),
          (None, None),
          ('label', LABEL)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# random.seed() returns None, so it must not be passed as random_state.
# Seed first, then hand split() the resulting generator state so the
# train/validation partition is reproducible.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# Build the vocabulary from the training split only, so tokens that occur
# solely in validation examples do not leak into the vocabulary.
TEXT.build_vocab(train_data, min_freq=min_freq_words)

# Batch examples of similar length together; each batch is sorted by length.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=batch_size,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    device=device)

In [None]:
%%time

# Seed PyTorch from the shared `seed` constant (rather than a repeated
# literal) and request deterministic cuDNN kernels so runs are repeatable.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Bidirectional model sized to the current vocabulary.
# n_classes=1: presumably a single scalar output -- confirm in utils.leaky_lstm.
l4_model_bilstm = leaky_lstm(num_embeddings=len(TEXT.vocab),
                             embedding_dim=300,
                             hidden_size=300,
                             n_classes=1,
                             dropout=0,
                             bidirectional=True)

print(l4_model_bilstm, "\n")
print(f'The model has {count_parameters(l4_model_bilstm):,} trainable parameters.', "\n")

# Adam optimiser over all model parameters.
OPT = optim.Adam(l4_model_bilstm.parameters(), lr=0.001)

# Poisson negative log-likelihood. With the default log_input=True the model
# output is interpreted as the logarithm of the Poisson rate.
CRIT = nn.PoissonNLLLoss()

# NOTE(review): the iterators place batches on `device`, but the model is not
# moved there in this cell -- confirm that train() takes care of it.
train(l4_model_bilstm,
      train_iterator,
      valid_iterator,
      n_epochs,
      OPT,
      CRIT,
      accuracy_function=poisson_accuracy,
      save=True,
      saving_path="model_bck/l4_model_bilstm.bin")

In [None]:
# Spot-check the label-4 model on a single space-separated token string.
# The result of predict() is exponentiated before being displayed.
np.exp(
    predict(
        l4_model_bilstm,
        "N = C ( N C = O ) C = O",
        TEXT,
    )
)