In [None]:
import torch
from torchtext import data
import random
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
nlp = spacy.load('en_core_web_sm')

from utils.multihead_attention_classifier import *
from utils.neural_net_tools import *

%load_ext autoreload
%autoreload 2

# ---- Notebook-wide configuration (read by the data and training cells below) ----
# CSV file loaded by data.TabularDataset; the cells below map five columns
# (one text column followed by four label columns).
path_to_data = "dataset/text_molecule.csv"
# File format passed to TabularDataset(format=...).
format_ = "csv"
# Ratio handed to Dataset.split(); the rest becomes the validation set.
split_ratio = 5/6
# Minimum token frequency passed to TEXT.build_vocab(min_freq=...).
min_freq_words = 1
# Batch size used by the BucketIterators.
batch_size = 128
# Seed passed to random.seed() before the train/validation split.
seed = 1997
# Number of epochs passed to MultiheadAttentionClassifier_train.
n_epochs = 10
# Fixed sequence length passed to data.Field(fix_length=...).
fix_length = 22

# check whether cuda is available
# (the BucketIterators place their batches on this device)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Normal label 1

In [None]:
%%time

# Text field: batch-first tensors with per-example lengths, fixed to
# `fix_length` tokens per example.
TEXT = data.Field(batch_first=True,
                  include_lengths=True, fix_length=fix_length)

# Numeric target: no vocabulary, raw CSV value converted with float().
LABEL = data.Field(sequential=False, dtype=torch.float, batch_first=True,
                   use_vocab=False, preprocessing=float)

# Column mapping for label 1: column 1 is the text, column 2 is the target,
# the remaining three columns are ignored ((None, None) skips a column).
fields = [('text', TEXT), ('label', LABEL), (None, None), (None, None), (None, None)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# Seed Python's RNG, then pass its state explicitly. random.seed() returns
# None, so `random_state=random.seed(seed)` handed None to split() and the
# split was reproducible only through torchtext's fallback to the global
# RNG state. Passing random.getstate() yields the same split, explicitly.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# NOTE(review): the vocabulary is built from the full dataset (train and
# validation rows), not from train_data only -- confirm this is intended.
TEXT.build_vocab(training_data, min_freq=min_freq_words)

# Build the train/validation iterators; batches are sorted by text length
# and placed on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size=batch_size,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch=True,
                                                            device=device)

In [None]:
%%time

# Seed PyTorch from the notebook-wide `seed` so it cannot drift from the
# value used for the data split.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Instantiate the model on the device selected in the config cell. A
# hard-coded "cuda" fails on CPU-only hosts and could disagree with the
# device the iterators put their batches on. str(device) keeps the
# constructor argument a plain string ("cuda" on GPU hosts, as before).
# max_length follows `fix_length`, the sequence length the TEXT field uses.
l1_MultiheadAttentionClassifier = MultiheadAttentionClassifier(
    n_classes=1,
    num_embeddings=len(TEXT.vocab),
    embedding_dim=300,
    num_layers=10,
    heads=2,
    device=str(device),
    augmentation_factor=4,
    dropout=0,
    max_length=fix_length,
    pad_idx=TEXT.vocab.stoi[TEXT.pad_token]).to(device)

print(l1_MultiheadAttentionClassifier, "\n")
print(f'The model has {count_parameters(l1_MultiheadAttentionClassifier):,} trainable parameters.', "\n")

# Adam optimiser over all model parameters.
OPT = optim.Adam(l1_MultiheadAttentionClassifier.parameters(), lr=0.001)

# Label 1 is fitted with a mean-squared-error criterion.
CRIT = nn.MSELoss()

# Train for `n_epochs`, with no accuracy function, saving to model_bck/.
MultiheadAttentionClassifier_train(l1_MultiheadAttentionClassifier,
      train_iterator,
      valid_iterator,
      n_epochs,
      OPT,
      CRIT,
      accuracy_function=None,
      save=True,
      saving_path="model_bck/l1_MultiheadAttentionClassifier.bin")

# Poisson label 2

In [None]:
%%time

# Text field: batch-first tensors with per-example lengths, fixed to
# `fix_length` tokens per example.
TEXT = data.Field(batch_first=True,
                  include_lengths=True, fix_length=fix_length)

# Numeric target: no vocabulary, raw CSV value converted with float().
LABEL = data.Field(sequential=False, dtype=torch.float, batch_first=True,
                   use_vocab=False, preprocessing=float)

# Column mapping for label 2: column 1 is the text, column 3 is the target,
# the other label columns are ignored ((None, None) skips a column).
fields = [('text', TEXT), (None, None), ('label', LABEL), (None, None), (None, None)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# Seed Python's RNG, then pass its state explicitly. random.seed() returns
# None, so `random_state=random.seed(seed)` handed None to split() and the
# split was reproducible only through torchtext's fallback to the global
# RNG state. Passing random.getstate() yields the same split, explicitly.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# NOTE(review): the vocabulary is built from the full dataset (train and
# validation rows), not from train_data only -- confirm this is intended.
TEXT.build_vocab(training_data, min_freq=min_freq_words)

# Build the train/validation iterators; batches are sorted by text length
# and placed on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size=batch_size,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch=True,
                                                            device=device)

In [None]:
%%time

# Seed PyTorch from the notebook-wide `seed` so it cannot drift from the
# value used for the data split.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Instantiate the model on the device selected in the config cell. A
# hard-coded "cuda" fails on CPU-only hosts and could disagree with the
# device the iterators put their batches on. str(device) keeps the
# constructor argument a plain string ("cuda" on GPU hosts, as before).
# max_length follows `fix_length`, the sequence length the TEXT field uses.
l2_MultiheadAttentionClassifier = MultiheadAttentionClassifier(
    n_classes=1,
    num_embeddings=len(TEXT.vocab),
    embedding_dim=300,
    num_layers=1,
    heads=2,
    device=str(device),
    augmentation_factor=4,
    dropout=0,
    max_length=fix_length,
    pad_idx=TEXT.vocab.stoi[TEXT.pad_token]).to(device)

print(l2_MultiheadAttentionClassifier, "\n")
print(f'The model has {count_parameters(l2_MultiheadAttentionClassifier):,} trainable parameters.', "\n")

# Adam optimiser over all model parameters.
OPT = optim.Adam(l2_MultiheadAttentionClassifier.parameters(), lr=0.001)

# Label 2 is fitted with a Poisson negative log-likelihood criterion.
# NOTE(review): PoissonNLLLoss defaults to log_input=True, i.e. it treats the
# model output as the log of the rate -- confirm the model head matches.
CRIT = nn.PoissonNLLLoss()

# Train for `n_epochs`, reporting poisson_accuracy, saving to model_bck/.
MultiheadAttentionClassifier_train(l2_MultiheadAttentionClassifier,
      train_iterator,
      valid_iterator,
      n_epochs,
      OPT,
      CRIT,
      accuracy_function=poisson_accuracy,
      save=True,
      saving_path="model_bck/l2_MultiheadAttentionClassifier.bin")

# Normal label 3

In [None]:
%%time

# Text field: batch-first tensors with per-example lengths, fixed to
# `fix_length` tokens per example.
TEXT = data.Field(batch_first=True,
                  include_lengths=True, fix_length=fix_length)

# Numeric target: no vocabulary, raw CSV value converted with float().
LABEL = data.Field(sequential=False, dtype=torch.float, batch_first=True,
                   use_vocab=False, preprocessing=float)

# Column mapping for label 3: column 1 is the text, column 4 is the target,
# the other label columns are ignored ((None, None) skips a column).
fields = [('text', TEXT), (None, None), (None, None), ('label', LABEL), (None, None)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# Seed Python's RNG, then pass its state explicitly. random.seed() returns
# None, so `random_state=random.seed(seed)` handed None to split() and the
# split was reproducible only through torchtext's fallback to the global
# RNG state. Passing random.getstate() yields the same split, explicitly.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# NOTE(review): the vocabulary is built from the full dataset (train and
# validation rows), not from train_data only -- confirm this is intended.
TEXT.build_vocab(training_data, min_freq=min_freq_words)

# Build the train/validation iterators; batches are sorted by text length
# and placed on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size=batch_size,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch=True,
                                                            device=device)

In [None]:
%%time

# Seed PyTorch from the notebook-wide `seed` so it cannot drift from the
# value used for the data split.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Instantiate the model on the device selected in the config cell. A
# hard-coded "cuda" fails on CPU-only hosts and could disagree with the
# device the iterators put their batches on. str(device) keeps the
# constructor argument a plain string ("cuda" on GPU hosts, as before).
# max_length follows `fix_length`, the sequence length the TEXT field uses.
l3_MultiheadAttentionClassifier = MultiheadAttentionClassifier(
    n_classes=1,
    num_embeddings=len(TEXT.vocab),
    embedding_dim=300,
    num_layers=10,
    heads=2,
    device=str(device),
    augmentation_factor=4,
    dropout=0,
    max_length=fix_length,
    pad_idx=TEXT.vocab.stoi[TEXT.pad_token]).to(device)

print(l3_MultiheadAttentionClassifier, "\n")
print(f'The model has {count_parameters(l3_MultiheadAttentionClassifier):,} trainable parameters.', "\n")

# Adam optimiser over all model parameters.
OPT = optim.Adam(l3_MultiheadAttentionClassifier.parameters(), lr=0.001)

# Label 3 is fitted with a mean-squared-error criterion.
CRIT = nn.MSELoss()

# Train for `n_epochs`, with no accuracy function, saving to model_bck/.
MultiheadAttentionClassifier_train(l3_MultiheadAttentionClassifier,
      train_iterator,
      valid_iterator,
      n_epochs,
      OPT,
      CRIT,
      accuracy_function=None,
      save=True,
      saving_path="model_bck/l3_MultiheadAttentionClassifier.bin")

# Poisson label 4

In [None]:
%%time

# Text field: batch-first tensors with per-example lengths, fixed to
# `fix_length` tokens per example.
TEXT = data.Field(batch_first=True,
                  include_lengths=True, fix_length=fix_length)

# Numeric target: no vocabulary, raw CSV value converted with float().
LABEL = data.Field(sequential=False, dtype=torch.float, batch_first=True,
                   use_vocab=False, preprocessing=float)

# Column mapping for label 4: column 1 is the text, column 5 is the target,
# the other label columns are ignored ((None, None) skips a column).
fields = [('text', TEXT), (None, None), (None, None), (None, None), ('label', LABEL)]

training_data = data.TabularDataset(path=path_to_data,
                                    format=format_,
                                    fields=fields,
                                    skip_header=True)

# Seed Python's RNG, then pass its state explicitly. random.seed() returns
# None, so `random_state=random.seed(seed)` handed None to split() and the
# split was reproducible only through torchtext's fallback to the global
# RNG state. Passing random.getstate() yields the same split, explicitly.
random.seed(seed)
train_data, valid_data = training_data.split(split_ratio=split_ratio,
                                             random_state=random.getstate())

# NOTE(review): the vocabulary is built from the full dataset (train and
# validation rows), not from train_data only -- confirm this is intended.
TEXT.build_vocab(training_data, min_freq=min_freq_words)

# Build the train/validation iterators; batches are sorted by text length
# and placed on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size=batch_size,
                                                            sort_key=lambda x: len(x.text),
                                                            sort_within_batch=True,
                                                            device=device)

In [None]:
%%time

# Seed PyTorch from the notebook-wide `seed` so it cannot drift from the
# value used for the data split.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Instantiate the model on the device selected in the config cell. A
# hard-coded "cuda" fails on CPU-only hosts and could disagree with the
# device the iterators put their batches on. str(device) keeps the
# constructor argument a plain string ("cuda" on GPU hosts, as before).
# max_length follows `fix_length`, the sequence length the TEXT field uses.
l4_MultiheadAttentionClassifier = MultiheadAttentionClassifier(
    n_classes=1,
    num_embeddings=len(TEXT.vocab),
    embedding_dim=300,
    num_layers=1,
    heads=2,
    device=str(device),
    augmentation_factor=4,
    dropout=0,
    max_length=fix_length,
    pad_idx=TEXT.vocab.stoi[TEXT.pad_token]).to(device)

print(l4_MultiheadAttentionClassifier, "\n")
print(f'The model has {count_parameters(l4_MultiheadAttentionClassifier):,} trainable parameters.', "\n")

# Adam optimiser over all model parameters.
OPT = optim.Adam(l4_MultiheadAttentionClassifier.parameters(), lr=0.001)

# Label 4 is fitted with a Poisson negative log-likelihood criterion.
# NOTE(review): PoissonNLLLoss defaults to log_input=True, i.e. it treats the
# model output as the log of the rate -- confirm the model head matches.
CRIT = nn.PoissonNLLLoss()

# Train for `n_epochs`, reporting poisson_accuracy, saving to model_bck/.
MultiheadAttentionClassifier_train(l4_MultiheadAttentionClassifier,
      train_iterator,
      valid_iterator,
      n_epochs,
      OPT,
      CRIT,
      accuracy_function=poisson_accuracy,
      save=True,
      saving_path="model_bck/l4_MultiheadAttentionClassifier.bin")