In [None]:
import torch
import os, sys
from transformers import AutoModel, AutoTokenizer
from pathlib import Path

# Locate the project root: walk upwards from the working directory until a
# directory that contains an entry named 'src' is found.
HOME = os.getcwd()
current = HOME
while 'src' not in os.listdir(current):
    parent = Path(current).parent
    if parent == Path(current):
        # The parent of the filesystem root is the root itself; without this
        # check the loop would spin forever when no ancestor contains 'src'.
        raise FileNotFoundError(f"could not find a directory containing 'src' at or above {HOME}")
    current = parent

PARENT_DIR = str(current)

# Make the project root and its helper folders importable. Entries already on
# sys.path are skipped so re-running this cell does not pile up duplicates.
for extra_path in (
    PARENT_DIR,
    os.path.join(PARENT_DIR, 'data_analysis'),
    os.path.join(PARENT_DIR, 'evaluation'),
    os.path.join(PARENT_DIR, 'text_processing'),
):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [None]:
# Pretrained checkpoint, kept simple for the first iteration, and its tokenizer.
CHECKPOINT = 'roberta-base'
TOKENIZER = AutoTokenizer.from_pretrained(CHECKPOINT)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    NOTEBOOK_DEVICE = 'cuda'
else:
    NOTEBOOK_DEVICE = 'cpu'

# Folder holding the data files, relative to the project root.
DATA_FOLDER = os.path.join(PARENT_DIR, 'src', 'data')

In [None]:
# Load the processed CSV, keep only the rows whose 'source' and 'target' are
# both strings, and write the filtered table back to the same file.
from datasets import load_dataset

processed_csv = os.path.join(DATA_FOLDER, 'all_data_processed.csv')


def _has_text_pair(row):
    """Return True when both the 'source' and the 'target' field are strings."""
    return isinstance(row['source'], str) and isinstance(row['target'], str)


data = load_dataset('csv', data_files=processed_csv, split='train')
data = data.filter(function=_has_text_pair)
data.to_csv(processed_csv, index=False)

In [None]:
# Split a subset of the data with the project's data_split helper.
import src.data_preparation.prepare_data as pdr

# Work on at most 2000 rows. The cap matters because selecting indices past the
# end of the dataset fails when fewer than 2000 rows survived the filtering.
SUBSET_SIZE = min(2000, len(data))
train_data, val_data, test_data = pdr.data_split(all_data=data.select(range(SUBSET_SIZE)))
train_data, val_data, test_data

In [None]:
def prepare_labeled_data(batch):
    """Tokenize the 'source' and 'target' entries of a batch.

    Args:
        batch: mapping with 'source' and 'target' entries; both are handed to
            TOKENIZER unchanged.

    Returns:
        The tokenizer output for batch['source'], with an extra 'labels' entry
        holding the input_ids produced by tokenizing batch['target'].
    """
    # Truncate to at most 1024 tokens, but never beyond the length the
    # tokenizer reports for its model: a longer sequence would not fit a model
    # whose position embeddings stop at that length.
    max_length = min(1024, TOKENIZER.model_max_length)
    model_inputs = TOKENIZER(batch['source'], truncation=True, max_length=max_length)
    labels = TOKENIZER(batch['target'], truncation=True, max_length=max_length)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize each split in batches, then drop the raw 'source' / 'target' columns.
train_data, val_data, test_data = [
    split.map(prepare_labeled_data, batched=True).remove_columns(['source', 'target'])
    for split in (train_data, val_data, test_data)
]

In [None]:
# we are now ready to create the dataloader
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq, DataCollatorWithPadding
# Collator that pads inputs and labels batch by batch. The optional `model`
# argument is left out on purpose: the collator only uses it to call
# `prepare_decoder_input_ids_from_labels`, which the plain RoBERTa model does
# not provide, so loading a full model here only costs time and memory.
dc = DataCollatorForSeq2Seq(tokenizer=TOKENIZER)

# Shuffle only the training batches; validation keeps the dataset order.
train_dl = DataLoader(dataset=train_data, batch_size=4, shuffle=True, collate_fn=dc)
val_dl = DataLoader(dataset=val_data, batch_size=4, shuffle=False, collate_fn=dc)

In [None]:
# Pull a single batch from the training loader to inspect what the collator produces.
sample_batch = next(iter(train_dl))
sample_batch

In [None]:
# let's build the classifier for the Token prediction
import importlib
from torch import nn
import src.models.customSeq2Seq.model as s2s 
from src.models.customSeq2Seq.classification_head import ExponentialClassifier
from torch.optim import AdamW
# Pick up any edits made to the model module since it was first imported.
importlib.reload(s2s)

# Classification head for token prediction: one class per tokenizer vocabulary entry.
classifier = ExponentialClassifier(
    num_classes=TOKENIZER.vocab_size,
    in_features=768,
    num_layers=5,
)
encoder = s2s.RobertaBasedEncoder()
# The decoder receives the classification head built above.
decoder = s2s.RobertaBasedDecoder(token_classifier=classifier, num_layers=2)

# Encoder and decoder each get their own AdamW optimizer with the same learning rate.
e_opt, d_opt = (AdamW(module.parameters(), lr=0.01) for module in (encoder, decoder))
criterion = nn.CrossEntropyLoss()

In [None]:
from seq2seq_model import BertBasedEncoder, DecoderRNN
from typing import Optional, Union
from pathlib import Path
from tqdm import tqdm
from src.training_utilities.exp_tracking import create_summary_writer, _add_metric, report_results
from src.training_utilities.pytorch_utilities import save_model

In [None]:
import importlib
# NOTE(review): `train` is imported by its bare name here, but as
# `src.models.customSeq2Seq.train` in the last cell — confirm both resolve to
# the same module.
import train as tr
# Re-execute the module so edits to it are picked up without restarting the kernel.
importlib.reload(tr)

def train_model(
                encoder: BertBasedEncoder, 
                decoder: DecoderRNN,
                train_dataloader: DataLoader[torch.Tensor],
                test_dataloader: DataLoader[torch.Tensor],
                loss_function,
                e_opt, 
                d_opt,
                epochs: int = 5,
                log_dir: Optional[Union[Path, str]] = None,
                save_path: Optional[Union[Path, str]] = None,
                ):
    """Train the encoder/decoder pair for a number of epochs and log the results.

    Every epoch runs `tr.train_per_epoch` on `train_dataloader` and
    `tr.val_per_epoch` on `test_dataloader`, reports the losses and accuracies,
    and forwards them to the summary writer.

    Args:
        encoder: encoder module passed to the per-epoch helpers.
        decoder: decoder module passed to the per-epoch helpers.
        train_dataloader: batches used by `tr.train_per_epoch`.
        test_dataloader: batches used by `tr.val_per_epoch`.
        loss_function: criterion passed to both per-epoch helpers.
        e_opt: optimizer passed to `tr.train_per_epoch` for the encoder.
        d_opt: optimizer passed to `tr.train_per_epoch` for the decoder.
        epochs: number of epochs to run.
        log_dir: directory handed to `create_summary_writer`; no writer is
            created when it is None.
        save_path: directory in which 'encoder.pt' and 'decoder.pt' are saved.
            Falls back to `log_dir`; when both are None nothing is saved.

    Returns:
        The `(encoder, decoder)` tuple recorded at the epoch with the lowest
        training loss, or None if no epoch improved on the initial infinite
        loss (for example when `epochs` is 0). The tuple holds references to
        the live modules, not copies of their weights.
    """
    save_path = save_path if save_path is not None else log_dir

    min_training_loss, best_model = float('inf'), None

    # before proceeding with the training, let's set the summary writer
    writer, log_dir = (None, None) if (log_dir is None) else create_summary_writer(log_dir, return_path=True)

    for epoch in tqdm(range(epochs)):
        epoch_train_loss, epoch_train_acc = tr.train_per_epoch(encoder=encoder,
                                                               decoder=decoder,
                                                               e_opt=e_opt,
                                                               d_opt=d_opt,
                                                               train_dataloader=train_dataloader,
                                                               loss_function=loss_function)

        epoch_val_loss, epoch_val_acc = tr.val_per_epoch(encoder=encoder,
                                                         decoder=decoder,
                                                         dataloader=test_dataloader,
                                                         loss_function=loss_function)

        # Record the best pair in the same branch that lowers the running
        # minimum; comparing again after the minimum was updated can never
        # succeed and would leave best_model as None.
        if epoch_train_loss < min_training_loss:
            min_training_loss = epoch_train_loss
            best_model = (encoder, decoder)

        report_results(train_losses_dict={"train_loss": epoch_train_loss, "train_acc": epoch_train_acc}, 
                       val_losses_dict={"val_accuracy": epoch_val_acc, "val_loss": epoch_val_loss})

        _add_metric(writer,
                    tag='loss', 
                    values={"train_loss": epoch_train_loss, "val_loss": epoch_val_loss},
                    epoch=epoch)

        # accuracies are logged under accuracy keys, not the loss keys
        _add_metric(writer,
                    tag='accuracy', 
                    values={"train_acc": epoch_train_acc, "val_acc": epoch_val_acc},
                    epoch=epoch)

    # Save the final encoder and decoder. Skipped when no destination is known:
    # joining a None path would raise a TypeError after the whole training run.
    if save_path is not None:
        save_model(model=encoder, path=os.path.join(save_path, 'encoder.pt'))
        save_model(model=decoder, path=os.path.join(save_path, 'decoder.pt'))

    return best_model

In [None]:
import importlib
import src.models.customSeq2Seq.train as tr
import src.models.customSeq2Seq.model as s2s

# Reload both project modules before launching the run.
for _module in (tr, s2s):
    importlib.reload(_module)

# Start training with the default number of epochs; neither log_dir nor
# save_path is supplied here.
train_model(
    encoder=encoder,
    decoder=decoder,
    train_dataloader=train_dl,
    test_dataloader=val_dl,
    loss_function=criterion,
    e_opt=e_opt,
    d_opt=d_opt,
)