In [2]:
import os
import sys
import datetime
from pathlib import Path
import logging
import torch
from transformers import T5Tokenizer
from pytorch_lightning import Trainer, LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from src.models.model_T5 import FlanT5FineTuner
from src.data_loader import create_dataloaders
from src.utils.config import CONFIG
from bertviz import model_view

# Restrict this process to one GPU; change the index to pick a different card.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Resolve the compute device once, then report which one was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    "CUDA is available. Configuring to use GPU."
    if device.type == "cuda"
    else "CUDA is not available. Using CPU."
)

def setup_trainer(model_dir):
    """Build the Lightning ``Trainer`` used for fine-tuning.

    The trainer keeps only the single best checkpoint (lowest ``val_loss``)
    in ``model_dir`` and writes TensorBoard logs under
    ``model_dir / "training_logs"``.

    Args:
        model_dir: Directory that receives checkpoints and TensorBoard logs.

    Returns:
        A ``Trainer`` configured for ``CONFIG["max_epochs"]`` epochs on one
        GPU when CUDA is available, otherwise on the CPU.
    """
    use_gpu = torch.cuda.is_available()

    checkpoint_callback = ModelCheckpoint(
        dirpath=model_dir,
        filename='checkpoint-{epoch:02d}-{val_loss:.2f}',
        save_top_k=1,
        monitor='val_loss',
        mode='min',
        verbose=True,
    )

    tensorboard_logger = TensorBoardLogger(save_dir=model_dir, name="training_logs")

    return Trainer(
        max_epochs=CONFIG["max_epochs"],
        accelerator='gpu' if use_gpu else 'cpu',
        # Always request exactly one device. The previous CPU fallback passed
        # `devices=None`, which recent Lightning releases reject for the CPU
        # accelerator (it requires a positive int), so the fallback path
        # failed before training could start.
        devices=1,
        callbacks=[checkpoint_callback],
        logger=tensorboard_logger,
    )

def setup_model(model_dir):
    """Create the fine-tuning module and place it on the selected device.

    Args:
        model_dir: Directory handed to ``FlanT5FineTuner`` as its second
            argument.

    Returns:
        The ``FlanT5FineTuner`` built from ``CONFIG["model_name"]``, after
        being moved to the module-level ``device``.
    """
    # Explicit device placement, done up front before the trainer sees the model.
    return FlanT5FineTuner(CONFIG["model_name"], model_dir).to(device)

def setup_dataloaders(model, tokenizer):
    """Create the dataloaders for the transformed dataset.

    Args:
        model: Not used by this function; kept so existing callers that pass
            the model keep working.
        tokenizer: Tokenizer forwarded to ``create_dataloaders``.

    Returns:
        Whatever ``create_dataloaders`` returns for the ``transformed``
        sub-directory of ``CONFIG["data_dir"]``, using the configured batch
        size and worker count.
    """
    return create_dataloaders(
        CONFIG["data_dir"] / 'transformed',
        tokenizer,
        CONFIG["batch_size"],
        CONFIG["num_workers"],
    )

# Configure root logging at INFO and grab a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Run directory for checkpoints and logs, keyed by date and hour; runs started
# within the same hour therefore share one directory.
model_timestamp = f"{datetime.datetime.now():%Y-%m-%d-%H}"
model_dir = Path(CONFIG["models_dir"]).joinpath(f"model_{model_timestamp}")
model_dir.mkdir(parents=True, exist_ok=True)

# Build the tokenizer, model, dataloaders and trainer, in that order.
tokenizer = T5Tokenizer.from_pretrained(CONFIG["model_name"])
model = setup_model(model_dir)
dataloaders = setup_dataloaders(model, tokenizer)
trainer = setup_trainer(model_dir)

# Fit on the small supervised sample, validate on the dev sample, then run the
# test loop. Any failure is logged with its traceback instead of propagating.
try:
    trainer.fit(
        model,
        dataloaders['train_supervised_small_sample'],
        dataloaders['dev_data_sample'],
    )
    trainer.test(
        model,
        dataloaders['test_data_sample'],
    )
except Exception:
    logger.exception("An error occurred during training or testing.")


CUDA is available. Configuring to use GPU.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5 Config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_attentions": true,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /data/agirard/Projects/Timetravel/models/model_2024-05-15-13/training_logs
/data/agirard/anaconda3/envs/myenv/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /data/agirard/Projects/Timetravel/models/model_2024-05-15-13 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 247 M 
-----------------------------------------------------
247 M     Trainable params
0         Non-trainable params
247 M     Total params
990.311   Total estimated model params size (MB)


Checking file: /data/agirard/Projects/Timetravel/data/transformed/train_supervised_small_sample.json
Checking file: /data/agirard/Projects/Timetravel/data/transformed/dev_data_sample.json
Checking file: /data/agirard/Projects/Timetravel/data/transformed/test_data_sample.json
Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  differential_weights_tensors = [torch.tensor(dw, dtype=torch.float).to(input_ids_padded.device) for dw in differential_weights]


Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]The forward outputs odict_keys(['loss', 'logits', 'past_key_values', 'decoder_attentions', 'cross_attentions', 'encoder_last_hidden_state', 'encoder_attentions'])
No attentions were returned. Check model configuration.
No attentions were returned. Check model configuration.
                                                                           

/data/agirard/anaconda3/envs/myenv/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
  differential_weights_tensors = [torch.tensor(dw, dtype=torch.float).to(input_ids_padded.device) for dw in differential_weights]


Epoch 0: 100%|██████████| 1/1 [00:00<00:00,  6.00it/s, v_num=0, train_loss_step=14.30]

  differential_weights_tensors = [torch.tensor(dw, dtype=torch.float).to(input_ids_padded.device) for dw in differential_weights]


The forward outputs odict_keys(['loss', 'logits', 'past_key_values', 'decoder_attentions', 'cross_attentions', 'encoder_last_hidden_state', 'encoder_attentions'])
No attentions were returned. Check model configuration.
No attentions were returned. Check model configuration.
Epoch 0: 100%|██████████| 1/1 [00:01<00:00,  0.76it/s, v_num=0, train_loss_step=14.30, val_loss=8.230, avg_val_loss=8.230, bleu_prediction_edited=0.248, bleu_prediction_cf=2.490, bleu_prediction_initial=2.790, bleu_prediction_original=0.211, bleu_edited_ending_cf=0.000, bleu_edited_ending_initial=0.000, bleu_edited_ending_original=0.000, rouge_prediction_edited_rouge-1_f=0.149, rouge_prediction_edited_rouge-1_p=0.417, rouge_prediction_edited_rouge-1_r=0.0913, rouge_prediction_edited_rouge-2_f=0.0294, rouge_prediction_edited_rouge-2_p=0.100, rouge_prediction_edited_rouge-2_r=0.0172, rouge_prediction_edited_rouge-l_f=0.149, rouge_prediction_edited_rouge-l_p=0.417, rouge_prediction_edited_rouge-l_r=0.0913, rouge_predic

Epoch 0, global step 1: 'val_loss' reached 8.22616 (best 8.22616), saving model to '/data/agirard/Projects/Timetravel/models/model_2024-05-15-13/checkpoint-epoch=00-val_loss=8.23.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 1/1 [00:10<00:00,  0.10it/s, v_num=0, train_loss_step=14.30, val_loss=8.230, avg_val_loss=8.230, bleu_prediction_edited=0.248, bleu_prediction_cf=2.490, bleu_prediction_initial=2.790, bleu_prediction_original=0.211, bleu_edited_ending_cf=0.000, bleu_edited_ending_initial=0.000, bleu_edited_ending_original=0.000, rouge_prediction_edited_rouge-1_f=0.149, rouge_prediction_edited_rouge-1_p=0.417, rouge_prediction_edited_rouge-1_r=0.0913, rouge_prediction_edited_rouge-2_f=0.0294, rouge_prediction_edited_rouge-2_p=0.100, rouge_prediction_edited_rouge-2_r=0.0172, rouge_prediction_edited_rouge-l_f=0.149, rouge_prediction_edited_rouge-l_p=0.417, rouge_prediction_edited_rouge-l_r=0.0913, rouge_prediction_cf_rouge-1_f=0.100, rouge_prediction_cf_rouge-1_p=0.167, rouge_prediction_cf_rouge-1_r=0.0714, rouge_prediction_cf_rouge-2_f=0.000, rouge_prediction_cf_rouge-2_p=0.000, rouge_prediction_cf_rouge-2_r=0.000, rouge_prediction_cf_rouge-l_f=0.100, rouge_prediction_cf_rouge-l

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

  differential_weights_tensors = [torch.tensor(dw, dtype=torch.float).to(input_ids_padded.device) for dw in differential_weights]


Testing DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]The forward outputs odict_keys(['loss', 'logits', 'past_key_values', 'decoder_attentions', 'cross_attentions', 'encoder_last_hidden_state', 'encoder_attentions'])
No attentions were returned. Check model configuration.
No attentions were returned. Check model configuration.
Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  1.25it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
             Test metric                           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
             avg_val_loss                       1.638059377670288
   bart_edited_ending_cf_avg_score              -2.972923517227173
 bart_edited_ending_initial_avg_score          -2.7492918968200684
bart_edited_ending_original_avg_score          -1.2895852327346802
     bart_prediction_cf_a