In [1]:
%matplotlib inline


In [2]:
import os
import sys
import datetime
from pathlib import Path
import logging
import torch
from transformers import T5Tokenizer
from pytorch_lightning import Trainer, LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from src.models.model_T5 import FlanT5FineTuner
from src.data_loader import create_dataloaders
from src.utils.config import CONFIG
from bertviz import model_view, head_view
import matplotlib.pyplot as plt
import seaborn as sns

# Set the specific GPU to use
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Adjust the index to select a different GPU

# Check if CUDA is available and print the device
if torch.cuda.is_available():
    print("CUDA is available. Configuring to use GPU.")
    device = torch.device("cuda")
else:
    print("CUDA is not available. Using CPU.")
    device = torch.device("cpu")

def setup_trainer(model_dir):
    checkpoint_callback = ModelCheckpoint(
        dirpath=model_dir,
        filename='checkpoint-{epoch:02d}-{val_loss:.2f}',
        save_top_k=1,
        monitor='val_loss',
        mode='min',
        verbose=True
    )
    
    tensorboard_logger = TensorBoardLogger(save_dir=model_dir, name="training_logs")
    trainer = Trainer(
            max_epochs=CONFIG["max_epochs"],
            accelerator='gpu' if torch.cuda.is_available() else 'cpu',
            devices=1 if torch.cuda.is_available() else None,
            callbacks=[checkpoint_callback],
            logger=tensorboard_logger
        )
    return trainer

def setup_model(model_dir):
    model = FlanT5FineTuner(CONFIG["model_name"], model_dir)
    model = model.to(device)  # Explicitly move your model to the correct device
    return model

def setup_dataloaders(model, tokenizer):
    data_path = CONFIG["data_dir"] / 'transformed'
    batch_size = CONFIG["batch_size"]
    num_workers = CONFIG["num_workers"]

    dataloaders = create_dataloaders(data_path, tokenizer, batch_size, num_workers)
    return dataloaders

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Directory for models and logs
model_timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H")
model_dir = Path(CONFIG["models_dir"]) / f"model_{model_timestamp}"
model_dir.mkdir(parents=True, exist_ok=True)

# Setup Model, Trainer, and Dataloaders
tokenizer = T5Tokenizer.from_pretrained(CONFIG["model_name"])
model = setup_model(model_dir)
dataloaders = setup_dataloaders(model, tokenizer)
trainer = setup_trainer(model_dir)

# Training
try:
    trainer.fit(model, dataloaders['train_supervised_small_sample'], dataloaders['dev_data_sample'])
    trainer.test(model, dataloaders['test_data_sample'])
except Exception as e:
    logger.exception("An error occurred during training or testing.")

# Generate text and extract attentions
input_batch = next(iter(dataloaders['dev_data_sample']))
input_ids = input_batch['input_ids'].to(device)
attention_mask = input_batch['attention_mask'].to(device)

generated_texts, attentions = model.generate_text(input_ids, attention_mask)

# Visualize attention
model.visualize_attention_bertviz(input_ids, attention_mask, input_batch['original_ending'], attentions)


  from .autonotebook import tqdm as notebook_tqdm


CUDA is available. Configuring to use GPU.


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Checking file: /data/agirard/Projects/Timetravel/data/transformed/train_supervised_small_sample.json
Checking file: /data/agirard/Projects/Timetravel/data/transformed/dev_data_sample.json
Checking file: /data/agirard/Projects/Timetravel/data/transformed/test_data_sample.json


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A40') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/data/agirard/anaconda3/envs/myenv/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /data/agirard/Projects/Timetravel/models/model_2024-05-21-14 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 247 M 
-----------------------------------------------------
247 M     Trainable params

Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]

ERROR:__main__:An error occurred during training or testing.
Traceback (most recent call last):
  File "/tmp/ipykernel_1466926/1528999539.py", line 79, in <module>
    trainer.fit(model, dataloaders['train_supervised_small_sample'], dataloaders['dev_data_sample'])
  File "/data/agirard/anaconda3/envs/myenv/lib/python3.11/site-packages/pytorch_lightning/trainer/trainer.py", line 544, in fit
    call._call_and_handle_interrupt(
  File "/data/agirard/anaconda3/envs/myenv/lib/python3.11/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/agirard/anaconda3/envs/myenv/lib/python3.11/site-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/data/agirard/anaconda3/envs/myenv/lib/python3.11/site-packages/pytorch_lightning/trainer/trainer.py", line 989, in _run
    results = self._run_stage()
        

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)