In [None]:
!python preprocessing.ipynb

In [None]:
!pip install transformers datasets peft bert_score rouge_score evaluate sacrebleu



In [None]:
!pip install wandb -qU

In [None]:
import wandb

In [None]:
# Authenticate this session with Weights & Biases. In the recorded run this
# prompted for an API key and stored it in the user's netrc file.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcgpknowledge[0m ([33mcgpknowledge-indraprastha-institute-of-information-techn[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

**Imports**

In [None]:

# --- Imports ---
import json
import torch
import evaluate
import numpy as np
from tqdm import tqdm
from datasets import Dataset
from torch.utils.data import DataLoader
import os

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)

print("Libraries imported successfully.")

# --- Setup Device ---
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    # Report which GPU (index 0) was picked up and its compute capability.
    gpu_index = 0
    print(f"CUDA Device Name: {torch.cuda.get_device_name(gpu_index)}")
    print(f"CUDA Capability: {torch.cuda.get_device_capability(gpu_index)}")


Libraries imported successfully.
Using device: cuda
CUDA Device Name: Tesla T4
CUDA Capability: (7, 5)


**Data Loading**

In [None]:
# --- Data Loading Function ---
def load_data(json_path):
    """Read (input, summary) string pairs from a JSON file.

    The file is expected to hold a list of objects whose "input_text" value is
    a two-element list: element 0 is used as the model input and element 1 as
    the reference summary. Entries that do not match are counted and skipped.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        A tuple ``(inputs, summaries)`` of equally long lists of ``str``.
        Both lists are empty when the file is missing, is not valid JSON,
        does not contain a list, or any other error occurs; in each of those
        cases a message is printed instead of raising.
    """
    def _is_pair(entry):
        # A usable record is a dict carrying a two-element "input_text" list.
        if not isinstance(entry, dict) or "input_text" not in entry:
            return False
        payload = entry["input_text"]
        return isinstance(payload, list) and len(payload) == 2

    if not os.path.exists(json_path):
        print(f"Error: File not found at {json_path}")
        return [], []

    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)

        if not isinstance(records, list):
            print(f"Warning: JSON file at {json_path} should contain a list of objects.")
            return [], []

        pairs = [entry["input_text"] for entry in records if _is_pair(entry)]
        # Coerce both halves to str so downstream tokenization always sees text.
        inputs = [str(pair[0]) for pair in pairs]
        summaries = [str(pair[1]) for pair in pairs]
        malformed_count = len(records) - len(pairs)

        if malformed_count > 0:
            print(f"Warning: Filtered out {malformed_count} items from {json_path} due to missing/incorrect 'input_text' format.")
        print(f"Loaded {len(inputs)} examples from {json_path}")
        return inputs, summaries
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_path}. Check file integrity.")
        return [], []
    except Exception as e:
        print(f"An unexpected error occurred loading data from {json_path}: {e}")
        return [], []


In [None]:
# --- Define File Paths ---
train_file = "train_preprocess_v2.json"
val_file = "validation_preprocess_v2.json"

# --- Load Data ---
print("Loading data...")
train_inputs, train_targets = load_data(train_file)
val_inputs, val_targets = load_data(val_file)
# test_inputs, test_targets = load_data(test_file) # Uncomment if needed

if not train_inputs or not val_inputs:
    # Raise instead of calling exit(): inside a Jupyter/Colab kernel exit()
    # shuts the kernel down, discarding all state. An exception stops
    # "Run All" at this cell and keeps the message visible.
    raise RuntimeError(
        "Could not load sufficient training or validation data. "
        "Please check file paths and content."
    )
print("Data loaded.")

Loading data...
Loaded 131 examples from train_preprocess_v2.json
Loaded 21 examples from validation_preprocess_v2.json
Data loaded.


**Model and Tokenizer**

In [None]:

# --- Model and Tokenizer ---
model_name = "t5-large"
print(f"Loading tokenizer and model '{model_name}'...")
try:
    # Using legacy=False is recommended for T5 tokenizers if available
    tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer '{model_name}': {e}")
    print("Please ensure the model name is correct and you have internet access if downloading.")
    # Re-raise instead of exit(): exit() shuts down the notebook kernel and
    # hides the traceback, whereas re-raising stops execution at this cell.
    raise


Loading tokenizer and model 't5-large'...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model and tokenizer loaded successfully.


In [None]:
# --- Constants ---
# Token budgets: MAX_INPUT caps the tokenized source text, MAX_OUTPUT caps the
# tokenized target summary.
MAX_INPUT = 512
MAX_OUTPUT = 150
print("MAX_INPUT_LENGTH={}, MAX_OUTPUT_LENGTH={}".format(MAX_INPUT, MAX_OUTPUT))

MAX_INPUT_LENGTH=512, MAX_OUTPUT_LENGTH=150


**Creating Datasets with the Hugging Face `datasets` Library**

In [None]:

# --- Create Hugging Face Datasets ---
print("Creating Hugging Face datasets...")
try:
    # Each dataset has two parallel string columns: the model input and the
    # reference summary.
    train_data = {"input_text": train_inputs, "target_text": train_targets}
    raw_train_dataset = Dataset.from_dict(train_data)

    val_data = {"input_text": val_inputs, "target_text": val_targets}
    raw_val_dataset = Dataset.from_dict(val_data)

    print("Datasets created.")
except Exception as e:
    print(f"Error creating Hugging Face datasets: {e}")
    # Re-raise instead of exit(): exit() shuts down the notebook kernel and
    # hides the traceback, whereas re-raising stops execution at this cell.
    raise

Creating Hugging Face datasets...
Datasets created.


In [None]:
# --- Tokenization Function ---
def tokenize_batch(examples):
    """Tokenize a batch of input/target text pairs for T5.

    Args:
        examples: A batch mapping with "input_text" and "target_text" keys,
            each holding a list of strings (as passed by ``Dataset.map`` with
            ``batched=True``).

    Returns:
        The tokenizer output for the prefixed inputs, with the target token
        ids stored under "labels". Sequences are truncated to MAX_INPUT /
        MAX_OUTPUT tokens but deliberately NOT padded here.
    """
    # Add prefix for T5 summarization task - this often helps performance
    task_prefix = "summarize: "
    inputs_with_prefix = [task_prefix + text for text in examples["input_text"]]

    # No padding at this stage. Padding to max_length here would put pad
    # token ids into the labels, which the loss would then treat as real
    # targets, and would leave the data collator's padding="longest" with
    # nothing to do. The collator pads per batch and fills label padding
    # with -100 so it is ignored by the loss.
    model_inputs = tokenizer(
        inputs_with_prefix,
        max_length=MAX_INPUT,
        truncation=True,
    )
    # `text_target` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=MAX_OUTPUT,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# --- Apply Tokenization ---
print("Tokenizing datasets (this may take a while)...")
try:
    tokenized_train_dataset = raw_train_dataset.map(
        tokenize_batch,
        batched=True,
        remove_columns=["input_text", "target_text"], # Remove original text columns
        desc="Tokenizing Training Set" # Add description for progress bar
    )
    tokenized_val_dataset = raw_val_dataset.map(
        tokenize_batch,
        batched=True,
        remove_columns=["input_text", "target_text"],
        desc="Tokenizing Validation Set"
    )
    # if 'raw_test_dataset' in locals(): # Uncomment if using test set
    #     tokenized_test_dataset = raw_test_dataset.map(tokenize_batch, batched=True, remove_columns=["input_text", "target_text"], desc="Tokenizing Test Set")
    print("Tokenization complete.")
except Exception as e:
    print(f"Error during tokenization: {e}")
    # Re-raise instead of exit(): exit() shuts down the notebook kernel and
    # hides the traceback, whereas re-raising stops execution at this cell.
    raise

Tokenizing datasets (this may take a while)...


Tokenizing Training Set:   0%|          | 0/131 [00:00<?, ? examples/s]



Tokenizing Validation Set:   0%|          | 0/21 [00:00<?, ? examples/s]

Tokenization complete.


**Loading Metric**

In [None]:
print("Loading evaluation metrics...")
metrics_loaded_successfully = False

try:
    # Load the four metrics in order; a failure on any of them aborts the
    # whole group so the flag below stays False.
    bleu_metric, rouge_metric, meteor_metric, bertscore_metric = (
        evaluate.load(metric_name)
        for metric_name in ("bleu", "rouge", "meteor", "bertscore")  # "bleu" is standard BLEU, not sacrebleu
    )

    print("BLEU, ROUGE, METEOR, and BERTScore metrics loaded successfully.")
    metrics_loaded_successfully = True

except Exception as e:
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("!!! CRITICAL WARNING: Error loading evaluation metrics:", e)
    print("!!! Please check your internet connection and library installations:")
    print("!!! pip install --upgrade evaluate bert_score")
    print("!!! Manual evaluation at the end will NOT calculate all metrics.")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    # Leave every metric handle defined (as None) so later cells can test them.
    bleu_metric = rouge_metric = meteor_metric = bertscore_metric = None

Loading evaluation metrics...


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

BLEU, ROUGE, METEOR, and BERTScore metrics loaded successfully.


**Training Parameters**

In [None]:
# --- Training Arguments ---

# T5 checkpoints were pre-trained in bfloat16 and are prone to numeric overflow
# under fp16 mixed precision; the recorded run with fp16 enabled logged a
# training loss of 0.0, i.e. the model was not actually learning. Use bf16
# only on GPUs with native support (compute capability >= 8.0) and fall back
# to full precision elsewhere (e.g. on a T4).
# If full precision does not fit in GPU memory, lower
# per_device_train_batch_size and raise gradient_accumulation_steps to match.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8

training_args = TrainingArguments(
    output_dir="./results_t5large_manual_eval",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,        # Effective batch size = 2 * 4 = 8
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs_t5large_manual_eval',
    logging_strategy="steps",
    logging_steps=10,                     # The whole run is only ~160 optimizer steps; log often enough to see a curve
    save_strategy="epoch",                # Save model checkpoint every epoch
    save_total_limit=2,                   # Keep only the two most recent checkpoints
    report_to="wandb",
    fp16=False,                           # fp16 is numerically unstable for T5 (see note above)
    bf16=use_bf16,

)


**Data Collator**

In [None]:
# --- Data Collator ---

# Batches are padded to their longest member; label padding uses -100 (the
# collator's default, spelled out here) so padded positions are ignored by the loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest",
    label_pad_token_id=-100,
)

In [None]:


# TrainingArguments normalises `report_to` to a list (e.g. ["wandb"]), so it
# must be tested with `in`; comparing it to the string "wandb" is always False
# and would silently skip this whole guard.
if "wandb" in (training_args.report_to or []):
    try:
        import wandb
        wandb.login()
    except ImportError:
        print("Wandb not installed. Disabling Wandb logging.")
        # Drop only the wandb integration and keep the list type.
        training_args.report_to = [name for name in training_args.report_to if name != "wandb"]
    except Exception as e:
        print(f"Could not log in to Wandb: {e}. Disabling Wandb logging.")
        training_args.report_to = [name for name in training_args.report_to if name != "wandb"]

In [None]:
# --- Initialize Trainer ---
# A plain Trainer with a training set only: neither eval_dataset nor
# compute_metrics is supplied, so the Trainer performs no automated evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
)

  trainer = Trainer(


**Training Model**

In [None]:
# --- Train the Model ---
print("Starting training...")
try:
    train_result = trainer.train()
    print("Training finished.")
    # Log training metrics (like loss)
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
except Exception as e:
    print(f"An error occurred during training: {e}")
    # Optionally save state even if training failed partway through
    # trainer.save_state()
    # Re-raise instead of exit(): exit() shuts down the notebook kernel
    # (losing the partially trained model held in memory) and hides the
    # traceback, whereas re-raising stops execution at this cell.
    raise



Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,0.0


Training finished.
***** train metrics *****
  epoch                    =     9.4242
  total_flos               =  2490201GF
  train_loss               =        0.0
  train_runtime            = 0:10:22.24
  train_samples_per_second =      2.105
  train_steps_per_second   =      0.257


**Save Final Model**

In [None]:
# --- Save the Final Trained Model ---
final_model_path = os.path.join(training_args.output_dir, "final_model")
print(f"Saving final model to {final_model_path}...")
trainer.save_model(final_model_path)
# tokenizer.save_pretrained(final_model_path) # Already saved by save_model
print(f"Final model and tokenizer saved.")

# --- Manual Evaluation Loop ---
print("\nStarting manual evaluation on the validation set...")

# Switch the model to eval mode (disables dropout). The trailing semicolon
# stops the notebook from echoing the full module repr as cell output.
model.eval();

Saving final model to ./results_t5large_manual_eval/final_model...
Final model and tokenizer saved.

Starting manual evaluation on the validation set...


T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

**Adjust Batch Size**

In [None]:
# Create DataLoader for validation set
# <<< Adjust eval_batch_size based on GPU memory available for generation >>>
eval_batch_size = 4 # Start with 4, decrease if you get Out-of-Memory errors
try:
    val_dataloader = DataLoader(
        tokenized_val_dataset,
        batch_size=eval_batch_size,
        collate_fn=data_collator # Use the same collator used for training
    )
except Exception as e:
    print(f"Error creating validation DataLoader: {e}")
    # Re-raise instead of exit(): exit() shuts down the notebook kernel and
    # hides the traceback, whereas re-raising stops execution at this cell.
    raise

**Manual Checking**

In [None]:
# Decoded model outputs and decoded reference summaries, kept index-aligned.
all_preds = []
all_labels = []

print(f"Running generation on validation set with batch size {eval_batch_size}...")
with torch.no_grad():
    for batch in tqdm(val_dataloader, desc="Manual Evaluation"):
        # Move every tensor of the collated batch onto the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        try:
            # Beam-search generation, capped slightly above the target length.
            output_ids = model.generate(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=MAX_OUTPUT + 10,
                num_beams=4,
                early_stopping=True,
            )
            pred_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

            # -100 marks ignored label positions; map it back to the pad id
            # so the tokenizer can decode the references.
            label_ids = batch['labels'].cpu().numpy()
            label_ids = np.where(label_ids == -100, tokenizer.pad_token_id, label_ids)
            label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

            # Accumulate whitespace-trimmed texts for the metric computation.
            all_preds.extend(text.strip() for text in pred_texts)
            all_labels.extend(text.strip() for text in label_texts)

        except RuntimeError as e:
            if "out of memory" not in str(e).lower():
                # Any other runtime failure: report it and move on to the next batch.
                print(f"\nA runtime error occurred during generation: {e}")
                continue
            # Out of memory: later batches would fail the same way, so stop.
            print("\nCUDA out of memory during generation!")
            print(f"Try reducing `eval_batch_size` (currently {eval_batch_size}).")
            break
        except Exception as e:
            print(f"\nAn unexpected error occurred during generation batch: {e}")
            continue


Running generation on validation set with batch size 4...


Manual Evaluation: 100%|██████████| 6/6 [00:26<00:00,  4.43s/it]


**Metric Evaluation**

In [None]:
# --- Compute Metrics Manually ---
def _print_sample_summaries():
    """Print the first few reference/generated summary pairs for qualitative review."""
    print("\nSample Generated Summaries (up to 5):")
    sample_count = min(5, len(all_preds))
    for idx in range(sample_count):
        number = idx + 1
        print(f"  Reference Summary {number}: {all_labels[idx]}")
        print(f"  Generated Summary {number}: {all_preds[idx]}")
        print("-" * 20)


print("\n--- Manual Evaluation Results ---")
if not (all_preds and all_labels):
    print("No predictions or labels were generated/collected during manual evaluation.")
elif not metrics_loaded_successfully:
    print("Metrics could not be loaded earlier. Skipping metric calculation.")
    # Without metrics, the sample outputs are the only feedback available.
    _print_sample_summaries()
else:
    print("Calculating final metrics...")
    # BLEU and METEOR are given one reference list per prediction; ROUGE and
    # BERTScore are given the flat list of reference strings.
    labels_nested = [[reference] for reference in all_labels]
    labels_flat = all_labels

    # Each metric is computed in isolation so one failure does not hide the rest.
    try:
        bleu_result = bleu_metric.compute(predictions=all_preds, references=labels_nested)
        # The result key is "bleu" or "score" depending on the metric implementation.
        bleu_value = bleu_result['bleu'] if 'bleu' in bleu_result else bleu_result['score']
        print(f"  Manual BLEU Score        : {bleu_value:.4f}")
    except Exception as e:
        print(f"  Could not compute BLEU: {e}")

    try:
        rouge_result = rouge_metric.compute(predictions=all_preds, references=labels_flat)
        rouge_l = rouge_result['rougeL']
        print(f"  Manual ROUGE-L Score     : {rouge_l:.4f}")
    except Exception as e:
        print(f"  Could not compute ROUGE: {e}")

    try:
        meteor_result = meteor_metric.compute(predictions=all_preds, references=labels_nested)
        meteor_value = meteor_result['meteor']
        print(f"  Manual METEOR Score      : {meteor_value:.4f}")
    except Exception as e:
        print(f"  Could not compute METEOR: {e}")

    try:
        bert_kwargs = dict(
            predictions=all_preds,
            references=labels_flat,
            lang="en",
            model_type="microsoft/deberta-xlarge-mnli",
            device=device,
        )
        bert_result = bertscore_metric.compute(**bert_kwargs)
        # BERTScore returns one F1 per example; report the mean.
        avg_bert_f1 = np.mean(bert_result['f1'])
        print(f"  Manual BERTScore (Avg F1): {avg_bert_f1:.4f}")
    except Exception as e:
        print(f"  Could not compute BERTScore: {e}")

    # Qualitative review
    _print_sample_summaries()

# --- Final Message ---
print("Script finished.")



--- Manual Evaluation Results ---
Calculating final metrics...
  Manual BLEU Score        : 0.0000
  Manual ROUGE-L Score     : 0.1072
  Manual METEOR Score      : 0.1202


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

  Manual BERTScore (Avg F1): 0.5286

Sample Generated Summaries (up to 5):
  Reference Summary 1: patient feels anxious everytime go work . work insurance company admin assistant . patient start feel anxious drive job prefer interacting coworkers isolate office room . patient feels anxious big crowds like meetings presentations urge leave . patient feels stomach pains blush nervous . patient feels anxious anticipation planned meetings much time worry unplanned meetings . either ways patient put spot present long discussion feel anxious . patient feel anything alone office . therapist assures set counselor .
  Generated Summary 1: patient: started new jobs even new academic environments would feel somewhat anxious . therapist: distressing ? must level distress disruptive functioning .
--------------------
  Reference Summary 2: patient irritated every little thing week . irritating events bother patient racing thoughts distract work . sleep fine thoughts subside soon . patient feel depr

**Test Evaluation**

In [None]:
# --- Imports ---
import json
import torch
import evaluate
import numpy as np
from tqdm import tqdm
from datasets import Dataset
from torch.utils.data import DataLoader
import os
import time

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq
)

print("Libraries imported successfully for testing.")

# --- Configuration ---
# <<< IMPORTANT: Set these paths correctly >>>
MODEL_PATH = "./results_t5large_manual_eval/final_model/"
TEST_JSON_PATH = "test_preprocess_v2.json"

# --- Constants---
MAX_INPUT = 512          # Max tokens kept from each input text
MAX_OUTPUT = 150         # Max tokens kept from each reference summary
EVAL_BATCH_SIZE = 4
TASK_PREFIX = "summarize: "

print(f"Model Path: {MODEL_PATH}")
print(f"Test Data Path: {TEST_JSON_PATH}")
print(f"Max Input/Output Lengths: {MAX_INPUT}/{MAX_OUTPUT}")
print(f"Evaluation Batch Size: {EVAL_BATCH_SIZE}")

# --- Setup Device ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Fail fast with an exception when a required path is missing. exit() is
# avoided because inside a Jupyter/Colab kernel it shuts the kernel down.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model path not found at '{MODEL_PATH}'. Please check the path.")
if not os.path.exists(TEST_JSON_PATH):
    raise FileNotFoundError(f"Test data file not found at '{TEST_JSON_PATH}'. Please check the path.")

# --- Load Fine-Tuned Model and Tokenizer ---
print(f"Loading fine-tuned model and tokenizer from {MODEL_PATH}...")
try:
    tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH, legacy=False)
    model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)
    model.eval() # Set model to evaluation mode immediately
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer from {MODEL_PATH}: {e}")
    # Re-raise so the traceback is shown and execution stops at this cell.
    raise

# --- Data Loading Function ---
def load_data(json_path):
    """Read (input, summary) string pairs for the test set from a JSON file.

    The file is expected to hold a list of objects whose "input_text" value is
    a two-element list: element 0 is used as the model input and element 1 as
    the reference summary. Entries that do not match are counted and skipped.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        A tuple ``(inputs, summaries)`` of equally long lists of ``str``.
        Both lists are empty when the file is missing, is not valid JSON,
        does not contain a list, or any other error occurs; in each of those
        cases a message is printed instead of raising.
    """
    if not os.path.exists(json_path):
        print(f"Error: File not found at {json_path}")
        return [], []
    try:
        with open(json_path, 'r', encoding='utf-8') as source:
            records = json.load(source)
        if not isinstance(records, list):
            print(f"Warning: JSON file at {json_path} should contain a list of objects.")
            return [], []

        inputs, summaries = [], []
        malformed_count = 0
        for record in records:
            payload = record.get("input_text") if isinstance(record, dict) and "input_text" in record else None
            # Only a two-element list qualifies as an (input, summary) pair.
            if not isinstance(payload, list) or len(payload) != 2:
                malformed_count += 1
                continue
            source_text, summary_text = payload
            inputs.append(str(source_text))
            summaries.append(str(summary_text))

        if malformed_count > 0:
            print(f"Warning: Filtered out {malformed_count} items from {json_path} due to missing/incorrect 'input_text' format.")
        print(f"Loaded {len(inputs)} test examples from {json_path}")
        return inputs, summaries
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {json_path}. Check file integrity.")
        return [], []
    except Exception as e:
        print(f"An unexpected error occurred loading data from {json_path}: {e}")
        return [], []

# --- Load Test Data ---
print("Loading test data...")
test_inputs, test_targets = load_data(TEST_JSON_PATH)
if not test_inputs:
    # Raise instead of exit(): inside a Jupyter/Colab kernel exit() shuts the
    # kernel down, whereas an exception stops execution with a visible message.
    raise RuntimeError("No test data loaded.")

# --- Create Test Dataset ---
try:
    # Two parallel string columns: the model input and the reference summary.
    test_data = {"input_text": test_inputs, "target_text": test_targets}
    raw_test_dataset = Dataset.from_dict(test_data)
    print("Test dataset created.")
except Exception as e:
    print(f"Error creating test dataset: {e}")
    # Re-raise so the traceback is shown and execution stops at this cell.
    raise

# --- Tokenization Function (Copied from training script) ---
def tokenize_batch(examples):
    """Tokenize a batch of input/target text pairs for T5 evaluation.

    Args:
        examples: A batch mapping with "input_text" and "target_text" keys,
            each holding a list of strings (as passed by ``Dataset.map`` with
            ``batched=True``).

    Returns:
        The tokenizer output for the prefixed inputs, with the target token
        ids stored under "labels". Sequences are truncated to MAX_INPUT /
        MAX_OUTPUT tokens but deliberately NOT padded here.
    """
    inputs_with_prefix = [TASK_PREFIX + text for text in examples["input_text"]]

    # No padding at this stage: padding everything to max_length would make
    # the collator's padding="longest" a no-op and force generation to run
    # over full-length padded inputs. The collator pads each batch instead.
    model_inputs = tokenizer(
        inputs_with_prefix,
        max_length=MAX_INPUT,
        truncation=True,
    )
    # `text_target` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=MAX_OUTPUT,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# --- Apply Tokenization to Test Set ---
print("Tokenizing test dataset...")
try:
    tokenized_test_dataset = raw_test_dataset.map(
        tokenize_batch,
        batched=True,
        remove_columns=["input_text", "target_text"],
        desc="Tokenizing Test Set"
    )
    print("Test data tokenization complete.")
except Exception as e:
    print(f"Error during test data tokenization: {e}")
    # Re-raise instead of exit(): exit() shuts down the notebook kernel and
    # hides the traceback, whereas re-raising stops execution at this cell.
    raise

# --- Load Metrics ---
# NOTE(review): this cell scores with "sacrebleu" (read via result['score']),
# while the validation cells load "bleu" (read via result['bleu']). The two
# appear to use different scales and tokenization, so validation and test
# BLEU numbers should not be compared directly - confirm before reporting.
print("Loading evaluation metrics...")
metrics_loaded_successfully = False
try:
    bleu_metric = evaluate.load("sacrebleu")
    bertscore_metric = evaluate.load("bertscore")
    print("BLEU and BERTScore metrics loaded successfully.")
    metrics_loaded_successfully = True
except Exception as e:
    # Deliberately best-effort: generation still runs without metrics.
    print(f"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print(f"!!! WARNING: Error loading evaluation metrics: {e}")
    print(f"!!! Evaluation will proceed, but BLEU/BERTScore will NOT be calculated.")
    print(f"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    bleu_metric = None
    bertscore_metric = None

# --- Data Collator ---
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest" # Dynamic padding
)

# --- Evaluation Loop ---
print("\nStarting evaluation on the test set...")

# Create DataLoader for the test set
try:
    test_dataloader = DataLoader(
        tokenized_test_dataset,
        batch_size=EVAL_BATCH_SIZE,
        collate_fn=data_collator
    )
except Exception as e:
    print(f"Error creating test DataLoader: {e}")
    # Re-raise so the traceback is shown and execution stops at this cell.
    raise

# Decoded model outputs and decoded reference summaries, kept index-aligned.
all_preds = []
all_labels = []
start_time = time.time()

print(f"Running generation on test set with batch size {EVAL_BATCH_SIZE}...")
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating Test Set"):
        # Move every tensor of the collated batch onto the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        try:
            # Generate predictions (beam search, capped slightly above the target length)
            generated_ids = model.generate(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=MAX_OUTPUT + 10,
                num_beams=4,
                early_stopping=True,
            )
            # Decode predictions
            decoded_preds_batch = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            # Decode labels: -100 marks ignored positions, map it back to the pad id first
            labels_batch = batch['labels'].cpu().numpy()
            labels_batch = np.where(labels_batch != -100, labels_batch, tokenizer.pad_token_id)
            decoded_labels_batch = tokenizer.batch_decode(labels_batch, skip_special_tokens=True)
            # Store
            all_preds.extend([pred.strip() for pred in decoded_preds_batch])
            all_labels.extend([label.strip() for label in decoded_labels_batch])
        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                print(f"\nCUDA out of memory during generation on test set!")
                print(f"Try reducing EVAL_BATCH_SIZE (currently {EVAL_BATCH_SIZE}).")
                # Re-raise instead of exit(): exit() shuts down the notebook
                # kernel, discarding the loaded model and results so far.
                raise
            else:
                print(f"\nA runtime error occurred during generation: {e}")
                continue
        except Exception as e:
            print(f"\nAn unexpected error occurred during generation batch: {e}")
            continue

end_time = time.time()
evaluation_time = end_time - start_time
print(f"\nGeneration completed in {evaluation_time:.2f} seconds.")

# --- Compute and Display Metrics ---
print("\n--- Test Set Evaluation Results ---")
if not (all_preds and all_labels):
     print("No predictions or labels were generated/collected during evaluation.")
elif not metrics_loaded_successfully:
     print("Metrics (BLEU/BERTScore) could not be loaded. Skipping metric calculation.")
else:
    print("Calculating final metrics for the test set...")
    # BLEU is given one reference list per prediction; BERTScore is given the
    # flat list of reference strings.
    labels_bleu = [[reference] for reference in all_labels]
    labels_bert = all_labels

    # Each metric is computed in isolation so one failure does not hide the other.
    try:
        bleu_result = bleu_metric.compute(predictions=all_preds, references=labels_bleu)
        bleu_score = bleu_result['score']
        print(f"  Test BLEU Score        : {bleu_score:.4f}")
    except Exception as e:
        print(f"  Could not compute test BLEU: {e}")

    try:
        bert_kwargs = dict(
            predictions=all_preds,
            references=labels_bert,
            lang="en",
            model_type="microsoft/deberta-xlarge-mnli",
            device=device,
        )
        bert_result = bertscore_metric.compute(**bert_kwargs)
        # BERTScore returns one F1 per example; report the mean.
        avg_bert_f1 = np.mean(bert_result['f1'])
        print(f"  Test BERTScore (Avg F1): {avg_bert_f1:.4f}")
    except Exception as e:
        print(f"  Could not compute test BERTScore: {e}")

# --- Print Sample Outputs from Test Set ---
# Shown regardless of whether the metrics above could be computed.
print("\nSample Generated Summaries from Test Set (up to 5):")
sample_count = min(5, len(all_preds))
for idx in range(sample_count):
    print(f"--- Example {idx+1} ---")
    print(f"  Reference Summary: {all_labels[idx]}")
    print(f"  Generated Summary: {all_preds[idx]}")
    print("-" * 20)

print("\nTest script finished.")

Libraries imported successfully for testing.
Model Path: ./results_t5large_manual_eval/final_model/
Test Data Path: test_preprocess_v2.json
Max Input/Output Lengths: 512/150
Evaluation Batch Size: 4
Using device: cuda
Loading fine-tuned model and tokenizer from ./results_t5large_manual_eval/final_model/...
Model and tokenizer loaded successfully.
Loading test data...
Loaded 39 test examples from test_preprocess_v2.json
Test dataset created.
Tokenizing test dataset...


Tokenizing Test Set:   0%|          | 0/39 [00:00<?, ? examples/s]



Test data tokenization complete.
Loading evaluation metrics...


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

BLEU and BERTScore metrics loaded successfully.

Starting evaluation on the test set...
Running generation on test set with batch size 4...


Evaluating Test Set: 100%|██████████| 10/10 [00:42<00:00,  4.30s/it]



Generation completed in 42.96 seconds.

--- Test Set Evaluation Results ---
Calculating final metrics for the test set...
  Test BLEU Score        : 1.6931
  Test BERTScore (Avg F1): 0.5086

Sample Generated Summaries from Test Set (up to 5):
--- Example 1 ---
  Reference Summary: anxiety
  Generated Summary: patient: recently feeling lot . really like snap . like sometimes happened like get back track . recently found really difficult . therapist: low mood past kind able pull recently found harder .
--------------------
--- Example 2 ---
  Reference Summary: patient star athelete therapist counseling alcohol habits . patient share experience drinking affecting life replacing studies felt terrible disappointing mother broke trust . patient feels lost focus schoolwork keep discussing drinking friends . patient says reserved person drinking helps relax . patient also switched friends hang people drink . patient pretty confident cut drinking wish build trust back mother . therapist discu