https://github.com/Ryota-Kawamura/Generative-AI-with-LLMs/blob/main/Week-2/Lab_2_fine_tune_generative_ai_model.ipynb

In [None]:
# Upgrade pip itself before installing the lab's dependencies.
%pip install --upgrade pip
# Install pinned versions of PyTorch and torchdata.
# NOTE(review): the import cell further down records a RuntimeError about an
# undefined symbol in `_XLAC` (torch_xla); presumably the preinstalled torch_xla
# does not match torch==1.13.1 — confirm against the runtime image.
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

# Install pinned versions of the Hugging Face libraries, ROUGE scoring and LoRA/PEFT.
%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

# Upgrade datasets, huggingface_hub and fsspec (-U, no version pins).
# NOTE(review): `datasets` is unpinned here, so this can supersede the
# datasets==2.11.0 pin requested above — confirm this is intended.
%pip install -U datasets huggingface_hub fsspec

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.3.1
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m690.4 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m144.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

## HF_TOKEN : authentication is recommended but still optional to access public models or datasets.

RuntimeError: Failed to import transformers.training_args because of the following error (look up to see its traceback):
/usr/local/lib/python3.10/dist-packages/_XLAC.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZNK5torch4lazy17LazyGraphExecutor16ShouldSyncTensorERKN3c1013intrusive_ptrINS0_10LazyTensorENS2_6detail34intrusive_target_default_null_typeIS4_EEEE

In [None]:
# Hugging Face Hub identifier of the dialogue-summarization dataset used by this notebook.
huggingface_dataset_name = "knkarthick/dialogsum"
# Load the dataset by name; later cells index its 'train', 'validation' and 'test' splits.
dataset = load_dataset(huggingface_dataset_name)
# Bare expression so the notebook displays the loaded object.
dataset

In [None]:
# Load the pre-trained FLAN-T5 model and its tokenizer directly from Hugging Face.
# The small FLAN-T5 checkpoint ('google/flan-t5-small') is used here.
# Passing torch_dtype=torch.bfloat16 asks from_pretrained to load the model weights
# in the bfloat16 data type.

model_name='google/flan-t5-small'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Despite its name, this function does not print anything; it returns the
    report as a string so the caller can print or display it.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides ``numel()``
            and a ``requires_grad`` attribute.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable percentage formatted to two
        decimals. A model without parameters reports ``0.00%`` instead of
        raising ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # Guard the division: a parameter-free model has no meaningful ratio.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {trainable_percentage:.2f}%"

# The helper returns the report as a string, so print it explicitly.
print(print_number_of_trainable_model_parameters(original_model))

In [None]:
# Show the dialogue of test example 1, a blank line, then its reference summary.
display(dataset['test'][1]['dialogue'])
print()
display(dataset['test'][1]['summary'])

<mark>Output of the pre-trained model</mark>

In [None]:
# Pick one test example and keep its dialogue and human-written summary.
index = 200
dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

# Zero-shot prompt: instruction, the dialogue, then a "Summary:" cue.
# The triple-quoted f-string starts and ends with a newline.
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

# Tokenize the prompt into PyTorch tensors (return_tensors='pt').
inputs = tokenizer(prompt, return_tensors='pt')

# Inspect what the tokenizer returned: its keys, the input_ids shape, and the full object.
print(inputs.keys())
print(inputs['input_ids'].shape)
display(inputs)

In [None]:
# Generate up to 200 new tokens for the prompt; [0] keeps the first entry of
# the generate() result.
model_output = original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0]
# Turn the generated token ids back into text, dropping special tokens.
output = tokenizer.decode(
    model_output,
    skip_special_tokens=True
)

# Separator line: joining 100 empty strings with '-' yields 99 dashes.
dash_line = '-'.join('' for x in range(100))
# Print the prompt, the human reference summary and the model's zero-shot summary.
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

<mark>2 - Perform Full Fine-Tuning</mark>

We need to convert the dialogue-summary (prompt-response) pairs into explicit instructions for the LLM. Prepend the instruction `Summarize the following conversation.` to the start of each dialogue, and end the prompt with `Summary: ` so that the summary follows it as the expected response.

Then preprocess the prompt-response dataset into tokens and pull out their input_ids (1 per token).

In [None]:
def tokenize_function(example):
    """Tokenize a batch of dialogue/summary pairs for seq2seq fine-tuning.

    Each dialogue is wrapped in the instruction prompt
    ``'Summarize the following conversation.\\n\\n' + dialogue + '\\n\\nSummary: '``
    and tokenized into ``input_ids``; each summary is tokenized into ``labels``.
    Both are padded to the tokenizer's maximum length and truncated.

    Padding positions in ``labels`` are replaced by ``-100`` so that the
    training loss ignores them instead of learning to predict padding.

    Uses the notebook-level ``tokenizer``.

    Args:
        example: A batch mapping with ``"dialogue"`` and ``"summary"`` keys,
            each holding a list of strings (as passed by ``map(batched=True)``).

    Returns:
        The same mapping, mutated in place, with ``'input_ids'`` and
        ``'labels'`` tensors added.
    """
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids

    labels = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is not None:
        # -100 is the label value ignored by the loss; masked_fill returns a
        # new tensor, so the tokenizer's own output is left untouched.
        labels = labels.masked_fill(labels == pad_token_id, -100)
    example['labels'] = labels

    return example

# The dataset contains 3 different splits: train, validation, test.
# With batched=True, tokenize_function receives batches of examples and is
# applied to every split.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Drop the raw columns that are no longer needed once the token ids exist.
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

In [None]:
# To save some time in the lab, subsample the dataset: keep only the examples
# whose index is a multiple of 100.
# NOTE: this reassigns `tokenized_datasets`, so re-running this cell alone
# subsamples the already-subsampled data again; re-run the tokenization cell first.

tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

In [None]:
# Report the shape of each subsampled split.
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

# Print the full tokenized dataset object as well.
print(tokenized_datasets)

In [None]:
# Use the built-in Hugging Face Trainer class: pass it the preprocessed dataset
# and a reference to the original model.
# NOTE: the Trainer receives `original_model` itself, not a copy.

# Timestamped output directory so each run writes to its own folder.
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=50, # If set to a positive number, the total number of training steps to perform. Overrides num_train_epochs (so the value 10 above is not used).
    report_to='none'  # Disable reporting to wandb while training
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [None]:
# Run fine-tuning with the arguments configured on the trainer.
trainer.train()

In [None]:
## Explicitly save the trained model to the output directory

trainer.model.save_pretrained(output_dir)
# The tokenizer is not saved alongside the model (the call below is disabled):
# trainer.tokenizer.save_pretrained(output_dir)

In [None]:
## Training a fully fine-tuned version of the model would take a few hours on a GPU.
## NOTE: no checkpoint is downloaded here; the model is loaded from `output_dir`,
## the local directory this notebook wrote with save_pretrained.
## Create an instance of the AutoModelForSeq2SeqLM class for the instruct model:

from transformers import AutoModelForSeq2SeqLM
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)

In [None]:
import torch

def move_all_tensors_to_cpu():
    """Rebind every module-level CUDA tensor to a CPU copy.

    ``Tensor.cpu()`` does not move a tensor in place; it returns a new tensor.
    Each global name that refers to a CUDA tensor is therefore reassigned to
    the CPU copy, otherwise the call would have no effect.

    Only names that refer directly to a ``torch.Tensor`` are handled. Tensors
    held inside other objects (models, tokenizer outputs, lists, dicts) are
    not touched.

    Returns:
        None.
    """
    namespace = globals()
    # Snapshot the items so the namespace is not iterated while being reassigned.
    for name, obj in list(namespace.items()):
        if isinstance(obj, torch.Tensor) and obj.device.type == 'cuda':
            namespace[name] = obj.cpu()  # Rebind the name to the CPU copy

# Call the function to move all CUDA tensors to CPU
move_all_tensors_to_cpu()

In [None]:
# Evaluate the Model Qualitatively (Human Evaluation)

# Pick one test example and keep its dialogue and human-written summary.
index = 200
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

# Same instruction prompt as in the zero-shot cell.
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors="pt")

# The tokenizer returns CPU tensors, while the Trainer may have moved
# `original_model` to another device; send the ids to the model's device so
# generate() does not fail with a device mismatch.
# NOTE(review): `original_model` was the model handed to the Trainer, so this
# presumably reflects its state after training, not the untouched checkpoint.
original_model_outputs = original_model.generate(
        inputs["input_ids"].to(original_model.device),
        max_new_tokens=200,
    )[0]
# Decode the ids generated just above (not the stale `model_output` left over
# from the earlier zero-shot cell).
original_model_text_output = tokenizer.decode(
    original_model_outputs,
    skip_special_tokens=True
)

# instruct_model_outputs = instruct_model.generate(
#     inputs["input_ids"],
#     max_new_tokens=200,
#     )[0]
# instruct_model_text_output = tokenizer.decode(
#     instruct_model_outputs,
#     skip_special_tokens=True
#     )

# print(dash_line)
# print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
# print(dash_line)
# print(f'ORIGINAL MODEL:\n{original_model_text_output}')
# print(dash_line)
# print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')

In [None]:
import torch

def get_tensor_names_on_devices():
    """Group the names of module-level tensors by the device they live on.

    Scans the module's global namespace and records the name of every
    ``torch.Tensor`` whose device type is ``'cpu'`` or ``'cuda'``. Tensors on
    any other device type, and non-tensor objects, are ignored.

    Returns:
        dict: ``{'cpu': [...], 'cuda': [...]}`` with the variable names found
        for each device type, in namespace iteration order.
    """
    names_by_device = {'cpu': [], 'cuda': []}

    for var_name, value in globals().items():
        if not isinstance(value, torch.Tensor):
            continue
        # Only the two tracked device types have a bucket; others yield None.
        bucket = names_by_device.get(value.device.type)
        if bucket is not None:
            bucket.append(var_name)

    return names_by_device

# Collect the names of the notebook-level tensors, grouped by device type.
tensor_names_on_devices = get_tensor_names_on_devices()

# Print the two lists (CPU first, then CUDA).
print("Tensor names on CPU:", tensor_names_on_devices['cpu'])
print("Tensor names on CUDA:", tensor_names_on_devices['cuda'])