<a href="https://colab.research.google.com/github/anniepates/CS6120CustomerServiceChatbot/blob/main/flan_t5_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%bash
pip install nltk
pip install datasets
pip install transformers[torch]
pip install tokenizers
pip install evaluate
pip install rouge_score
pip install sentencepiece
pip install huggingface_hub

#pip list



In [2]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from datasets import Dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [3]:
# Mount drive to save model weights
# from google.colab import drive  # This requires Google drive connection, but can be skipped if you don't want to save model weights
# drive.mount('/content/drive')

In [4]:
# Import FLAN-T5 model
model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained('./drive/MyDrive/flan-t5-trained')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Preprocess data to tell model it needs to answer our question
my_question = "I am having the same issue. It did not link to my current account. Waste of money!"
inputs = "Answer this: " + my_question
inputs = tokenizer(inputs, return_tensors="pt")
#inputs.to(device='cuda')
outputs = model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
from textwrap import fill

# Notice that untrained model gives very generic & unhelpful answer
print(fill(answer, width=100))



<pad> Let's troubleshoot this issue together. Please reach out for assistance.</s>


In [6]:
MODEL_NAME = model_name

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
"""
Here is where we provide our finetuning data to take our model from a general
chatbot to a customer service chatbot.
"""
myfile = open('QA300.txt', 'r', encoding="utf-8")
lines = myfile.readlines()

questions = []
answers = []

for line in lines:
  if not line.strip():
    continue
  if line.startswith("Q:"):
    questions.append(line.strip()[2:])
  elif line.startswith("A:"):
    answers.append(line.strip()[2:])

data_dict = {
    "id": [i for i in range(len(questions))],
    "question": questions,
    "answer": answers
}
mydataset = Dataset.from_dict(data_dict)

In [8]:
QA = mydataset.train_test_split(test_size=0.3)
QA

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer'],
        num_rows: 241
    })
    test: Dataset({
        features: ['id', 'question', 'answer'],
        num_rows: 104
    })
})

In [9]:
# We prefix our tasks with "answer the question"
prefix = "Answer this: "

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["question"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)

   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["answer"],
                      max_length=512,
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

In [10]:
# Map the preprocessing function across our dataset
tokenized_dataset = QA.map(preprocess_function, batched=True)

Map:   0%|          | 0/241 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

In [11]:
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [12]:
def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

   return result

In [13]:
# Global Parameters
L_RATE = 1e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 10

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

In [14]:
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,1.584054,0.215861,0.09318,0.205582,0.213391
2,No log,1.501069,0.266597,0.109286,0.234985,0.265708
3,No log,1.473422,0.277989,0.117674,0.248932,0.271683
4,No log,1.546636,0.303928,0.122253,0.269162,0.295288
5,No log,1.589854,0.305035,0.116318,0.267013,0.294674
6,No log,1.704264,0.281733,0.099043,0.241906,0.272865
7,No log,1.796022,0.300185,0.11956,0.265055,0.294225
8,No log,1.853296,0.305474,0.123284,0.270261,0.299842
9,No log,1.866395,0.308958,0.129387,0.272979,0.30212
10,No log,1.871922,0.301931,0.123729,0.264363,0.293161


TrainOutput(global_step=310, training_loss=0.8547750165385585, metrics={'train_runtime': 3534.6176, 'train_samples_per_second': 0.682, 'train_steps_per_second': 0.088, 'total_flos': 299034638622720.0, 'train_loss': 0.8547750165385585, 'epoch': 10.0})

In [15]:
# Save weights
# model.save_pretrained("./drive/MyDrive/flan-t5-large-300-trained", from_pt=True)

In [19]:
# Test review/sentiment question
my_question = "Software wouldn't work on my pc amazon actually refunded me. Thank goodness."
inputs = "Answer this: " + my_question

In [20]:
# TEST
# Make sure to run with above block
inputs = tokenizer(inputs, return_tensors="pt")
# inputs.to(device='cuda')
outputs = model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
from textwrap import fill

print(fill(answer, width=100))

<pad> Glad to hear Amazon could resolve it. Sorry for the inconvenience.</s>


In [30]:
print(model.config.to_json_string())

{
  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.40.0",
  "use_cache": true,
  "vocab_size": 32128
}



In [21]:
# TEST
import psutil
import time

#memory_usage = psutil.virtual_memory()
psutil.virtual_memory()
#print(f"Memory Usage: {memory_usage.percent}%")


svmem(total=54754058240, available=38217469952, percent=30.2, used=15886278656, free=30053842944, active=5501276160, inactive=18596040704, buffers=404897792, cached=8409038848, shared=6496256, slab=332136448)