In [1]:
# # Open a new terminal
# # Create a new conda/virtual environment using python 3.10.13 and install all the required packages given below:

# !pip install \
#     datasets==2.15.0 \
#     transformers==4.31.0 \
#     evaluate==0.4.1 \
#     torch==2.1.1 \
#     rouge_score==0.1.2 \
#     accelerate==0.25.0 \
#     sentencepiece==0.1.99 \
#     bitsandbytes==0.41.3.post1 --quiet

In [2]:
from datasets import load_dataset
from pprint import pprint
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM

import evaluate
from tqdm.auto import tqdm
import torch

  from .autonotebook import tqdm as notebook_tqdm


# loading data

In [3]:
dataset_name = 'lamini/lamini_docs'
dataset = load_dataset(dataset_name)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})

# loading model and tokenizer

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [6]:
model_name = 'google/flan-t5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name, legacy=False)
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", device_map="auto")#, load_in_8bit=True)

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['decoder.embed_tokens.weight', 'encoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# input tokenization

In [7]:
def determine_tokens(tokenizer):
    def tokenization(data):
        prompt = """
        Answer the given question and provide detail explanation. This is important for my career. \
        Take your time, think and answer.

        Question:
        {question}

        Answer: """
    
        prompt = [prompt.format(question=question) for question in data['question']]
        tokenized_inputs = tokenizer(
            prompt,
            return_tensors='pt',
            truncation=True,
            padding=True,
        )['input_ids'].to(device)
        
        tokenized_labels = tokenizer(
            data['answer'],
            return_tensors='pt',
            truncation=True,
            padding=True,
        )['input_ids'].to(device)
        
        data['input_ids'] = tokenized_inputs
        data['labels'] = tokenized_labels

        return data
    return tokenization

In [8]:
dataset = dataset.remove_columns(['labels'])
tokenized_dataset = dataset.map(
    determine_tokens(tokenizer),
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

In [9]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})

# train test split 

In [10]:
train_dataset = tokenized_dataset['train'].train_test_split(test_size=0.1, shuffle=True, seed=12345)
train_dataset.shape

{'train': (1134, 5), 'test': (126, 5)}

In [11]:
test_dataset = tokenized_dataset['test']
test_dataset.shape

(140, 5)

# model training

In [12]:
train_dataset = train_dataset.remove_columns(['question', 'answer', 'attention_mask'])

In [13]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1134
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 126
    })
})

In [14]:
from transformers import TrainingArguments, Trainer
import time

model_dir = "../model/"
!mkdir -p {model_dir}

In [15]:
output_dir = f'{model_dir}/training-{str(int(time.time()))}'
max_steps = -1

training_args = TrainingArguments(
    
    # directory to save model checkpoint
    output_dir=output_dir,
    
    # learning rate
    learning_rate=1e-5,
    
    # number of training epochs
    num_train_epochs=2,
    
    # max steps to train for (each step is a batch of data)
    # overrides num_train_epochs, if not -1
    max_steps=max_steps,
    
    # batch size for training
    per_device_train_batch_size=1,
#     auto_find_batch_size=True,

#     weight_decay=0.01,
    
    # other arguments
    overwrite_output_dir=False, # Overwrite the content of the output directory
    disable_tqdm=False, # Disable progress bars
    eval_steps=120, # Number of update steps between two evaluations
    save_steps=120, # After # steps model is saved
    warmup_steps=1, # Number of warmup steps for learning rate scheduler
    per_device_eval_batch_size=1, # Batch size for evaluation
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=1,
    optim="adafactor",
    gradient_accumulation_steps = 4,
    gradient_checkpointing=False,

    # parameters for early stopping
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset['train'],
    eval_dataset=train_dataset['test']
)

In [16]:
%%time

trainer.train()

Step,Training Loss,Validation Loss
120,2.6346,2.254917
240,2.6412,2.168026
360,2.1092,2.111719
480,2.5278,2.086913


CPU times: user 7min 50s, sys: 5.57 s, total: 7min 55s
Wall time: 7min 53s


TrainOutput(global_step=566, training_loss=2.45397535608851, metrics={'train_runtime': 473.3441, 'train_samples_per_second': 4.791, 'train_steps_per_second': 1.196, 'total_flos': 145744985318400.0, 'train_loss': 2.45397535608851, 'epoch': 2.0})

In [17]:
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: ../model//training-1703085813/final


# model evaluation

In [18]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    prompt_str = """
    Answer the given question and provide detail explanation. This is important for my career. \
    Take your time, think and answer.

    Question:
    {question}

    Answer: """
    
    # Tokenize
    device = model.device
    
    prompt = [prompt_str.format(question=question) for question in text]
    input_ids = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )

    # Generate
    generated_tokens_with_prompt = model.generate(
        **input_ids.to(device),
        max_length=max_output_tokens
    )

    # Decode
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

    return generated_text_with_prompt

In [19]:
def model_evaluation(model, tokenizer, data):
    # model prediction
    predictions = inference(data['question'], model, tokenizer)
    
    # evaluation
    rouge = evaluate.load('rouge')
    rouge_score = rouge.compute(
        predictions = predictions,
        references= data['answer'],
        use_aggregator=True,
        use_stemmer=True
    )

    return rouge_score

In [20]:
# loading the local model
model = AutoModelForSeq2SeqLM.from_pretrained(save_dir, local_files_only=True).to(device)

# model evaluation
result = model_evaluation(model, tokenizer, test_dataset)
print(result)

{'rouge1': 0.390481295660745, 'rouge2': 0.26200322663120856, 'rougeL': 0.3487057141068033, 'rougeLsum': 0.34940125211344875}


In [21]:
model_name = "lamini/lamini_docs_finetuned"

model_lamini = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto')
tokenizer_lamini = AutoTokenizer.from_pretrained(model_name)
tokenizer_lamini.add_special_tokens({'pad_token': '[PAD]'})

result_lamini = model_evaluation(model_lamini, tokenizer_lamini, test_dataset)
print(result_lamini)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


{'rouge1': 0.34621192689269137, 'rouge2': 0.16426376810151316, 'rougeL': 0.2479945015628851, 'rougeLsum': 0.26975039596357875}


In [22]:
for k, v in result.items():
    pct_diff = ((result[k] - result_lamini[k]) / result_lamini[k])*100
    print(k, ":", f'{pct_diff:0.2f}% improvement')

rouge1 : 12.79% improvement
rouge2 : 59.50% improvement
rougeL : 40.61% improvement
rougeLsum : 29.53% improvement
