Parte 2 - Fine Tuning

In [1]:
#Import utilizado em todo o processo
!pip install transformers datasets
import json
import random
import torch

from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
from datasets import Dataset, concatenate_datasets



In [2]:
# Inicializar uma lista para armazenar os títulos e conteúdos
data = []

# Carregar o json, tratando cada linha como um objeto JSON separado
# Obs. Foi utilizado uma parte do arquivo trn.json para fins de teste em um modelo completo o ideal seria treinar com todos os dados do trn.json)

with open(f'./json_parte_1.json', 'r') as f:
    for line in f:
        line = line.strip()
        try:
           data.append(json.loads(line))
        except json.JSONDecodeError as e:
           continue

data = random.sample(data, int(0.10 * len(data))) # reduzindo o dataset para teste

#Criando o prompt para o dataset
processed_data = []

for item in data:
    prompt = "What is the summary of " + item['title'] + "?"
    response = item['content']
    processed_data.append({'prompt': prompt, 'response': response})

processed_data_dict = {
    'prompt': [d['prompt'] for d in processed_data],
    'response': [d['response'] for d in processed_data]
}

In [3]:
# Carregar o tokenizer e o modelo BART

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
#Exemplo para demonstrar como a resposta é exibida antes de efetuar o fine tuning
def generate_answer(question, title, context):
    # Cria o prompt
    prompt = f"Product summary:\nTitle: {title}\nDescription: {context}\nQuestion: {question}\nAnswer:"

    # Tokenizer
    inputs = tokenizer.encode(prompt, return_tensors='pt', max_length=512, truncation=True) # Truncate the input to 512 tokens

    # Gerar resposta
    outputs = model.generate(inputs, max_new_tokens=256, num_return_sequences=1, temperature=0.7, top_k=50, repetition_penalty=1.2)

    # Decodificar resposta
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.split("Answer:")[-1].strip()


In [5]:
first_item = data[0]

question = "What is the summary of 'asian cinema a field guide'?"

# Gerar a resposta
answer = generate_answer(question, first_item['title'], first_item['content'])
print(f"Resposta: {answer}")



Resposta: Product summary:                Title: fooling some of the people all of the time a long short story.Description: instead of stewing in private einhorn wrote a book fooling some of the people all of the time about his sixyear ordeal with allieddaily mail.


In [6]:
# Cria o Dataset
data = Dataset.from_dict(processed_data_dict)

# Carregar o tokenizer e o modelo BART
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Função para tokenizar
def tokenize_function(examples):
    inputs = tokenizer(examples['prompt'], max_length=256, truncation=True, padding='max_length')
    targets = tokenizer(examples['response'], max_length=256, truncation=True, padding='max_length')
    return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'labels': targets['input_ids']}

tokenized_dataset = data.map(tokenize_function, batched=True)

# Dividir o dataset em treino e validação
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)  # 80% treino, 20% validação
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

Map:   0%|          | 0/23884 [00:00<?, ? examples/s]

In [8]:
# Definir os argumentos de treinamento
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",  # Avaliação em intervalos regulares
    logging_dir='./logs',  # Logs para o TensorBoard
    logging_steps=200,  # Log a cada 200 steps
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16=True,  # Se aplicável
    load_best_model_at_end = True
)

# Usar um data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Inicializar o Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)],  # Adicionado Early Stopping
)

# Iniciar o fine-tuning
trainer.train()

trainer.save_model('./results')  # Salva o modelo treinado
tokenizer.save_pretrained('./results')  # Salva o tokenizer

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
50,3.9117,2.614517
100,2.6129,2.521091
150,2.5732,2.464893
200,2.5432,2.440614
250,2.4772,2.416838
300,2.4206,2.400212
350,2.4753,2.378654
400,2.3485,2.368775
450,2.3126,2.363389
500,2.3695,2.340403


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_

('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.json',
 './results/merges.txt',
 './results/added_tokens.json')

In [10]:
# Carregar o modelo treinado para gerar respostas
model = BartForConditionalGeneration.from_pretrained('./results')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
tokenizer.pad_token = tokenizer.eos_token  # Definir o token de padding

def generate_response(question):
    # Formatar o prompt
    prompt = f"Question: {question}\nAnswer:"

    # Tokenizar a entrada
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Gerar resposta
    output = model.generate(input_ids, max_length=128, num_return_sequences=1)

    # Decodificar e retornar a resposta
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Exemplo de uso
question = "What is the summary of 'asian cinema a field guide'?"
response = generate_response(question)
print(f"Resposta: {response}")


Resposta: a field guide to asian cinema a field guide is a mustread for anyone interested in the history of cinema in the united states or interested in film history in general    it is also a must read for anyone who wants to know more about the history and development of the cinema of asia   this is a book that will be of great interest to anyone who is interested in cinema history and cinema history   the book is well written well researched and well written and it is an excellent choice for students of cinema history as well as for those who want to learn more about cinema in general and asian
