# Finetune Hugging Face Model 

### Load Dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files="../../static/csv/sales_policy_data.csv")

### Load the model

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


### Tokenize the dataset

In [4]:
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    prompt = example["prompt"]
    completion = example["completion"]
    inputs = tokenizer(prompt, truncation=True, max_length=15, padding="max_length")
    labels = tokenizer(completion, truncation=True, max_length=15, padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)
model.resize_token_embeddings(len(tokenizer))


Embedding(50257, 768)

### Set up training arguments

In [5]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="../../static/models/results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=4
)

### Define a trainer

In [6]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["train"]
)

### Finetune the model

In [7]:
trainer.train()

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 8.133882522583008, 'eval_runtime': 0.3829, 'eval_samples_per_second': 70.516, 'eval_steps_per_second': 18.282, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 8.176210403442383, 'eval_runtime': 0.3467, 'eval_samples_per_second': 77.875, 'eval_steps_per_second': 20.19, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 8.303587913513184, 'eval_runtime': 0.3705, 'eval_samples_per_second': 72.865, 'eval_steps_per_second': 18.891, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 8.332597732543945, 'eval_runtime': 0.2878, 'eval_samples_per_second': 93.825, 'eval_steps_per_second': 24.325, 'epoch': 4.0}
{'train_runtime': 13.6846, 'train_samples_per_second': 7.892, 'train_steps_per_second': 2.046, 'train_loss': 8.252579280308314, 'epoch': 4.0}


TrainOutput(global_step=28, training_loss=8.252579280308314, metrics={'train_runtime': 13.6846, 'train_samples_per_second': 7.892, 'train_steps_per_second': 2.046, 'total_flos': 826744320000.0, 'train_loss': 8.252579280308314, 'epoch': 4.0})

### Save the finetune model

In [8]:
model.save_pretrained("../../static/models/fine_tuned_gpt2_model")
tokenizer.save_pretrained("../../static/models/fine_tuned_gpt2_model")

('../../static/models/fine_tuned_gpt2_model/tokenizer_config.json',
 '../../static/models/fine_tuned_gpt2_model/special_tokens_map.json',
 '../../static/models/fine_tuned_gpt2_model/vocab.json',
 '../../static/models/fine_tuned_gpt2_model/merges.txt',
 '../../static/models/fine_tuned_gpt2_model/added_tokens.json',
 '../../static/models/fine_tuned_gpt2_model/tokenizer.json')

### Use the finetuned model for inference

In [9]:
from transformers import pipeline

fine_tuned_gpt2_model = AutoModelForCausalLM.from_pretrained("../../static/models/fine_tuned_gpt2_model")
fine_tuned_gpt2_tokenizer = AutoTokenizer.from_pretrained("../../static/models/fine_tuned_gpt2_model")

generator = pipeline("text-generation", model=fine_tuned_gpt2_model, tokenizer=fine_tuned_gpt2_tokenizer, device=0, truncation=True)

new_query = "Are discounts available for bulk purchases?"
response = generator(new_query, max_length=50, num_return_sequences=1)


In [10]:
response_text = response[0]["generated_text"]
purified_response = response_text.replace(new_query, "").strip()
print(purified_response)

$3.85 per order in advance to be picked up by the store within 90 days. In most cases the discount will be used on a per-item basis for the buyer's first $11 or $14
