<a href="https://colab.research.google.com/github/arishp/veltech_genai/blob/main/gpt_2_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets accelerate torchvision

In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

In [None]:
# https://huggingface.co/datasets/hakurei/open-instruct-v1
dataset = load_dataset("hakurei/open-instruct-v1", split='train')
# Seed the preview sample so the same five rows are shown on every run.
print(dataset.to_pandas().sample(5, random_state=42))

In [4]:
def preprocess(example):
    """Attach a combined ``prompt`` text field to a single dataset record.

    The prompt is the record's ``instruction``, ``input`` and ``output``
    values, in that order, separated by single spaces. The record is
    modified in place and the same object is returned.
    """
    parts = (example['instruction'], example['input'], example['output'])
    example['prompt'] = "{} {} {}".format(*parts)
    return example

In [5]:
# Show the dataset summary (columns and row count) before the prompt column is built.
print(f"Before preprocessing: {dataset}")

Before preprocessing: Dataset({
    features: ['output', 'instruction', 'input'],
    num_rows: 498813
})


In [None]:
# Build the combined `prompt` column and drop the three columns it was assembled from.
source_columns = ['instruction', 'input', 'output']
dataset = dataset.map(preprocess, remove_columns=source_columns)
print(f"After preprocessing: {dataset}")

In [None]:
# Shuffle with a fixed seed, keep the first 100000 rows, then hold out 10% as a test split.
# The split is seeded too: train_test_split shuffles again internally, so without a seed
# the train/test partition would differ between runs even though the subset is fixed.
dataset = dataset.shuffle(seed=42).select(range(100000)).train_test_split(test_size=0.1, seed=42)
print(f"After train test split: {dataset}")

In [8]:
# Pull the two splits out of the split result under their own names.
train_dataset, test_dataset = dataset['train'], dataset['test']

In [None]:
# https://huggingface.co/microsoft/DialoGPT-medium
MODEL_NAME = 'microsoft/DialoGPT-medium'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Index the row first: selecting the column first reads every prompt just to show one.
print(f"First row Before tokenizing: {train_dataset[0]['prompt']}")

In [11]:
def tokenize_dataset(dataset):
    """Tokenize the ``prompt`` column of ``dataset`` in batches.

    Each batch of prompts is passed to the notebook-level ``tokenizer`` with
    ``truncation=True`` and ``max_length=128``. The ``prompt`` column is
    dropped via ``remove_columns`` and the result of ``dataset.map`` is
    returned.
    """
    def _encode_batch(batch):
        return tokenizer(batch['prompt'], truncation=True, max_length=128)

    return dataset.map(_encode_batch, batched=True, remove_columns=['prompt'])

In [None]:
train_dataset = tokenize_dataset(train_dataset)
test_dataset = tokenize_dataset(test_dataset)
print(f"tokenized train dataset: {train_dataset}")
# Fetch row 0 once instead of reading two entire columns just to show their first element.
first_row = train_dataset[0]
print(f"First row After tokenizing: {first_row['input_ids'], first_row['attention_mask']}")

In [None]:
# Load the pretrained causal language model that will be fine-tuned.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Collator that batches the tokenized rows; mlm=False turns masked-language-modeling off.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration: one epoch, small per-device batches, outputs under ./dialogpt2-instruct.
# NOTE(review): no evaluation schedule is configured here, so whether eval_dataset is
# actually used during training depends on the TrainingArguments defaults — confirm.
training_args = TrainingArguments(
    output_dir='./dialogpt2-instruct',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
)

# Wire the model, configuration, tokenized splits and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning, then save the resulting model, printing a progress message at each stage.
print("training started...")
trainer.train()
print("training completed...")
# No path is passed to save_model().
# NOTE(review): presumably it writes to the output_dir set in TrainingArguments — confirm.
trainer.save_model()
print("saved model...")

In [25]:
# Option 1 (disabled): load the checkpoint saved locally by the training cell.
# fine_tuned_model = AutoModelForCausalLM.from_pretrained('./dialogpt2-instruct').to('cuda')
# Option 2 (active): load an already fine-tuned checkpoint by its hub-style identifier,
# so the generation cells below do not use the weights trained in this notebook.
fine_tuned_model = AutoModelForCausalLM.from_pretrained('TheFuzzyScientist/diabloGPT_open-instruct') #.to('cuda')

In [26]:
def generate_text(prompt, model_selected):
    """Generate a continuation of ``prompt`` with ``model_selected``.

    The prompt is encoded with the notebook-level ``tokenizer``, moved to the
    device the model lives on, and passed to ``model_selected.generate`` with
    a total length limit of 64 tokens (prompt included).

    Args:
        prompt: Text to continue.
        model_selected: Model exposing ``generate`` and ``device``.

    Returns:
        The decoded text, cut after its last ``'.'`` so a trailing unfinished
        sentence is dropped. If the text contains no period at all, the full
        decoded text is returned instead of an empty string.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention mask,
    # and .to(...) keeps the inputs on the same device as the model.
    encoded = tokenizer(prompt, return_tensors='pt').to(model_selected.device)
    outputs = model_selected.generate(
        **encoded,
        max_length=64,
        pad_token_id=tokenizer.eos_token_id,
    )
    # The keyword is `skip_special_tokens` (plural); the misspelled form was not
    # applied, so special tokens stayed in the decoded text.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    last_period = generated_text.rfind('.')
    if last_period == -1:
        # rfind returns -1 when there is no period; slicing with -1 + 1 == 0
        # would wipe out the whole text.
        return generated_text
    return generated_text[: last_period + 1]

In [29]:
print("Generating text from base model... ")
# `model` was handed to the Trainer, which updates its weights in place, so after the
# training cell it is no longer the base model. Reload the pretrained checkpoint to get
# a genuine baseline for comparison with the fine-tuned model.
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
print(generate_text("I like to drink...", base_model))

Generating text from base model... 
I like to drink...


In [28]:
print("Generating text from fine tuned model... ")
# Same prompt as the base-model cell so the two outputs can be compared directly.
fine_tuned_sample = generate_text("I like to drink...", fine_tuned_model)
print(fine_tuned_sample)

Generating text from fine tuned model... 
I like to drink...  I like to drink coffee.  I like to drink tea.  I like to drink coffee with milk.  I like to drink tea with milk.  I like to drink coffee with milk.  I like to drink tea with milk.  I like to drink coffee with milk.
