In [None]:
#@title Setup cell -- Run me first!

!pip install --upgrade huggingface_hub datasets transformers accelerate>=0.21.0 better-profanity --quiet
!apt install git-lfs --quiet

import transformers
from better_profanity import profanity as p
import builtins
import io
import textwrap


class PrintWrapper(io.StringIO):
  """Callable stand-in for ``print`` that censors profanity before printing.

  Calling an instance behaves like ``builtins.print`` except that every
  positional argument that is a ``str`` is first passed through
  ``p.censor``. Non-string arguments are forwarded untouched.
  """

  def __call__(self, *args, **kwargs):
    """Censor the string arguments and delegate to ``builtins.print``.

    Args:
      *args: Values to print; each ``str`` among them is censored.
      **kwargs: Forwarded unchanged to ``builtins.print`` (``sep``, ``end``,
        ``file``, ``flush``).

    Returns:
      Whatever ``builtins.print`` returns.
    """
    # Censor every string argument, not just the first one, so that
    # print("label", generated_text) cannot leak uncensored text.
    cleaned = tuple(p.censor(arg) if isinstance(arg, str) else arg for arg in args)
    return builtins.print(*cleaned, **kwargs)

# Shadow the built-in `print` for the rest of the notebook so that text printed
# by later cells goes through PrintWrapper's censoring first.
print = PrintWrapper()

In [None]:
#@title Logging in to Hugging Face (used later to let you save your model!)

from huggingface_hub import notebook_login

# Log this session in to Hugging Face; the training setup (push_to_hub=True)
# and the final `push_to_hub` cell depend on this having been run.
notebook_login()

# Fine-tuning a language model

What is fine-tuning? Think of sculpting a large block of clay into a specific structure. The block represents a foundation for our sculpture -- something that has many potential uses and can be shaped into whatever we can imagine. Carving out our sculpture is then the process of fine-tuning: taking our generally useful material and giving it a specific purpose.

In the context of machine learning, a similar concept to this foundation exists: people with lots of time and money have trained some really, really large models (large = number of free parameters) on absolutely massive datasets. Models trained at this scale seem to be able to match human performance on many benchmarks, possibly by capturing some generalisable concepts about language and knowledge. As such, they've been termed "**foundation models**", which comes from their ability to serve as a good foundation for fine-tuning on specific tasks.

A common starting point for language tasks is [OpenAI's GPT2](https://huggingface.co/gpt2), which has been trained on a large body of text scraped from the internet. There are also many other popular models from various teams and companies that are openly available on the [HuggingFace model hub](https://huggingface.co/models) that would work well for a variety of use cases (e.g. image generation, text classification, language translation etc.).

If you've read a bit about [ChatGPT](https://chat.openai.com) and are wondering why we're not using GPT3/3.5 or GPT4, the answer is that none of these models is *open-source*, meaning that the model parameters were not released to the general public. Even if they were, though, they would likely not fit in memory since they're just that large -- I didn't see a specific number, but GPT3 size estimates range from 300-800GB. For context, this notebook has a GPU with 12GB of memory, and the best cards out there only go to 50-100GB, so you'd have to link many GPUs together to load the model!

So, how is fine-tuning done? Broadly speaking, there are a few key questions we need to answer:
- What's the *goal* of my task?
- What *data* can I use to give examples of this task?
- Which *model* should I choose to fine-tune?

In this notebook, we'll fix the answer to the first question as *text generation*. The goal there is usually to predict the next character/word/sentence in a sequence so that it matches, as closely as possible, the examples of text that you give to the model. However, the *type* of text we generate is completely up to you -- it all depends on the examples you show the model!

In [None]:
#@title Select your model! We'll start with a "distilled" version of GPT-2 (click the drop-down text box for more choices, or look [on the huggingface hub](https://huggingface.co/models?pipeline_tag=text-generation))
model_checkpoint = "distilgpt2" #@param {type:"string"}
import torch
from transformers import AutoTokenizer
from transformers import pipeline

# Tokenizer for the chosen checkpoint; reused by the training cells below.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Use the GPU when the runtime has one, otherwise fall back to CPU instead of
# crashing on a hard-coded "cuda" device.
pipeline_device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline(model=model_checkpoint, tokenizer=tokenizer, device=pipeline_device)

In [None]:
#@title Let's see how well our model can autocomplete text (change it to whatever you want!):
text_to_complete = "the quick brown fox jumped" #@param {type: "string"}
num_predictions = 3 #@param {type: "integer"}
max_new_tokens = 100 #@param {type: "integer"}

# Generate all completions for the prompt first, then print them one by one
# under a numbered heading (numbering starts at 1).
completions = pipe(
    text_to_complete,
    num_return_sequences=num_predictions,
    return_full_text=True,
    max_new_tokens=max_new_tokens,
)
for index, completion in enumerate(completions, start=1):
    heading = f'\r \n Prediction {index}: '
    print(heading + 2*'\n' + completion['generated_text'] + '\n', end='')

# Choosing your dataset

There are a couple of ways we've let you input data for this model to be trained on. For now, we've pre-prepared some datasets from HuggingFace on [Amazon reviews](https://huggingface.co/datasets/SetFit/amazon_reviews_multi_en) and [recipes](https://huggingface.co/datasets/m3hrdadfi/recipe_nlg_lite), which you can choose from the drop-down menu.

It's also set up to try to download any HuggingFace dataset, so you can even try your own!

In [None]:
#@title Run me when you've selected your dataset!
huggingface_dataset_name = 'amazon_reviews_multi_en' #@param ["amazon_reviews_multi_en", "m3hrdadfi/recipe_nlg_lite"] {allow-input: true}
from datasets import load_dataset

# Every branch leaves the loaded data in `datasets`, indexed by partition name.
# The two recipe branches additionally add one column per partition that holds
# each recipe flattened into a single block of text.

if huggingface_dataset_name == "amazon_reviews_multi_en":
    datasets = load_dataset("SetFit/amazon_reviews_multi_en")
    # Optional exercise: uncomment to train only on a subset of the reviews.
    # def filter_condition(data):
    #     condition = data["stars"] == 5  # edit this line!
    #     return condition
    # datasets = datasets.filter(filter_condition)


elif huggingface_dataset_name == "brianarbuckle/cocktail_recipes":
    datasets = load_dataset("brianarbuckle/cocktail_recipes")
    for partition in datasets:
        # Start a fresh list for each partition; a list shared across
        # partitions would grow past the row count of the second partition
        # and make add_column fail.
        rs = []
        for data in datasets[partition]:
            recipe_text = f"Cocktail: {data['title']}\n\n"

            recipe_text += "Ingredients:\n"
            for ingredient in data['ingredients']:
                recipe_text += f"- {ingredient}\n"

            recipe_text += "\nDirections:\n"
            for step, direction in enumerate(data['directions'], start=1):
                recipe_text += f"Step {step}: {direction}\n"
            rs.append(recipe_text)
        datasets[partition] = datasets[partition].add_column('long_recipe', rs)

elif huggingface_dataset_name == "m3hrdadfi/recipe_nlg_lite":
    datasets = load_dataset("m3hrdadfi/recipe_nlg_lite")
    print("download complete!")
    for partition in datasets:
        rs = []

        for data in datasets[partition]:
            full_recipe = f"Recipe: {data['name']}\n\n"
            full_recipe += f"Description: {data['description']}\n\n"
            full_recipe += f"Ingredients:\n{data['ingredients']}\n\n"
            full_recipe += f"Steps:\n{data['steps']}\n\n"
            rs.append(full_recipe)
        datasets[partition] = datasets[partition].add_column('full_recipe', rs)
else:
    # Any other name is passed straight to load_dataset.
    try:
        datasets = load_dataset(huggingface_dataset_name)
    except Exception as e:
        print(e)
        msg = f"Error: {huggingface_dataset_name} not found on HuggingFace datasets"
        # Chain the original exception so the real cause stays in the traceback.
        raise Exception(msg) from e








## Uploading your own data as a text file

### NOTE: THIS WILL OVERRIDE THE PREVIOUS CELL!

If you have your own data to hand, you can upload it to this Colab notebook and provide the filename to load it in! Just copy and paste your text in a `.txt` document, then click into the "Files" tab in the top right, and drag it into the base directory (not into a folder).

In [None]:
from datasets import Dataset, DatasetDict
your_file_name = 'game-of-thrones.txt' #@param {type: "string"}

# Read-only mode is all that is needed ('r+' would also demand write
# permission); the encoding is explicit so it does not depend on the locale.
with open(your_file_name, 'r', encoding='utf-8') as file:
    # One training example per non-blank line, with its newline restored.
    text = [x+'\n' for x in file.read().splitlines() if x.strip() !='']

if not text:
    # Fail here with a clear message rather than later with an empty dataset.
    raise ValueError(f"{your_file_name} contains no non-blank lines to train on")

# The text ends up in a column called 'line', so set `text_column` to 'line'
# in the tokenization cell when training on this file.
my_data = dict(number=list(range(len(text))), line=text)
datasets = DatasetDict(dict(train=Dataset.from_dict(my_data)))

In [None]:
#@title See some example paragraphs from your training data:

from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def print_newlines(df):
    """Display `df` as an HTML table with escaped newlines shown as line breaks.

    The frame is rendered with ``to_html`` and every literal backslash-n
    sequence in that markup is replaced by ``<br>`` before it is displayed.
    Returns whatever ``display`` returns.
    """
    table_markup = df.to_html()
    table_markup = table_markup.replace("\\n", "<br>")
    return display(HTML(table_markup))

def show_random_elements(dataset, num_examples=5):
    """Display `num_examples` distinct, randomly chosen rows of `dataset`.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to type.
        num_examples: How many different rows to show.

    Raises:
        AssertionError: If `num_examples` exceeds the number of rows.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no manual retry loop is
    # needed to keep the picks unique.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show class names instead of integer ids for label columns.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    print_newlines(df)

# Preview a few random rows of the training partition (default: 5 rows).
show_random_elements(datasets["train"])

In [None]:
#@title Select name of the column in your dataset that contains the text you want to use as training material

text_column = 'text'  #@param {type:"string"}
def tokenize_function(examples):
    """Tokenize the `text_column` entries of a batch.

    `examples` is indexed by column name; only the column selected through
    `text_column` is handed to the tokenizer, whose output is returned as-is.
    """
    texts = examples[text_column]
    return tokenizer(texts)

# Tokenize the data in batches and drop the original columns (taken from the
# 'train' partition) so only the tokenizer's outputs remain.
original_columns = datasets['train'].column_names
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=original_columns,
)
# Length, in tokens, of each training chunk produced by group_texts.
# block_size = tokenizer.model_max_length
block_size = 100
def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example lists
            (one of the columns must be ``"input_ids"``).

    Returns:
        A dict with the same keys, where each value is a list of chunks of
        exactly ``block_size`` items, plus a ``"labels"`` key holding a copy
        of the ``"input_ids"`` chunks. Items left over after the last full
        chunk are dropped.
    """
    from itertools import chain

    # Concatenate all texts. chain.from_iterable flattens in linear time,
    # whereas sum(list_of_lists, []) copies the growing list at every step.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels are a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

# Re-chunk the tokenized data with group_texts (batched, batch_size=1000,
# num_proc=4). The result has block_size-long "input_ids" chunks and a matching
# "labels" column, and is what the Trainer consumes.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [None]:
#@title Set up your hyperparameters for training, then run this cell!
import inspect
from transformers import AutoModelForCausalLM
from datasets import Dataset
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
from transformers import Trainer, TrainingArguments
my_model_name = "my-great-gpt2-review-model-katie" #@param {type:"string"}
learning_rate = 0.000891 #@param {type:"slider", min:0.000001, max:0.01, step:0.00001}
weight_decay=0.01
num_train_epochs=0.3 #@param {type:"slider", min:0.1, max:20, step:0.1}

# Newer transformers releases call this argument `eval_strategy`; older ones
# only know `evaluation_strategy`. The setup cell installs with --upgrade, so
# use whichever name the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

training_args = TrainingArguments(
    my_model_name,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    push_to_hub=True,
    num_train_epochs=num_train_epochs,
    **{_eval_strategy_key: "epoch"},
)

# Evaluate on the validation partition when there is one. Otherwise fall back
# to the first 100 training chunks; note that those chunks are also trained
# on, so the perplexity reported for them will look optimistic.
if "validation" in lm_datasets:
    eval_dataset=lm_datasets["validation"]
else:
    eval_dataset=Dataset.from_dict(lm_datasets["train"][:100])
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=eval_dataset
)

In [None]:
#@title You can evaluate the model "perplexity" -- lower should mean it can generate things more like the data you trained on
import math

# Perplexity is exp(eval_loss); this run gives the score before fine-tuning.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
#@title Run the training!

# Fine-tune `model` on lm_datasets["train"] with the hyperparameters that were
# passed to TrainingArguments in the setup cell.
trainer.train()

In [None]:
#@title Hopefully, the perplexity should be less -- run this cell to find out!
import math

# Same measurement as before training: exp of the evaluation loss.
eval_results = trainer.evaluate()
eval_loss = eval_results['eval_loss']
print(f"Perplexity: {math.exp(eval_loss):.2f}")

In [None]:
#@title Try out your fine-tuned model!
# Build a text-generation pipeline around the fine-tuned model.
# NOTE(review): .to('cpu') also moves the Trainer's own model to the CPU;
# confirm that is intended before re-running evaluation or training cells.
finetuned_model = trainer.model.to('cpu')
pipe = pipeline(model=finetuned_model, tokenizer=tokenizer, task='text-generation')
text_to_complete = "How do i make ramen?" #@param {type: "string"}
print()
generations = pipe(text_to_complete, num_return_sequences=4, return_full_text=True, max_new_tokens=250)
for rank, generation in enumerate(generations, start=1):
    print(f'\r \n Prediction {rank}: ' + '\n' + generation['generated_text']+ '\n', end='')

In [None]:
#@title If you're happy, you can save your model to the huggingface hub by running this cell:
# Trainer.push_to_hub takes a commit message as its first argument, not a repo
# name, so passing my_model_name there only mislabelled the commit. The target
# repo comes from the TrainingArguments the trainer was built with.
trainer.push_to_hub()
# The Trainer above was created without the tokenizer, so upload it separately
# to the same model name.
tokenizer.push_to_hub(my_model_name)