## Data Preparation

In [None]:
# !pip install datasets
# !pip install transformers -U
# !pip install accelerate                   # library that greatly speeds up training and lets us automatically make use of multiple GPUs if available
# !pip install trl                          # library that lets us instantiate our trainer and define our training arguments; highly optimized for training transformers

In [None]:
import torch
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
# load_dataset is the Hugging Face helper that takes the string identifying a dataset
# and downloads that dataset for us.
from datasets import load_dataset

# A dataset of 1000 examples; it lets us fine-tune GPT-2 into a model that can
# actually do question answering.
DATASET_NAME = "mlabonne/guanaco-llama2-1k"

dataset = load_dataset(path=DATASET_NAME)

Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 71581.26 examples/s]


In [None]:
print(dataset)      # this dataset only has a "train" key because it is primarily meant for training; other datasets may also have a "test" key for testing
# the "train" key maps to a Dataset object whose only feature is "text" (no labels, just blocks of text) and which holds 1000 examples

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})


In [6]:
# Pull out the only split this dataset ships with and show its summary.
training_dataset = dataset["train"]
print(training_dataset)

Dataset({
    features: ['text'],
    num_rows: 1000
})


In [None]:
# example question / answer pair in Spanish (the dataset is multilingual, so the model will see chat examples in several languages)
training_dataset[0]

{'text': '<s>[INST] Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo? [/INST] Esto vale tanto para médicos como para cualquier otra profesión tras finalizar los estudios aniversarios y mi consejo sería preguntar a cuántas personas haya conocido mejor. En este caso, mi primera opción sería hablar con otros profesionales médicos, echar currículos en hospitales y cualquier centro de salud. En paralelo, trabajaría por mejorar mi marca personal como médico mediante un blog o formas digitales de comunicación como los vídeos. Y, para mejorar las posibilidades de encontrar trabajo, también participaría en congresos y encuentros para conseguir más contactos. Y, además de todo lo anterior, seguiría estudiando para presentarme a las oposiciones y ejercer la medicina en el sector público de mi país. </s>'}

In [None]:
# another question / answer pair: the model learns from the prompt and the response together, picking up the back-and-forth chat pattern
# no separate labels are needed - each training example is a single block of text, and the model learns by predicting the next token
training_dataset[11]

{'text': '<s>[INST] write me a 1000 words essay about deez nuts. [/INST] The Deez Nuts meme first gained popularity in 2015 on the social media platform Vine. The video featured a young man named Rodney Bullard, who recorded himself asking people if they had heard of a particular rapper. When they responded that they had not, he would respond with the phrase "Deez Nuts" and film their reactions. The video quickly went viral, and the phrase became a popular meme. \n\nSince then, Deez Nuts has been used in a variety of contexts to interrupt conversations, derail discussions, or simply add humor to a situation. It has been used in internet memes, in popular music, and even in politics. In the 2016 US presidential election, a 15-year-old boy named Brady Olson registered as an independent candidate under the name Deez Nuts. He gained some traction in the polls and even made appearances on national news programs.\n\nThe Deez Nuts meme has had a significant impact on popular culture. It has b

In [10]:
# a third sample, this time a German prompt / response pair in the same <s>[INST] ... [/INST] ... format
training_dataset[7]

{'text': '<s>[INST] Ich möchte für ein Handy-Game den Flug von Papierfliegern simulieren. Was für Formeln können mir helfen um das möglichst realistisch zu tun? [/INST] Um den Flug von Papierfliegern in einem Spiel realistisch zu simulieren, gibt es einige wichtige Formeln, die Sie berücksichtigen sollten:\n\nLift-Kraft: Der Lift, der einen Papierflieger in der Luft hält, wird durch die Bewegung der Luft über die Oberfläche des Flügels erzeugt. Die Formel zur Berechnung der Lift-Kraft lautet:\n\nL = 0.5 * rho * v^2 * A * Cl\n\nL = Lift-Kraft\nrho = Luftdichte\nv = Geschwindigkeit des Flugzeugs\nA = Flügeloberfläche\nCl = Auftriebsbeiwert des Flügels\n\nLuftwiderstand: Der Luftwiderstand ist die Kraft, die gegen den Flug eines Papierfliegers wirkt und durch die Bewegung der Luft entsteht, wenn sie um das Flugzeug herumströmt. Die Formel zur Berechnung des Luftwiderstands lautet:\n\nD = 0.5 * rho * v^2 * A * Cd\n\nD = Luftwiderstand\nrho = Luftdichte\nv = Geschwindigkeit des Flugzeugs\nA

In [None]:
MODEL_NAME = "distilgpt2"
import transformers
# the transformers library provides a family of Auto* classes that pick the right architecture from a checkpoint name
# AutoModelForCausalLM is the Auto class for causal language models (GPT-2 and chatbots are causal language models)
# They're called this because they generate one token at a time
# If we asked a causal language model to write a poem, it would build the response one token at a time until the final response:
# This
# This is
# This is a
# This is a funny
# This is a funny poem
# The tokens are predicted in a cause-and-effect way - the tokens in the past influence the next token
from transformers import AutoModelForCausalLM       # used to automatically get the model class based on the model name
from transformers import AutoTokenizer              # automatically retrieves the right tokenizer object for the model being used

# Load the pretrained weights; device_map="auto" tells the library to move things
# to the GPU or CPU automatically when necessary.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
)

# Keep the cache enabled so results for earlier tokens are not recomputed:
# This
# This is
# This is a         -> the only new token is "a"; the hidden states for the prior tokens are reused from the cache
model.config.use_cache = True

# distilgpt2 is a built-in transformers architecture, so no custom Python from the Hub repository has to run.
# trust_remote_code is therefore left at its default (False): enabling it would allow arbitrary code shipped
# with a checkpoint to execute on this machine, which is not needed here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# during training we feed in batches of examples, and the examples in a batch may not all be the same length,
# so we pad them to a common length; the end-of-sentence token is reused as the padding token
# once a sentence is over (no more tokens left), repeating end-of-sentence conveys the same thing - the sentence is over
# padding goes on the right so it lands at the end of the sequence; left padding would put end-of-sentence tokens at the start
# the integer id of the padding token is also set to the integer id of the end-of-sentence token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
tokenizer.pad_token_id = tokenizer.eos_token_id

# Generation settings live on the model's generation config; the edits below mutate that object in place.
generation_configuration = model.generation_config
generation_configuration.pad_token_id = tokenizer.eos_token_id
generation_configuration.eos_token_id = tokenizer.eos_token_id
# the model can't just go on forever: GPT-2 has a context length of 1024 tokens, and that budget covers the
# prompt AND the generated tokens together. Capping only the number of *new* tokens at 1024 would let
# prompt + response run past the context window, so cap the total sequence length at the model's context size.
generation_configuration.max_length = model.config.max_position_embeddings

# at each step the model outputs a probability distribution over which token should come next, e.g.:
# [0.02, 0.23, 0.49, 0.01, ..., 0.01, 0.14]   (one entry per vocabulary token, summing to 1)

# the settings below control how we sample from that distribution, which affects the quality and diversity
# of the model's outputs (super bland vs fun and varied). They are only used when sampling is switched on;
# without do_sample=True, decoding is greedy and temperature / top_p / top_k are ignored.
generation_configuration.do_sample = True
# temperature divides every raw score (before softmax turns the scores into probabilities) by this constant
# a low temperature (below 1) sharpens the distribution, making high-probability tokens even more likely - less diverse
# a high temperature (above 1) flattens the distribution, giving lower-probability tokens a better chance - more diverse
generation_configuration.temperature = 0.7
# top-p keeps only the most likely tokens: take tokens in order of decreasing probability until their
# cumulative probability reaches p, and discard the remaining, less likely tokens from the sampling process
generation_configuration.top_p = 0.9
# top-k considers only the k highest-probability tokens during sampling (those k are then renormalized to sum to 1)
generation_configuration.top_k = 20

## Training + Testing