In [None]:
!pip install -U accelerate bitsandbytes datasets transformers peft trl sentencepiece wandb langchain

In [None]:
# Print the installed version and metadata of each package used in this notebook.
!pip show accelerate bitsandbytes datasets transformers peft trl sentencepiece wandb langchain

In [2]:
from huggingface_hub import notebook_login
# Show the Hugging Face login widget (rendered below) to authenticate this notebook session.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Monitoring with Weights and Biases (W&B)

Weights and Biases (W&B) is an MLOps platform that can help developers monitor and document ML training workflows from end to end. As mentioned earlier, we will use W&B to get an idea of how well the training is working and if the model is improving over time. For W&B, we need to name the project; alternatively, we can use wandb's init() method:

In [3]:
import os
# Name the W&B project through an environment variable.
os.environ["WANDB_PROJECT"] = "finetuning"

In [4]:
import wandb
# Log in to W&B; as the output below shows, this prompts for an API key and returns True on success.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [17]:
# Finish the currently active W&B run, if there is one.
if wandb.run is not None:
    wandb.finish()

# Hugging Face dataset

Next, we’ll need to choose a dataset for our work. Many datasets are suitable here, covering coding, storytelling, tool use, SQL generation, grade-school math questions (GSM8k), and many other tasks. Hugging Face provides a wealth of datasets, which can be browsed at https://huggingface.co/datasets. They cover a wide range of tasks, including very niche ones.

In this recipe, we are fine-tuning for question-answering performance with the `SQuAD v2` dataset. You can see a detailed [dataset description on Hugging Face](https://huggingface.co/datasets/squad_v2).

In [6]:
from datasets import load_dataset

dataset_name = "squad_v2"

# Load the training and validation splits as two separate datasets.
dataset = load_dataset(path=dataset_name, split="train")
eval_dataset = load_dataset(path=dataset_name, split="validation")

Downloading builder script:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.02k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [7]:
# Load all splits at once to display the DatasetDict overview (split names, columns, row counts).
load_dataset(dataset_name)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [8]:
# Show the column schema of the training split.
dataset.features

{'id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'context': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}

# Hugging Face open-source models

We want a small model that we can run locally at a decent token rate. `LLaMa-2` models require signing a license agreement with your email address and getting confirmed (which, to be fair, can be very fast), as they come with restrictions for commercial use. LLaMa derivatives such as OpenLLaMa have performed quite well, as evidenced by the [HF leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
`OpenLLaMa` version 1 cannot be used for coding tasks, because of the tokenizer. Therefore, let’s use v2! We’ll use a 3B parameter model, which we’ll be able to use even on older hardware.


In [9]:
# Hugging Face Hub id of the base model to fine-tune (OpenLLaMA v2, 3B parameters).
model_id = "openlm-research/open_llama_3b_v2"

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Quantization settings: load the weights in 4-bit NF4 and run computations in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
# "auto" delegates device placement of the model to the library.
device_map = "auto"

# Load the base model with the quantization config applied.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map=device_map,  # use the variable above instead of repeating the literal
    trust_remote_code=True,
)
# Turn off the generation cache on the model config before training.
base_model.config.use_cache = False

In [11]:
from google.colab import drive
# Mount Google Drive at /content/gdrive (Colab only); the training output directory used later lives under this path.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# LoRA fine-tuning

Now, we’ll define our training configuration. We’ll set up LoRA and other training arguments:

In [None]:
from transformers import AutoTokenizer, TrainingArguments, EarlyStoppingCallback
from peft import LoraConfig
from trl import SFTTrainer

# Set tensor-parallel degree used at pretraining to 1.
# More info: https://github.com/huggingface/transformers/pull/24906
base_model.config.pretraining_tp = 1

# LoRA adapter configuration for causal language modelling.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

# Load the base model's tokenizer; reuse EOS as the padding token and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# If you don't want to use Google Drive, just set this to a directory on your computer:
output_dir = "/content/gdrive/My Drive/results"
# Name used when the adapter is pushed to the Hugging Face Hub.
new_model_name = f"openllama-3b-peft-{dataset_name}"

training_args = TrainingArguments(
    output_dir=output_dir,  # use new_model_name if saving to the Hugging Face Hub
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 examples per optimizer step on each device
    learning_rate=2e-4,
    logging_steps=10,
    max_steps=2000,  # training can still improve after many steps!
    num_train_epochs=100,  # NOTE(review): presumably ignored because max_steps is positive — confirm in the TrainingArguments docs
    evaluation_strategy="steps",
    eval_steps=5,  # update steps between two evaluations. NOTE(review): runs over the whole validation split every 5 steps, more often than logging_steps — consider a larger value
    save_total_limit=5,  # only the last 5 checkpoints are kept
    push_to_hub=False,  # you can set this to True if you want to upload your model to the Hugging Face Hub
    load_best_model_at_end=True,  # to use in combination with early stopping
    report_to="wandb"
)

# Maximum sequence length passed to the trainer.
max_seq_length = 512

trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    dataset_text_field="question",  # this depends on the dataset! NOTE(review): only the `question` column is used as training text; `context` and `answers` are not — confirm this is intended
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_args,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=200)]  # patience is counted in evaluation rounds
)

# Start fine-tuning.
trainer.train()

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


In [None]:
# Save the trained model into a "final_checkpoint" folder inside the output directory.
final_checkpoint_dir = os.path.join(output_dir, "final_checkpoint")
trainer.model.save_pretrained(final_checkpoint_dir)

In [None]:
# Upload the trained model to the Hugging Face Hub under the repository name defined earlier.
trainer.model.push_to_hub(repo_id=new_model_name)

In [None]:
# usually, the peft model is stored as an adapter, not as a full model, therefore the loading is a bit different:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline

model_id = 'openlm-research/open_llama_3b_v2'
# Hub repository that holds the fine-tuned PEFT adapter.
adapter_id = "benji1a/openllama-3b-peft-squad_v2"

config = PeftConfig.from_pretrained(adapter_id)

# Load the plain base model first, then wrap it with the adapter weights.
adapter_base = AutoModelForCausalLM.from_pretrained(model_id)
model = PeftModel.from_pretrained(adapter_base, adapter_id)

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Wrap model and tokenizer in a text-generation pipeline and expose it to LangChain.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=256,
)
llm = HuggingFacePipeline(pipeline=pipe)

# Commercial models

So far, we’ve shown how to fine-tune and deploy an open-source LLM. Some commercial models can be fine-tuned on custom data as well.


Fine-tuning a PaLM model for text classification can be done like this:

In [None]:
from skllm.models.palm import PaLMClassifier
# Illustrative snippet: X_train, y_train and X_test are not defined in the cells shown here and must be supplied by the reader.
clf = PaLMClassifier(n_update_steps=100)
clf.fit(X_train, y_train) # y_train is a list of labels
labels = clf.predict(X_test)


Similarly, you can fine-tune the GPT-3.5 model for text classification like this:

In [None]:
from skllm.models.gpt import GPTClassifier
# Illustrative snippet: X_train, y_train and X_test must be supplied by the reader.
clf = GPTClassifier(
    base_model="gpt-3.5-turbo-0613",
    n_epochs=None,  # int or None. When None, will be determined automatically by OpenAI
    default_label="Random",  # optional
)
clf.fit(X_train, y_train)  # y_train is a list of labels
labels = clf.predict(X_test)


# Prompt engineering


## Zero-shot prompting

Zero-shot prompting, as opposed to few-shot prompting (discussed next), involves feeding task instructions directly to an LLM without providing any demonstrations or examples. This prompt tests the capabilities of the pre-trained model to understand and follow the instructions:


In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI


# The prompt contains only the task instruction and the text to classify — no examples.
prompt = PromptTemplate(
    template="Classify the sentiment of this text: {text}",
    input_variables=["text"],
)
model = ChatOpenAI()

# Pipe the formatted prompt into the chat model and print the reply.
chain = prompt | model
zero_shot_reply = chain.invoke({"text": "I hated that movie, it was terrible!"})
print(zero_shot_reply)

## Few-shot learning

Few-shot learning presents the LLM with just a few input-output examples relevant to the task, without explicit instructions.

We provide it with a few examples:

In [None]:
# Three labelled sentiment examples (one per class) used to build the few-shot prompt.
examples = [
    {
        "input": "I absolutely love the new update! Everything works seamlessly.",
        "output": "Positive",
    },
    {
        "input": "It's okay, but I think it could use more features.",
        "output": "Neutral",
    },
    {
        "input": "I'm disappointed with the service, I expected much better performance.",
        "output": "Negative",
    },
]


In [None]:
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_openai import ChatOpenAI


# Each example is rendered as "<input> -> <output>".
example_prompt = PromptTemplate(
    template="{input} -> {output}",
    input_variables=["input", "output"],
)
# The suffix repeats the example pattern and stops right after the arrow, so the model
# is cued to complete the label for the new input (a "Question: ..." suffix would break the pattern).
prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    suffix="{input} ->",
    input_variables=["input"],
)
print((prompt | ChatOpenAI()).invoke({"input": " This is an excellent book with high quality explanations."}))


## Chain-of-thought prompting

`CoT` prompting aims to encourage reasoning by getting the model to provide intermediate steps, leading to the definitive answer. This is done by prefixing the prompt with instructions to show its thinking. There are two variants of CoT, `zero-shot` and `few-shot`. 

In zero-shot CoT prompting, we just add the instruction “Let’s think step by step!” to the prompt.

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI


# Zero-shot CoT: append the "think step by step" instruction to the question.
reasoning_prompt = "{question}\nLet's think step by step!"
prompt = PromptTemplate(
    template=reasoning_prompt,
    input_variables=["question"],
)
model = ChatOpenAI()
chain = prompt | model

apples_question = "There were 5 apples originally. I ate 2 apples. My friend gave me 3 apples. How many apples do I have now?"
print(chain.invoke({"question": apples_question}))


`Few-shot CoT` prompting is a few-shot prompt, where the reasoning is explained as part of the example solutions, with the idea to encourage an LLM to explain its reasoning before deciding. If we go back to the few-shot examples from earlier, we can extend them as follows:


In [None]:
# Few-shot CoT examples: each output spells out the reasoning before stating the sentiment.
examples = [
    {
        "input": "I absolutely love the new update! Everything works seamlessly.",
        "output": "Love and absolute works seamlessly are examples of positive sentiment. Therefore, the sentiment is positive",
    },
    {
        "input": "It's okay, but I think it could use more features.",
        "output": "It's okay is not an endorsement. The customer further thinks it should be extended. Therefore, the sentiment is neutral",
    },
    {
        "input": "I'm disappointed with the service, I expected much better performance.",
        "output": "The customer is disappointed and expected more. This is negative",
    },
]


## Self-consistency

With self-consistency prompting, the model generates multiple candidate answers to a question. These are then compared against each other, and the most consistent or frequent answer is selected as the final output.


In [None]:
# In the first step, we’ll create multiple solutions to a question or a problem:
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI


# Step 1: ask the model for several independent answers to the same question.
solutions_template = """
Generate {num_solutions} distinct answers to this question:
{question}


Solutions:
"""
solutions_prompt = PromptTemplate(
    input_variables=["question", "num_solutions"],
    template=solutions_template,
)
# The generated answers are published under the "solutions" key for the next chain.
solutions_chain = LLMChain(
    prompt=solutions_prompt,
    llm=ChatOpenAI(),
    output_key="solutions",
)

# Step 2: count how often each answer occurs and pick the most frequent one, again with an LLM.
consistency_template = """
For each answer in {solutions}, count the number of times it occurs. Finally, choose the answer that occurs most.


Most frequent solution: 
"""
consistency_prompt = PromptTemplate(
    input_variables=["solutions"],
    template=consistency_template,
)
# Reads the "solutions" key and writes its verdict to "best_solution".
consistency_chain = LLMChain(
    prompt=consistency_prompt,
    llm=ChatOpenAI(),
    output_key="best_solution",
)

# Combine both chains with a SequentialChain, which runs them one after the other:
# the first chain produces several answers to the question, and the second chain
# receives those answers and outputs the one that occurs most often.
from langchain.chains import SequentialChain


answer_chain = SequentialChain(
    input_variables=["question", "num_solutions"],
    chains=[solutions_chain, consistency_chain],
    output_variables=["best_solution"],
)

# As the last step, ask a simple question and check the answer.
# `invoke` expects the chain inputs as one dict (its first positional argument);
# passing them as keyword arguments leaves that argument missing and raises a TypeError.
print(answer_chain.invoke({
    "question": "Which year was the Declaration of Independence of the United States signed?",
    "num_solutions": "5",
}))


## Tree-of-thought

In `Tree-of-Thought (ToT)` prompting, we generate multiple problem-solving steps or approaches for a given prompt and then use the AI model to critique them.

In [None]:
# We need four prompt templates: solution generation, evaluation, reasoning, and ranking.
# First, the template that generates candidate solutions:
solutions_template = """
Generate {num_solutions} distinct solutions for {problem}. Consider factors like {factors}.


Solutions:
"""
solutions_prompt = PromptTemplate(
    input_variables=["problem", "factors", "num_solutions"],
    template=solutions_template,
)
# Let's ask the LLM to evaluate these solutions.
# (This sentence was bare prose inside the code cell, which made the whole cell a SyntaxError.)
evaluation_template = """
Evaluate each solution in {solutions} by analyzing pros, cons, feasibility, and probability of success.


Evaluations:
"""
evaluation_prompt = PromptTemplate(
    template=evaluation_template,
    input_variables=["solutions"],
)

# Next, reason in more depth about the most promising solutions:
reasoning_template = """
For the most promising solutions in {evaluations}, explain scenarios, implementation strategies, partnerships needed, and handling potential obstacles.


Enhanced Reasoning:
"""
reasoning_prompt = PromptTemplate(
    input_variables=["evaluations"],
    template=reasoning_template,
)

# Finally, rank the solutions using the reasoning produced so far:
ranking_template = """
Based on the evaluations and reasoning, rank the solutions in {enhanced_reasoning} from most to least promising.


Ranked Solutions:
"""
ranking_prompt = PromptTemplate(
    input_variables=["enhanced_reasoning"],
    template=ranking_template,
)


Next, we create chains from these templates before we put the chains all together:


In [None]:
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI


# One LLMChain per stage; each chain's output_key is the input variable of the next stage's prompt.
solutions_chain = LLMChain(
    prompt=solutions_prompt,
    llm=ChatOpenAI(),
    output_key="solutions",
)
# The name keeps its original spelling because the SequentialChain cell refers to it.
evalutation_chain = LLMChain(
    prompt=evaluation_prompt,
    llm=ChatOpenAI(),
    output_key="evaluations",
)
reasoning_chain = LLMChain(
    prompt=reasoning_prompt,
    llm=ChatOpenAI(),
    output_key="enhanced_reasoning",
)
ranking_chain = LLMChain(
    prompt=ranking_prompt,
    llm=ChatOpenAI(),
    output_key="ranked_solutions",
)


Please note how each output_key corresponds to an input_key in the prompt of the following chain. Finally, we connect these chains into a `SequentialChain`:


In [None]:
from langchain.chains import SequentialChain

# Run the four stages in order: generate -> evaluate -> reason -> rank.
tot_chain = SequentialChain(
    chains=[solutions_chain, evalutation_chain, reasoning_chain, ranking_chain],
    input_variables=["problem", "factors", "num_solutions"],
    output_variables=["ranked_solutions"],
)
# Use `invoke` (as the other cells do) instead of the deprecated `run`.
# `invoke` returns a dict, so pick the declared output key to print the same text as before.
tot_result = tot_chain.invoke({
    "problem": "Prompt engineering",
    "factors": "Requirements for high task performance, low token use, and few calls to the LLM",
    "num_solutions": 3,
})
print(tot_result["ranked_solutions"])
