<a href="https://colab.research.google.com/github/alvivar/llama2-googlecolab/blob/main/Llama2_from_HuggingFace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# From Hugging Face

In [13]:
# Classic transformers
!pip install transformers

# Optimizers
!pip install accelerate
!pip install xformers

# Fine tunning
!pip install trl
!pip install peft
!pip install bitsandbytes
!git clone https://github.com/lvwerra/trl

fatal: destination path 'trl' already exists and is not an empty directory.


# Downloads if needed

In [14]:
import requests
from tqdm import tqdm

def download_file_with_progress(url, destination_path, chunk_size=1024, timeout=30):
    """Stream ``url`` into ``destination_path`` while showing a tqdm progress bar.

    Args:
        url: Address fetched with an HTTP GET request.
        destination_path: Local path to write. The file is opened in ``'wb'``
            mode, so an existing file is overwritten.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: Raised by ``response.raise_for_status()`` when the
            server answers with an error status. The destination file is not
            opened in that case.
    """
    # The context manager releases the connection even if writing fails midway.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Check the status before touching the disk, so an error page is never
        # saved under the destination name.
        response.raise_for_status()

        # Fall back to 0 when the server does not send a Content-Length header.
        total_size = int(response.headers.get('content-length', 0))

        with open(destination_path, 'wb') as file, tqdm(
            desc=destination_path,
            total=total_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as progress_bar:
            for data in response.iter_content(chunk_size=chunk_size):
                file.write(data)
                progress_bar.update(len(data))

# Remote file to fetch, and the local file name it is written to.
url = "https://www.dropbox.com/s/dkj1dyc2ro7053h/pico8.psd?dl=1"
destination_path = "pico8.psd"

# Stream the file to disk with a progress bar.
download_file_with_progress(url=url, destination_path=destination_path)

pico8.psd: 100%|██████████| 55.9k/55.9k [00:00<00:00, 16.8MB/s]


# Prepare the pipeline from the pretrained model

In [16]:
from transformers import AutoTokenizer
import transformers
import torch

# Hub repository id of the checkpoint used for generation.
model = "meta-llama/Llama-2-7b-chat-hf"

# The access token lives in a local file so it is not hard-coded in the notebook.
with open('access_token.txt', 'r') as file:
    auth_token = file.read().strip()

tokenizer = AutoTokenizer.from_pretrained(model, token=auth_token)

# Build the text-generation pipeline. The same token used for the tokenizer is
# passed here too, so the model weights are fetched with the same credentials,
# and the tokenizer loaded above is reused instead of being resolved again.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    token=auth_token,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# The prompt

In [None]:
# Prompt text and sampling options for this generation run.
prompt = 'Enumerate countries ranked by diversity.\n'
generation_options = dict(
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=256,
)

sequences = pipeline(prompt, **generation_options)

# Print the text of every returned sequence.
for seq in sequences:
    print("Result: {}".format(seq['generated_text']))

# Fine-tuning with SFTTrainer from trl

In [7]:
# Dataset ids kept as notes; presumably alternatives for --dataset_name (TODO confirm):
# timdettmers/openassistant-guanaco
# roneneldan/TinyStories
# Fraser/mnist-text-small
# Run the SFT example script found under the local trl/ directory as a separate
# process. The flags name the base model and dataset and pass the 4-bit, PEFT,
# batch-size and gradient-accumulation options; their exact meaning is defined by
# sft_trainer.py, which is not part of this notebook.
!python trl/examples/scripts/sft_trainer.py \
    --model_name meta-llama/Llama-2-7b-hf \
    --dataset_name Fraser/mnist-text-small \
    --load_in_4bit \
    --use_peft \
    --batch_size 4 \
    --gradient_accumulation_steps 2


Loading checkpoint shards:   0% 0/2 [00:04<?, ?it/s]
Traceback (most recent call last):
  File "/content/trl/examples/scripts/sft_trainer.py", line 80, in <module>
    model = AutoModelForCausalLM.from_pretrained(
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py", line 493, in from_pretrained
    return model_class.from_pretrained(
  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 2903, in from_pretrained
    ) = cls._load_pretrained_model(
  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 3260, in _load_pretrained_model
    new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 725, in _load_state_dict_into_meta_model
    set_module_quantized_tensor_to_device(
  File "/usr/local/lib/python3.10/dist-packages/transformers/utils/bitsandbytes.py", line 99, in set_mo

# Prompt after fine-tuning

In [9]:
# NOTE(review): this calls the `pipeline` object already living in the kernel.
# The training script is launched with `!python`, i.e. in a separate process, so
# this cell presumably still samples from the original weights. Confirm whether
# the pipeline should be rebuilt from the fine-tuned output first.
sequences = pipeline(
    'Enumerate countries ranked by diversity.\n',
    max_length=256,
    eos_token_id=tokenizer.eos_token_id,
    num_return_sequences=1,
    top_k=10,
    do_sample=True,
)

# Show each generated continuation on its own "Result:" line.
for seq in sequences:
    print("Result:", seq['generated_text'])

Result: Enumerate countries ranked by diversity.
1. Colombia - with more than 1,800 ethnic groups, Colombia has the highest variety of ethnic groups in the world.
2. Brazil - Brazil, the fifth most populous country in the world, has more than 300 ethnic groups.
3. Papua New Guinea - home to about 8% of the world's languages, which include more than 800 indigenous languages.
4. Indonesia - Indonesia has 583 ethnic groups.
5. Nigeria - Nigeria is home to about 250 ethnic groups.
