<a href="https://colab.research.google.com/github/alvivar/llama2-googlecolab/blob/main/Llama2_from_HuggingFace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# From Hugging Face

In [1]:
# Classic transformers
!pip install --upgrade huggingface-hub
!pip install --upgrade transformers

# Optimizers
!pip install accelerate
!pip install xformers

# Fine tunning
!pip install trl
!pip install peft
!pip install bitsandbytes
!git clone https://github.com/lvwerra/trl

Collecting huggingface-hub
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/268.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/268.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub
Successfully installed huggingface-hub-0.16.4
Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB

# Auth

In [2]:
# Create an access token at https://huggingface.co/settings/tokens
#
# SECURITY: never paste the token into the notebook source. Any token that was
# ever committed in a notebook must be treated as leaked and revoked on the
# settings page above.

import os
from getpass import getpass

import huggingface_hub

# Prefer the HF_TOKEN environment variable; otherwise prompt without echoing
# the value so it ends up in neither the cell source nor the cell output.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
huggingface_hub.login(hf_token)

# !huggingface-cli login

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Downloads if needed

In [3]:
import requests
from tqdm import tqdm

def download_file_with_progress(url, destination_path, chunk_size=1024, timeout=30):
    """Stream a URL to a local file while showing a tqdm progress bar.

    Args:
        url: Address of the file to download.
        destination_path: Local path the bytes are written to; also used as
            the progress bar label.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Timeout in seconds handed to ``requests.get`` so a stalled
            server cannot hang the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            check happens before the destination file is opened, so an error
            page is never written to disk as if it were the download.
    """
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        # Without a content-length header the size is unknown; passing None
        # (rather than 0) lets tqdm show a running counter instead of a
        # meaningless 0-byte total.
        content_length = response.headers.get('content-length')
        total_size = int(content_length) if content_length is not None else None

        with open(destination_path, 'wb') as file, tqdm(
            desc=destination_path,
            total=total_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as progress_bar:
            for data in response.iter_content(chunk_size=chunk_size):
                file.write(data)
                progress_bar.update(len(data))

# Where the sample file is saved locally, and the Dropbox link it comes from.
destination_path = "pico8.psd"
url = "https://www.dropbox.com/s/dkj1dyc2ro7053h/pico8.psd?dl=1"

# Fetch the file, showing a progress bar while it streams to disk.
download_file_with_progress(url=url, destination_path=destination_path)

pico8.psd: 100%|██████████| 55.9k/55.9k [00:00<00:00, 5.20MB/s]


# Prepare the pipeline from the pretrained model

In [None]:
from transformers import AutoTokenizer
import transformers
import torch

# Hugging Face Hub id of the checkpoint used for both tokenizer and pipeline.
model = "meta-llama/Llama-2-7b-chat-hf"

# The tokenizer is loaded on its own because the prompt cells read its
# eos_token_id when calling the pipeline.
tokenizer = AutoTokenizer.from_pretrained(model)

# Text-generation pipeline in half precision; device placement is delegated
# to device_map="auto".
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    device_map="auto",
    torch_dtype=torch.float16,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/770 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/635 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

# The prompt

In [10]:
# Generation settings: sample from the 10 most likely tokens, return one
# sequence, use the tokenizer's end-of-sequence id, max_length of 256.
generation_kwargs = dict(
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=256,
)

sequences = pipeline('Enumerate countries ranked by diversity.\n', **generation_kwargs)

# Each returned item is a dict; print its generated text.
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Result: Enumerate countries ranked by diversity.
List of countries by cultural diversity (UNESCO)
List of countries by linguistic diversity (UNESCO)
List of countries by religious diversity (Pew Research Center)
Retrieved from "https://en.wikipedia.org/w/index.php?title=List_of_countries_by_diversity&oldid=1114306859"


# Fine-tuning with SFTTrainer from trl

In [7]:
# Runs the SFT example script from the cloned trl repository in a separate
# Python process, with 4-bit loading and PEFT enabled via the flags below.
#
# Dataset ids that can be passed to --dataset_name:
# timdettmers/openassistant-guanaco
# roneneldan/TinyStories
# Fraser/mnist-text-small
#
# NOTE(review): this fine-tunes meta-llama/Llama-2-7b-hf, while the pipeline
# built earlier in the notebook uses meta-llama/Llama-2-7b-chat-hf — confirm
# the base/chat mismatch is intended.
# NOTE(review): the recorded output of this cell is a traceback raised while
# loading checkpoint shards, so the saved run did not complete training.

!python trl/examples/scripts/sft_trainer.py \
    --model_name meta-llama/Llama-2-7b-hf \
    --dataset_name Fraser/mnist-text-small \
    --load_in_4bit \
    --use_peft \
    --batch_size 4 \
    --gradient_accumulation_steps 2


Loading checkpoint shards:   0% 0/2 [00:04<?, ?it/s]
Traceback (most recent call last):
  File "/content/trl/examples/scripts/sft_trainer.py", line 80, in <module>
    model = AutoModelForCausalLM.from_pretrained(
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py", line 493, in from_pretrained
    return model_class.from_pretrained(
  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 2903, in from_pretrained
    ) = cls._load_pretrained_model(
  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 3260, in _load_pretrained_model
    new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 725, in _load_state_dict_into_meta_model
    set_module_quantized_tensor_to_device(
  File "/usr/local/lib/python3.10/dist-packages/transformers/utils/bitsandbytes.py", line 99, in set_mo

# Prompt after

In [9]:
# NOTE(review): `pipeline` is the object built from the pretrained checkpoint
# earlier in the notebook; no cell rebuilds it after the fine-tuning step, so
# this appears to query the same weights as the first prompt cell — confirm.
generation_kwargs = dict(
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=256,
)

# Same prompt as the earlier prompt cell, so the two outputs can be compared.
sequences = pipeline('Enumerate countries ranked by diversity.\n', **generation_kwargs)

for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Result: Enumerate countries ranked by diversity.
1. Colombia - with more than 1,800 ethnic groups, Colombia has the highest variety of ethnic groups in the world.
2. Brazil - Brazil, the fifth most populous country in the world, has more than 300 ethnic groups.
3. Papua New Guinea - home to about 8% of the world's languages, which include more than 800 indigenous languages.
4. Indonesia - Indonesia has 583 ethnic groups.
5. Nigeria - Nigeria is home to about 250 ethnic groups.
