# Play with LLMs

## Imports

In [None]:
# Install / upgrade the libraries used by the Hugging Face playground below.
# Nothing is version-pinned and two packages are installed from their git
# main branch, so results may differ from one run to the next.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git

!pip install -q -U optimum
!pip install -q -U git+https://github.com/huggingface/accelerate.git

# Optional installs, currently disabled.
# NOTE(review): `langchain` is imported by the LangChain cell further down;
# re-enable its install line if that cell fails with an ImportError.
#!pip install -q -U langchain
#!pip install -q -U ctransformers[cuda]
#!pip install sentence-transformers


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


## Playground Hugging-Face

Correspondance entre les noms de modèles et leur identifiant (« adresse ») sur Hugging Face

In [None]:
# Short model alias -> Hugging Face Hub repository id.
# The aliases are the values offered by the `model_name` selector.
models_id = {
    # ---------------- Mistral-based ----------------
    "mistral7b_instruct": "mistralai/Mistral-7B-Instruct-v0.1",
    "mistral7b_orca": "Open-Orca/Mistral-7B-OpenOrca",
    "zephyr7b": "HuggingFaceH4/zephyr-7b-beta",
    "vigostral7b": "bofenghuang/vigostral-7b-chat",
    # Disabled: "mistral7b_original" -> "mistralai/Mistral-7B-v0.1"

    # ---------------- Llama-based ----------------
    "llama2-chat7b": "meta-llama/Llama-2-7b-chat-hf",
    "llama2-chat13b": "meta-llama/Llama-2-13b-chat-hf",
    "vigogne7b": "bofenghuang/vigogne-2-7b-chat",
    "vigogne7b_instruct": "bofenghuang/vigogne-2-7b-instruct",  # license is OK
    "wizard7b_math": "WizardLM/WizardMath-7B-V1.0",
    "wizard13b_math": "WizardLM/WizardMath-13B-V1.0",
    "wizard15b_coder": "WizardLM/WizardCoder-15B-V1.0",
    "wizard34b_coder": "WizardLM/WizardCoder-Python-34B-V1.0",

    # ---------------- BigScience BLOOM (7b) ----------------
    "bloom7b": "bigscience/bloom-7b1",

    # Disabled GPT-Neo / GPT-J entries:
    #   "gptNeo_original" -> "EleutherAI/gpt-neo-2.7B"
    #   "gptJ_original"   -> "EleutherAI/gpt-j-6B"
}

Choix du modèle

In [None]:
###### Choose your model with its name ######
# The value must be one of the keys of `models_id`; the trailing `#@param`
# annotation lists the choices (rendered as a drop-down form field on Colab).
model_name = "mistral7b_instruct" #@param ["mistral7b_instruct", "mistral7b_orca", "zephyr7b", "vigostral7b","bloom7b", "vigogne7b_instruct", "llama2-chat7b", "llama2-chat13b", "vigogne7b","wizard7b_math", "wizard13b_math", "wizard15b_coder", "wizard34b_coder"]


Choix de la configuration pour quantifier le modèle (c.-à-d. réduire son poids en baissant la précision de représentation des flottants)


Pour le moment : seulement 4 bits ; les autres configurations ne devraient pas passer sur Colab et risquent de faire planter la session.

In [None]:
###### Choose your quantization config ######
# Precision used by the model-loading cell, which branches on this exact
# string: "4bits", "8bits", "16bits" or "32bits".
quant_config = "4bits" # @param ["4bits", "8bits", "16bits", "32bits"]


Chargement du modèle

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the model chosen with `model_name` at the precision chosen with
# `quant_config`, then load its tokenizer.
#
# Memory rule of thumb (RAM needed to load the full model, for a model with
# N billion parameters):
#   4 bits  = 1/2 byte per parameter -> about N/2 GB
#   8 bits  = 1 byte per parameter   -> about N GB
#   16 bits = 2 bytes per parameter  -> about 2N GB
#   32 bits = 4 bytes per parameter  -> about 4N GB

# Resolve the short alias to its Hugging Face Hub repository id.
model_id = models_id[model_name]

# Device-agnostic target, used by the branch that places the model explicitly.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if quant_config == "4bits":
    print("Loading model in 4bits")
    # NF4 quantization with bfloat16 compute.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        #bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    # Device placement is deliberately left to the library defaults here
    # (no `device_map`); to be studied.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
    )

elif quant_config == "8bits":
    print("Loading model in 8bits")
    # Go through BitsAndBytesConfig, like the 4-bit branch, instead of the
    # bare `load_in_8bit=True` keyword.
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    # Device placement left to the library defaults, as for 4 bits.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
    )

elif quant_config == "16bits":
    print("Loading model in half-precision")
    # `device_map="auto"` lets the library place the weights, so no `.to(device)`.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
    )

elif quant_config == "32bits":
    print("Loading model in full-precision")
    # Fix: this branch used torch.float16, which silently loaded the model in
    # half precision. Full precision is float32.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float32,
    ).to(device)

else:
    # Fail here rather than with a NameError on `model` in a later cell.
    raise ValueError(
        f"Unknown quant_config {quant_config!r}; "
        'expected one of "4bits", "8bits", "16bits", "32bits".'
    )

# The tokenizer does not depend on the precision, so load it once.
tokenizer = AutoTokenizer.from_pretrained(model_id)


Loading model in 4bits


Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Let's go: basic prompting, with or without an instruction depending on whether the model is a chat model.

In [None]:
# Basic prompting with the model and tokenizer loaded above.
# For a chat model, the instruction sets the context (persona, personality,
# skills, ...) and is followed by the actual request. For a base model only
# the completion-style prompt is used.

###
chat = True
###

instruction = "You are a passionate elementary school teacher. " + \
"You are teaching a class of 20 pupils. " + \
"You love to explain things to childrens with images they understand at their age and relevant examples."

# Prompt : your question, task, ...
prompt = "Write a math exercice around a football with a couple of multiplications."
prompt_no_chat = "Here is a small 3-examples math word problem for children aged 8 years old on basic multiplication with a football theme/story to hook them : "

# Fix: the instruction and the prompt were concatenated without a separator,
# which produced "...relevant examples.Write a math exercice...".
text_input = f"{instruction} {prompt}" if chat else prompt_no_chat
inputs = tokenizer(text_input, return_tensors="pt").to(model.device)

# Greedy decoding (do_sample=False). The former `top_k=5` / `top_p=10`
# arguments were removed: they only apply when sampling, and `top_p` is a
# cumulative probability that must lie in [0, 1], so 10 was invalid.
# To sample instead, set do_sample=True and add for example:
#   temperature=1.1  (>1 flattens the next-token distribution, so more
#                     diverse output; <1 sharpens it, so less diverse output)
#   top_k=5          (sample only among the 5 most likely tokens)
#   top_p=0.95       (sample only within the top 95% of probability mass)
outputs = model.generate(
    **inputs,
    do_sample=False,
    num_return_sequences=1,
    repetition_penalty=1.5,  # discourages repeated tokens
    eos_token_id=tokenizer.eos_token_id,
    max_length=1024,  # total length cap, passed as max_length
)

# display the generated answer
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

## Trying GGUF models, should be lighter and work well

Résumé des besoins hardware

- https://www.hardware-corner.net/llm-database/Vigogne/

In [2]:
# On colab
# When running on Colab, mount Google Drive and move into the project folder
# so the model files listed by the `!ls` cell below are found with relative
# paths. Set `on_colab = False` to skip this when running elsewhere.
on_colab = True
if on_colab:
  from google.colab import drive
  drive.mount('/content/drive')
  # NOTE(review): this Drive path is hard-coded and specific to one Drive
  # layout; adjust it for your own setup.
  %cd "/content/drive/MyDrive/PIE - MSXS-08/4-LLMs/"

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/15XKDUxg701tiQkxuN3G7q6MCvcYEFp44/PIE - MSXS-08/4-LLMs


-  https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF
- https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF


### 1. On télécharge les poids du modèle + la config qui nous intéresse.

Par exemple un mistral7b quantifié

In [None]:
## 1. download model weights locally
# One-off download of a GGUF file into the current directory. The commands
# are left commented out; uncomment them to download again.

#!pip3 install -q huggingface-hub
#!huggingface-cli download TheBloke/Llama-2-7b-Chat-GGUF llama-2-7b-chat.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False



Consider using `hf_transfer` for faster downloads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
downloading https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf to /root/.cache/huggingface/hub/tmpc6ntp7mm
llama-2-7b-chat.Q4_K_M.gguf: 100% 4.08G/4.08G [00:38<00:00, 106MB/s] 
./llama-2-7b-chat.Q4_K_M.gguf


In [3]:
# List the current directory to check which model files are present.
!ls

llama-2-7b-chat.Q4_K_M.gguf  mistral-7b-v0.1.Q4_K_M.gguf  playground.ipynb


On charge le modèle avec ctransformers, une bibliothèque Python permettant d'utiliser des modèles au format GGML/GGUF, normalement utilisés depuis du C/C++.

In [1]:
# 2. load the downloaded model with ctransformers
# Install ctransformers first. Only the CPU build is installed here.
#Base ctransformers with no GPU acceleration
!pip install -q ctransformers
##Or with CUDA GPU acceleration
# (left disabled; note the line below lacks the leading `!` a notebook shell
# command needs)
#pip install ctransformers[cuda]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from ctransformers import AutoModelForCausalLM

# `gpu_layers` is the number of layers offloaded to the GPU; keep it at 0
# when no GPU acceleration is available on the system.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-v0.1-GGUF",
    model_type="mistral",
    model_file="mistral-7b-v0.1.Q4_K_M.gguf",
    gpu_layers=0,
)

# Demo: stream a short completion and print each chunk as it arrives.
demo_stream = llm("AI is going to", stream=True)
for chunk in demo_stream:
    print(chunk, end="", flush=True)


On génère du texte. Je vais voir pour rajouter des paramètres de génération afin de mieux la contrôler.

In [5]:
prompt_no_chat = "Here is a math word problem for children learning basic multiplications with a football theme/story to hook them : "

# Stream the completion chunk by chunk. For readability on Colab, a line
# break is printed after every 10th chunk.
tok = 0
for tok, text in enumerate(llm(prompt_no_chat, stream=True), start=1):
    print(text, end="", flush=True)
    if not tok % 10:
        print("\n")


2 players are playing football . One scores two goals

 and the other scores three goals. How many goals

 were scored? The answer is 5 goals and

 it’s so easy to make and write!



This is an example of the problems found in

 a Football Math Workbook for children learning the basic

 multiplications.

There are over 7

0 word problems in this workbook. All of

 them are related to football with fun illustrations to

 keep kids interested. Children will learn multiplication while

 they improve their reading comprehension skills. The answers

 are at the end of each page, so you

 can check your child’s work and give instant feedback

 on his or her performance.

We do

 sell this book in the shop as a paperback

 or PDF copy which means you have immediate access to

 it. You can also read a sample of some

 pages for free from our blog posts . We do

 offer a 30 day money back guarantee if

 you are not satisfied with your purchase.

### llama2 7b int4:

In [6]:
from ctransformers import AutoModelForCausalLM

# Loading options for the 4-bit Llama-2 7B chat GGUF file.
# `gpu_layers` is the number of layers offloaded to the GPU; keep it at 0
# when no GPU acceleration is available on the system.
llama2_gguf_options = {
    "model_file": "llama-2-7b-chat.Q4_K_M.gguf",
    "model_type": "llama",
    "gpu_layers": 0,
}

# Replaces the previously loaded `llm`.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7b-Chat-GGUF",
    **llama2_gguf_options,
)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

(…)2882fb562ffccdd1cf0f65402adb/config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

llama-2-7b-chat.Q4_K_M.gguf:   0%|          | 0.00/4.08G [00:00<?, ?B/s]

In [7]:
prompt_no_chat = "Here is a small 3-examples math word problem for children aged 8 years old on basic multiplication with a football theme/story to hook them : "

# Stream the completion and print each chunk as soon as it is produced.
completion_stream = llm(prompt_no_chat, stream=True)
for text in completion_stream:
    print(text, end="", flush=True)



Example 1: Sarah has 5 footballs and she wants to know how many she will have if she buys 3 more. Can you help her calculate the answer?
Example 2: Tom has 8 players on his football team, and each player needs 4 water bottles. How many water bottles does Tom need in total?
Example 3: If a football field is 100 yards long, and a player runs 50 yards in one direction, how far does the player run in total?

Answer key for above example:
For Example 1: Sarah will have 8 footballs if she buys 3 more.
For Example 2: Tom needs 32 water bottles in total.
For Example 3: The player runs a total of 100 yards.

### llama2-13b-chat int4

- https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF

In [None]:
# Does not work for some strange reason, although the addresses look correct.
# NOTE(review): the file name below spells the quantization suffix with a
# lowercase `q` (`q4_K_M`), whereas the 7B download that succeeded earlier in
# this notebook uses an uppercase `Q` (`Q4_K_M`). This is possibly the cause;
# confirm against the repository's file list.
#!huggingface-cli download TheBloke/Llama-2-13B-chat-GGUF llama-2-13b-chat.q4_K_M.gguf --local-dir . --local-dir-use-symlinks False

Trying with GPTQ instead.

In [None]:
!pip3 install transformers>=4.32.0 optimum>=1.12.0
!pip3 install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  # Use cu117 if on CUDA 11.7


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# GPTQ-quantized Llama-2 13B chat checkpoint.
model_name_or_path = "TheBloke/Llama-2-13B-chat-GPTQ"

# Fast tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# `revision` selects the repository branch to load; change it to use another
# branch than "main". This replaces any previously loaded `model`.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    revision="main",
    trust_remote_code=False,
    device_map="auto",
)




In [7]:
# User request, wrapped below in a chat template: the system message sits
# between <<SYS>> and <</SYS>>, and the whole turn between [INST] and [/INST].
prompt = "Please write a math exercice for 10-year-old children on basic multiplications with small digits with a football theme. I need 4 short examples of multiplications."
prompt_template=f'''[INST] <<SYS>>
You are a passionate and honest elementary school teacher. You are teaching a class of 20 pupils. You love to explain things to childrens with images they understand at their age and relevant examples. Please help with the following task.
<</SYS>>
{prompt}[/INST]

'''

# Sampling settings forwarded to the text-generation pipeline.
generation_settings = {
    "max_new_tokens": 512,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.95,
    "top_k": 40,
    "repetition_penalty": 1.1,
}

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

# The pipeline returns a list of results; print the text of the first one.
completions = pipe(prompt_template)
print(completions[0]['generated_text'])

[INST] <<SYS>>
You are a passionate and honest elementary school teacher. You are teaching a class of 20 pupils. You love to explain things to childrens with images they understand at their age and relevant examples. Please help with the following task.
<</SYS>>
Please write a math exercice for 10-year-old children on basic multiplications with small digits with a football theme. I need 4 short examples of multiplications.[/INST]

Hey there, young mathematicians! Today, we're going to kick off our math lesson with a fun football theme! 🏈👋

Example 1: Score a Goal! 🏆
Imagine you're playing soccer, and you score a goal! If you scored that goal using your left foot, how many times do you think you kicked the ball with your left foot? 🤔

That's right, 3 times! 🙌 And if you kicked the ball 3 times with your left foot, and each kick was 5 feet apart, how far did you kick the ball in total? 🤝

That's 15 feet! 😮 Can you imagine kicking the ball that far? Wow!

Example 2: Pass the Ball! 🏃‍♀️
No

## More advanced prompting: prompt templates using LangChain for chat systems

In [None]:
# Prompt templating with LangChain on top of a transformers pipeline.
# Requires `langchain` (its install line in the first cell is commented out)
# and the `model` / `tokenizer` loaded by an earlier cell.
from transformers import pipeline
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain


# Wrap the already-loaded model in a text-generation pipeline, then expose it
# to LangChain.
hf_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        use_cache=True,
        device_map="auto",
        max_length=500,
        do_sample=True,
        top_k=5,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,  # the EOS token doubles as padding
)
llm = HuggingFacePipeline(pipeline=hf_pipeline)


#### Prompt
# Fixes to the template:
#   - removed the stray, never-closed `"` before "You are";
#   - removed the literal `<s>`: the tokenizer is expected to add the
#     beginning-of-sequence token itself, so spelling it out duplicates it;
#   - removed the `</s>` after `[/INST]`: the end-of-sequence token closes the
#     model's answer, so placing it in the prompt ends the sequence before
#     the model has answered.
template = """[INST] You are a passionate elementary school teacher.
You are teaching a class of 20 pupils.
You love to explain things to childrens with images they understand at their age and relevant examples. Please help with the following task.
{context}
{question} [/INST]"""

question_p = """Writte a math problem around a football story to train them at multiplying small digits together. I need 5 examples."""
context_p = """ You are teahcing a class of a dozen 10 year-old children."""
prompt = PromptTemplate(template=template, input_variables=["question","context"])

# Fill the template with the question and context, then run it.
llm_chain = LLMChain(prompt=prompt, llm=llm)
response = llm_chain.run({"question":question_p,"context":context_p})
response

In [None]:
!pip install awq

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, pipeline

# AWQ-quantized Llama-2 7B chat checkpoint.
model_name_or_path = "TheBloke/Llama-2-7b-Chat-AWQ"

# Load the quantized model and its tokenizer (replaces any previously loaded
# `model` / `tokenizer`).
model = AutoAWQForCausalLM.from_quantized(
    model_name_or_path,
    fuse_layers=True,
    trust_remote_code=False,
    safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)

# User request, wrapped in a chat template: the system message sits between
# <<SYS>> and <</SYS>>, and the whole turn between [INST] and [/INST].
prompt = "Tell me about AI"
prompt_template=f'''[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>
{prompt}[/INST]

'''

# Sampling settings shared by the two inference paths below.
sampling_settings = {
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.95,
    "top_k": 40,
    "max_new_tokens": 512,
}

# --- Path 1: call generate() directly on token ids moved to the GPU ---
print("\n\n*** Generate:")

encoded_prompt = tokenizer(prompt_template, return_tensors='pt')
tokens = encoded_prompt.input_ids.cuda()

generation_output = model.generate(tokens, **sampling_settings)

print("Output: ", tokenizer.decode(generation_output[0]))

# --- Path 2: the same inference through transformers' pipeline ---
print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **sampling_settings,
    repetition_penalty=1.1,
)

print(pipe(prompt_template)[0]['generated_text'])


## Llama.cpp

In [None]:
# llama.cpp from the command line, left disabled: clone the repository,
# build it with `make`, then run its `main` binary on a GGUF model file.
# NOTE(review): the model path below is not created anywhere in this
# notebook; point `-m` at an existing .gguf file before enabling.
#!git clone https://github.com/ggerganov/llama.cpp.git
#!(cd llama.cpp; make)
#!llama.cpp/main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e

