<a href="https://colab.research.google.com/github/aj2030/ai_repo/blob/main/exercise_04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install the required packages

In [None]:
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0

Load the access token and log in to Hugging Face

In [2]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import gc

# Fetch the token stored under 'HF_TOKEN' via Colab's `userdata` and use it to
# authenticate this session with the Hugging Face Hub.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

Declare models we need to interact with

In [3]:
# Hub identifiers of the models used in this notebook.
# All of the models below are chat-tuned variants (note 'Instruct' / '-it' in their names).
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
QWEN2 = "Qwen/Qwen2-7B-Instruct" # exercise for you
# MIXTRAL = "mistralai/Mixtral-8x7B-Instruct-v0.1" # If this doesn't fit in your GPU memory, try others from the hub

Interact with models

In [None]:
# Conversation to send to the model, in the role/content chat format.
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Which is considered most dreadful natural disaster of all time?"}
  ]

# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

# Instantiate tokenizer object
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the conversation with the model's chat template.
# add_generation_prompt=True appends the assistant header so the model answers
# the user instead of possibly continuing the user's own turn.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

# Instantiate model object
model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)

# Talk to model now.
# The prompt is a single unpadded sequence, so every position is a real token:
# pass an all-ones attention mask explicitly because pad_token == eos_token
# prevents generate() from inferring the mask on its own.
outputs = model.generate(
    inputs,
    attention_mask=torch.ones_like(inputs),
    max_new_tokens=800,
)
# Decodes the full sequence (prompt + completion), special tokens included.
print(tokenizer.decode(outputs[0]))

# end of the program

In [6]:
# Some cleanup to free resources.
# Drop the notebook-level references to the model and its tensors so they can be
# garbage-collected. pop(..., None) instead of `del` keeps this cell re-runnable:
# it does not raise NameError if a name is already gone or was never created.
for _name in ("model", "inputs", "tokenizer", "outputs"):
    globals().pop(_name, None)
del _name

gc.collect()
# Return the cached, now-unused GPU blocks held by PyTorch's allocator.
torch.cuda.empty_cache()

In [11]:
# An API for message generation
def chat(model_name, messages, max_new_tokens=800):
  """Load a 4-bit quantized chat model, answer `messages`, and print the result.

  Args:
    model_name: Hugging Face Hub id of the model to load.
    messages: Conversation as a list of {"role": ..., "content": ...} dicts,
      rendered with the tokenizer's chat template.
    max_new_tokens: Upper bound on the number of generated tokens
      (defaults to 800).

  Returns:
    None. The decoded sequence (prompt plus completion) is printed.

  Whatever was loaded is released again before returning, including when
  loading or generation raises.
  """
  quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4")

  # Pre-bind so the cleanup in `finally` works no matter where a failure occurs.
  tokenizer = inputs = model = outputs = None
  try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Tokenize the conversation; add_generation_prompt=True appends the
    # assistant header so the model replies rather than extending the user turn.
    inputs = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt",
    ).to("cuda")

    # Instantiate model object
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quant_config)

    # Single unpadded prompt -> all positions are real tokens; pass the mask
    # explicitly because pad_token == eos_token prevents generate() from
    # inferring it.
    outputs = model.generate(
      inputs,
      attention_mask=torch.ones_like(inputs),
      max_new_tokens=max_new_tokens,
    )
    print(tokenizer.decode(outputs[0]))
  finally:
    # Cleanup: drop local references, then release collected objects and
    # PyTorch's cached GPU blocks.
    del model, inputs, tokenizer, outputs
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Start the chat
# NOTE(review): the system turn is left disabled for this call — presumably
# because the Gemma chat template does not accept a "system" role; confirm
# before re-enabling it.
messages = [
    # {"role": "system", "content": "You are a helpful assistant"},
    {
        "role": "user",
        "content": "Which is considered most dreadful natural disaster of all time?",
    },
]

chat(GEMMA2, messages)