In [2]:
# Authenticate with the Hugging Face Hub: access to the model must be granted
# on Hugging Face first, and the access token is read from the environment.
import os

from dotenv import load_dotenv
from huggingface_hub import login

# Load environment variables from the .env file (override=True: values from
# .env replace variables already present in the process environment).
load_dotenv(override=True)

# Use .get() plus an explicit check so a missing or empty token fails with an
# actionable message instead of a bare KeyError or a confusing login failure.
hf_access_token = os.environ.get("hf_access_token")
if not hf_access_token:
    raise RuntimeError(
        "Hugging Face token not found: set 'hf_access_token' in your .env file "
        "or in the environment before running this cell."
    )

login(token=hf_access_token)

# Download Gemma 2 2B IT Model

In [1]:
# Prerequisite: %pip install accelerate  (required by transformers when loading with device_map="auto")
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [4]:
# Hub repository id of the Gemma 2 2B IT checkpoint used throughout this section.
MODEL_ID = "google/gemma-2-2b-it"

# Fetch the tokenizer and the causal-LM weights for the same checkpoint;
# device_map="auto" leaves device placement to the library.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Quick smoke test: generate a short completion for a single prompt.
input_text = "Write me a poem about Machine Learning."

# Tokenize to PyTorch tensors and move them to the model's device. The model
# was loaded with device_map="auto", so it may live on an accelerator while
# freshly tokenized tensors start on the CPU.
# Note: despite its name, `input_ids` holds the tokenizer's full output
# mapping, which is why it is unpacked with ** below.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

# Cap the completion at 32 newly generated tokens.
outputs = model.generate(**input_ids, max_new_tokens=32)

# Decode the first (and only) generated sequence and print it.
print(tokenizer.decode(outputs[0]))

The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


<bos>Write me a poem about Machine Learning.

A tapestry of data, woven tight,
A million threads of knowledge, shining bright.
Algorithms dance, a rhythmic sway,
Learning patterns, come


In [6]:
# Destination for the locally saved model, written relative to the user's
# home directory and then expanded to a concrete path.
model_dir_template = "~/llm-models/gemma2_2b_model_transformer"
target_path = os.path.expanduser(model_dir_template)

# Make sure the destination exists; exist_ok=True keeps this cell re-runnable.
os.makedirs(target_path, exist_ok=True)

In [7]:
# Persist both artifacts side by side in target_path: the tokenizer first,
# then the model.
for artifact in (tokenizer, model):
    artifact.save_pretrained(target_path)

# Convert to BIN

In [1]:
import mediapipe as mp
from mediapipe.tasks.python.genai import converter

In [2]:
# Settings for the unquantized conversion. All locations hang off one root so
# the directory layout is stated once.
STUDIO_ROOT = '/teamspace/studios/this_studio'

# Directory holding the saved checkpoint, and the format of its weight files.
INPUT_CKPT = STUDIO_ROOT + '/llm-models/gemma2_2b_model_transformer'
CKPT_FORMAT = 'safetensors'

# NOTE(review): the checkpoint saved earlier in this notebook is
# google/gemma-2-2b-it (Gemma 2), while the model type below is 'GEMMA_2B' --
# confirm this is the intended value for the installed converter version.
MODEL_TYPE = 'GEMMA_2B'
BACKEND = "gpu"
combine_file_only = False

# The tokenizer files were saved into the same directory as the checkpoint.
# NOTE(review): this is a directory, not a single tokenizer file -- confirm
# that is what the converter expects for this model type.
VOCAB_MODEL_FILE = INPUT_CKPT

# Output directory (trailing slash kept) and the final .bin inside it.
OUTPUT_DIR = STUDIO_ROOT + '/converted-bin-models/gemma2/'
OUTPUT_TFLITE_FILE = OUTPUT_DIR + 'gemma2.bin'

In [3]:
# Collect the converter arguments by name first, then build the config from
# them in a single call. No quantization options are set for this conversion.
conversion_kwargs = {
    "input_ckpt": INPUT_CKPT,
    "ckpt_format": CKPT_FORMAT,
    "model_type": MODEL_TYPE,
    "backend": BACKEND,
    "output_dir": OUTPUT_DIR,
    "combine_file_only": combine_file_only,
    "vocab_model_file": VOCAB_MODEL_FILE,
    "output_tflite_file": OUTPUT_TFLITE_FILE,
}
config = converter.ConversionConfig(**conversion_kwargs)

In [4]:
# Run the checkpoint conversion described by `config` (the unquantized setup
# built in the preceding cells).
converter.convert_checkpoint(config)

I0000 00:00:1734981180.633455    2171 model_ckpt_util.cc:337] Inserting empty buffer to TfLite.


# Converting to Quantized Model

In [5]:
# Settings for the quantized conversion: same input checkpoint as the
# unquantized conversion, but a separate output directory and file name.
STUDIO_ROOT = '/teamspace/studios/this_studio'

# Directory holding the saved checkpoint, and the format of its weight files.
INPUT_CKPT = STUDIO_ROOT + '/llm-models/gemma2_2b_model_transformer'
CKPT_FORMAT = 'safetensors'

# NOTE(review): the checkpoint saved earlier in this notebook is
# google/gemma-2-2b-it (Gemma 2), while the model type below is 'GEMMA_2B' --
# confirm this is the intended value for the installed converter version.
MODEL_TYPE = 'GEMMA_2B'
BACKEND = "gpu"
combine_file_only = False

# The tokenizer files were saved into the same directory as the checkpoint.
VOCAB_MODEL_FILE = INPUT_CKPT

# Output directory (trailing slash kept) and the final .bin inside it.
OUTPUT_DIR = STUDIO_ROOT + '/converted-bin-models/gemma2-quantized/'
OUTPUT_TFLITE_FILE = OUTPUT_DIR + 'gemma2_2b_4bq.bin'

In [6]:
# Arguments shared with the unquantized conversion: where to read the
# checkpoint and tokenizer from, and where to write the result.
base_kwargs = {
    "input_ckpt": INPUT_CKPT,
    "ckpt_format": CKPT_FORMAT,
    "model_type": MODEL_TYPE,
    "backend": BACKEND,
    "output_dir": OUTPUT_DIR,
    "combine_file_only": combine_file_only,
    "vocab_model_file": VOCAB_MODEL_FILE,
    "output_tflite_file": OUTPUT_TFLITE_FILE,
}

# Bit widths requested per component: 4 bits for the attention and
# feed-forward settings, 8 bits for the embedding setting.
quantization_kwargs = {
    "attention_quant_bits": 4,
    "feedforward_quant_bits": 4,
    "embedding_quant_bits": 8,
}

config = converter.ConversionConfig(**base_kwargs, **quantization_kwargs)

In [7]:
# Run the checkpoint conversion described by `config` (the quantized setup
# built in the preceding cells).
converter.convert_checkpoint(config)

I0000 00:00:1734982389.981357    2171 model_ckpt_util.cc:337] Inserting empty buffer to TfLite.


# Testing