In [None]:
"""
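Setup (run in a terminal before executing the notebook):

python3 -m pip install vllm
python3 -m pip install boto3
python3 -m pip install transformers datasets auto-gptq

Optional: install the MinIO client (mc) to inspect the bucket by hand.
#curl https://dl.min.io/client/mc/release/linux-amd64/mc \
#  --create-dirs \
#  -o $HOME/minio-binaries/mc
#chmod +x $HOME/minio-binaries/mc
#$HOME/minio-binaries/mc --help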
"""

In [None]:
import os
import errno
from boto3 import client

# Connection settings for the MinIO/S3 endpoint plus the model to process.
# Every later cell reads these values back through os.environ.
#
# NOTE(review): the access key and secret are hardcoded here and will be
# committed with the notebook. They look like throw-away test credentials,
# but confirm that, and prefer injecting real ones from outside the notebook
# (workbench environment, `%env`, or a mounted secret).
os.environ.update({
    "s3_host": "http://minio-api-http-model-serving-test.apps.psap.example.com",
    "s3_access_key": "minio",
    "s3_secret_access_key": "minio_1_2_3",
    "s3_bucket": "models",
    "model_name": "granite-3b-code-instruct/granite-3b-code-instruct",
})



In [None]:
def download_model_from_s3(model_name, destination_path):
    """Download every object stored under the ``model_name`` folder of the bucket.

    Connection details are read from the environment variables ``s3_host``,
    ``s3_access_key``, ``s3_secret_access_key`` and ``s3_bucket``.

    Args:
        model_name: Key prefix ("folder") of the model inside the bucket. It is
            used both for listing and for computing local paths. A trailing
            ``/`` is enforced for the listing so that sibling folders sharing
            the same leading characters (e.g. ``<model_name>-gptq/``) are not
            downloaded by accident.
        destination_path: Local directory that receives the files. The layout
            below ``model_name`` is reproduced underneath it; missing
            directories are created.

    Raises:
        FileNotFoundError: If no downloadable object exists under the prefix.
    """
    bucket = os.environ["s3_bucket"]

    # Create S3 client
    # NOTE(review): verify=False turns off TLS certificate verification. It has
    # no effect on a plain http:// endpoint; revisit if the host becomes https.
    s3_client = client(
        's3', endpoint_url=os.environ["s3_host"], aws_access_key_id=os.environ["s3_access_key"],
        aws_secret_access_key=os.environ["s3_secret_access_key"], verify=False
    )

    # Treat model_name as a folder: "a/b" must not match "a/b-gptq/...".
    folder = model_name.rstrip('/')
    prefix = f"{folder}/" if folder else ""

    # A single list call returns only the first page of keys, so walk all
    # pages through the paginator.
    paginator = s3_client.get_paginator('list_objects_v2')
    downloaded = 0
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            key = obj['Key']
            relative_key = key[len(prefix):]
            # Skip the folder placeholder itself and any "directory" marker keys.
            if not relative_key or key.endswith('/'):
                continue
            local_file_name = os.path.join(destination_path, *relative_key.split('/'))
            # exist_ok=True makes this safe against concurrent creation.
            os.makedirs(os.path.dirname(local_file_name) or '.', exist_ok=True)
            s3_client.download_file(bucket, key, local_file_name)
            downloaded += 1

    if downloaded == 0:
        raise FileNotFoundError(
            f"No objects found in bucket '{bucket}' under prefix '{prefix}'."
        )

    print('Model downloaded successfully from S3.')

def upload_model_to_s3(source_path, destination_prefix):
    """Upload every file below ``source_path`` to the bucket under ``destination_prefix``.

    Connection details are read from the environment variables ``s3_host``,
    ``s3_access_key``, ``s3_secret_access_key`` and ``s3_bucket``.

    Args:
        source_path: Local directory to upload. It is walked recursively.
        destination_prefix: Key prefix inside the bucket. Each file is stored as
            ``<destination_prefix>/<path relative to source_path>``, so files
            with the same name in different subdirectories do not overwrite
            each other.

    Raises:
        FileNotFoundError: If ``source_path`` is not an existing directory
            (os.walk would otherwise yield nothing and the upload would
            silently "succeed" with zero files).
    """
    if not os.path.isdir(source_path):
        raise FileNotFoundError(f"Source directory '{source_path}' does not exist.")

    bucket = os.environ["s3_bucket"]
    # NOTE(review): verify=False turns off TLS certificate verification. It has
    # no effect on a plain http:// endpoint; revisit if the host becomes https.
    s3_client = client(
        's3', endpoint_url=os.environ["s3_host"], aws_access_key_id=os.environ["s3_access_key"],
        aws_secret_access_key=os.environ["s3_secret_access_key"], verify=False
    )
    print(source_path)
    for root, _dirs, files in os.walk(source_path):
        for file in files:
            print(f"Uploading: '{file}'")
            file_path = os.path.join(root, file)
            # Keep the sub-directory structure in the object key; S3 keys
            # always use forward slashes regardless of the local separator.
            relative_key = os.path.relpath(file_path, source_path).replace(os.sep, '/')
            s3_client.upload_file(file_path, bucket, f"{destination_prefix}/{relative_key}")

    print(f"Quantized model uploaded to MinIO bucket as '{destination_prefix}'.")


In [None]:
# Fetch the model from the bucket into a local "<bucket>/<model name>" directory.
model = os.environ["model_name"]
path = "/".join((os.environ["s3_bucket"], model))
download_model_from_s3(model, path)

In [None]:
def quantize_gpu_model(model_path:str, compress_model_path: str, ds: str, seed: int = 42):
    """Quantize a causal language model with GPTQ and save the result.

    The calibration data is taken from the ``train_sft`` split of the dataset
    named by ``ds``: up to 512 shuffled conversations are rendered with the
    tokenizer's chat template and tokenized (truncated to 512 tokens). The
    quantized model and the tokenizer are written to ``f"{model_path}-gptq"``.

    Args:
        model_path: Local directory (or model id) to load the model and
            tokenizer from.
        compress_model_path: Currently unused. It is only referenced by the
            commented-out Marlin conversion at the end of the function.
        ds: Name of the calibration dataset passed to ``load_dataset``. Each
            record must have a ``messages`` field.
        seed: Seed for shuffling the calibration data, so that repeated runs
            pick the same examples.
    """
    # Quantizing an LLM
    # The heavy third-party imports stay function-local, as in the original,
    # so the cell defining this function runs without them being imported.
    from transformers import AutoTokenizer
    from datasets import load_dataset

    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

    MAX_SEQ_LEN = 512
    NUM_EXAMPLES = 512

    def preprocess(example):
        return {"text": tokenizer.apply_chat_template(example["messages"],
                                                      tokenize=False)}

    print("Loading the dataset and tokenizers")
    dataset = load_dataset(ds, split="train_sft")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Never ask for more rows than the split contains; select() would fail.
    num_examples = min(NUM_EXAMPLES, len(dataset))
    # Separate name: `ds` stays the dataset *name* passed in by the caller.
    calibration_set = dataset.shuffle(seed=seed).select(range(num_examples))
    calibration_set = calibration_set.map(preprocess)

    examples = [
        tokenizer(
            example["text"], padding=False, max_length=MAX_SEQ_LEN,
            truncation=True,
        ) for example in calibration_set
    ]

    print("Loaded the dataset and tokenizers")
    print("Starting the quantization")

    # Apply GPTQ
    # NOTE(review): the original comments here ("Only support 4 bit",
    # "Marlin does not support act_order=True") presumably describe limits of
    # the Marlin conversion, which is disabled below. Re-check bits/group_size
    # before re-enabling that step.
    quantize_config = BaseQuantizeConfig(
        bits=8,                         # 8-bit weights
        group_size=-1,                  # -1 = channelwise; 128 is the usual grouped alternative
        desc_act=False,                 # act_order disabled
        model_file_base_name="model",   # Name of the model.safetensors when we call save_pretrained
    )
    print("Applying GPTQ for quantization")

    model = AutoGPTQForCausalLM.from_pretrained(
        model_path,
        quantize_config,
        device_map="auto")
    model.quantize(examples)

    gptq_save_dir = f"{model_path}-gptq"
    print(f"Saving gptq model to {gptq_save_dir}")
    model.save_pretrained(gptq_save_dir)
    tokenizer.save_pretrained(gptq_save_dir)

    # Convert to Marlin (disabled; this is the only user of compress_model_path)
    #print("Reloading in marlin format")
    #marlin_model = AutoGPTQForCausalLM.from_quantized(
    #    gptq_save_dir,
    #    use_marlin=True,
    #    device_map="auto")

    #print(f"Saving model in marlin format to {compress_model_path}")
    #marlin_model.save_pretrained(compress_model_path)
    #tokenizer.save_pretrained(compress_model_path)

    print("Quantization process completed")



In [None]:
# Quantize the local model copy stored under "<bucket>/<model name>".
dataset_name = "HuggingFaceH4/ultrachat_200k"  # Replace with the name of your dataset
model_path = f'{os.environ["s3_bucket"]}/{os.environ["model_name"]}'
compress_model_path = f"{model_path}/compressed"

quantize_gpu_model(model_path, compress_model_path, dataset_name)

In [None]:
# Upload the local "<bucket>/<model name>-gptq" directory to the bucket,
# using "<model name>-gptq" as the key prefix.
bucket = os.environ["s3_bucket"]
model = os.environ["model_name"]
prefix = model + "-gptq"
path = "/".join((bucket, prefix))
upload_model_to_s3(path, prefix)