# Load a Trained Text-based Small Language Model

In [2]:
import os
import torch
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline)
from datasets import load_dataset
import pandas as pd
# from peft import AutoPeftModelForCausalLM, PeftModel
import re
from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

print(f"Transformers Version: {transformers.__version__}")
print(f"Torch Version: {torch.__version__}")
if torch.cuda.is_available():
    print(f"{torch.cuda.device_count()} CUDA ({torch.version.cuda}) device available")
else:
    print("No CUDA device available")

2025-03-29 14:13:29,965	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
No module named 'vllm._version'
  from vllm.version import __version__ as VLLM_VERSION


Transformers Version: 4.47.1
Torch Version: 2.4.0+cu121
4 CUDA (12.1) device available


In [3]:
# lora_adapters_dir = "/home/azureuser/localfiles/models/frosty_apple_5d9xqfl2lw"
lora_adapters_dir = "/home/azureuser/localfiles/models/grpo"
# merged_model_folder = "/home/azureuser/localfiles/models/frosty_apple_5d9xqfl2lw_merged"
merged_model_folder = '/mnt/compatibility_checkpoints/actor/global_step_224'
output_string = 'compatibility_text_based'
sft_test_file = f"{output_string}_sft_test_2894.jsonl"
data_dir = "/home/azureuser/localfiles/data/polyvore_cp"

save_model = False
use_lora_adapter = False
base_model = "microsoft/Phi-3.5-mini-instruct"
# base_model = "microsoft/Phi-3-mini-128k-instruct"

In [4]:
if save_model:

    # Load the base model
    # Determine if GPU is available
    if torch.cuda.is_available():
        device_map = 'cuda'
        torch_dtype = torch.float16
    else:
        device_map = 'cpu'
        torch_dtype = torch.bfloat16

    model_kwargs = {
        "use_cache": False,
        "trust_remote_code": True,
        "attn_implementation": "flash_attention_2",
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
    }
    base_model = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)
    # tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(lora_adapters_dir)

    model = PeftModel.from_pretrained(base_model, lora_adapters_dir)
    model = model.merge_and_unload()

    model_size = sum(t.numel() for t in model.parameters())
    print(f"Merged Phi-3 model size: {model_size/1000**2:.1f}M parameters")

    # saving merged model for vLLM inference
    model.save_pretrained(merged_model_folder)
    tokenizer.save_pretrained(merged_model_folder)

In [5]:
if use_lora_adapter:
    print(f"Loading base model: {base_model}")
    vllm_model = LLM(model=base_model, enable_lora=True)
else:
    print(f"Loading merged/full model from {merged_model_folder}")
    vllm_model = LLM(model=merged_model_folder)


Loading merged/full model from /mnt/compatibility_checkpoints/actor/global_step_224
INFO 03-29 14:13:47 config.py:1670] Downcasting torch.float32 to torch.float16.


INFO 03-29 14:13:51 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='/mnt/compatibility_checkpoints/actor/global_step_224', speculative_config=None, tokenizer='/mnt/compatibility_checkpoints/actor/global_step_224', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/mnt/compatibility_checkpoints/actor/global_step_224, use_v2_block_ma

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 03-29 14:14:06 model_runner.py:1060] Starting to load model /mnt/compatibility_checkpoints/actor/global_step_224...
INFO 03-29 14:14:06 selector.py:247] Cannot use FlashAttention-2 backend due to sliding window.
INFO 03-29 14:14:06 selector.py:115] Using XFormers backend.


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:00<00:00,  2.17it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:01<00:00,  1.94it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.79it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.86it/s]



INFO 03-29 14:14:09 model_runner.py:1071] Loading model weights took 7.1659 GB
INFO 03-29 14:14:15 gpu_executor.py:122] # GPU blocks: 8999, # CPU blocks: 682
INFO 03-29 14:14:15 gpu_executor.py:126] Maximum concurrency for 131072 tokens per request: 1.10x
INFO 03-29 14:14:17 model_runner.py:1402] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-29 14:14:17 model_runner.py:1406] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 03-29 14:14:30 model_runner.py:1530] Graph capturing finished in 12 secs.


In [6]:
def vllm_inference(messages, vllm_model, **kwargs):
    max_new_tokens = kwargs.get('max_new_tokens', 512)
    temperature = kwargs.get('temperature', 0.0)
    adapter_path = kwargs.get('adapter_path', None)

    sampling_params = SamplingParams(max_tokens=max_new_tokens, temperature=temperature)

    if type(messages) is not list:
        messages = [messages]
    if adapter_path is not None:
        # https://docs.vllm.ai/en/latest/features/lora.html
        outputs = vllm_model.chat(messages, sampling_params,
                                  lora_request=LoRARequest("adapter", 1, lora_path=adapter_path))
    else:
        outputs = vllm_model.chat(messages, sampling_params)
    results = [o.outputs[0].text for o in outputs]
    return results

# Test
# sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# prompts = ["Hello, my name is", "The future of AI is"]
# outputs = vllm_model.generate(prompts, sampling_params)
# for output in outputs:
#     print(f"Prompt: {output.prompt}, Generated text: {output.outputs[0].text}")

def inference(messages, model, tokenizer, **kwargs):
    # messages = [
    #     {"role": "system", "content": "You are a helpful AI assistant."},
    #     {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    #     {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    #     {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
    # ]
    # device = torch.device('cuda:0')
    # model.to(device)
    max_new_tokens = kwargs.get('max_new_tokens', 512)
    temperature = kwargs.get('temperature', 0.0)
    # https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/base.py
    # vi ~/anaconda3/envs/transformer-4-44-2-v2/lib/python3.10/site-packages/transformers/pipelines/base.py
    # Line # 975, commented, Abir
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

    generation_args = {
        "max_new_tokens": max_new_tokens,
        "return_full_text": False,
        "temperature": temperature,
        "do_sample": False,
    }
    if type(messages) is not list:
        messages = [messages]
    output = pipe(messages, **generation_args)
    return output[0]['generated_text']


In [7]:
data_files = {"test": os.path.join(data_dir, sft_test_file)}
test_dataset = load_dataset("json", data_files=data_files)
test_dataset

Generating test split: 2894 examples [00:00, 55499.85 examples/s]


DatasetDict({
    test: Dataset({
        features: ['item_ids_original', 'item_ids_mapped', 'split', 'num_items', 'answer', 'messages', 'num_tokens'],
        num_rows: 2894
    })
})

## Batch Inference (vLLM)

In [8]:
import pandas as pd

# index = 0
# example = test_dataset['test'][index]
# res = inference(example['messages'][0], model, tokenizer)
# we need to pass a list of lists, a singe list will be regarded as a single example
prompts = [[example['messages'][0]] for example in test_dataset['test']]
targets = [example['messages'][1]['content'] for example in test_dataset['test']]
print(f"Processing {len(prompts)} prompts ...")

if use_lora_adapter:
    results = vllm_inference(prompts, vllm_model=vllm_model, adapter_path=lora_adapters_dir)
else:
    results = vllm_inference(prompts, vllm_model=vllm_model)
# sampling_params = SamplingParams(max_tokens=256, temperature=0.0)
# outputs = vllm_model.chat(prompts, sampling_params)
# results = [o.outputs[0].text for o in outputs]

Processing 2894 prompts ...


Processed prompts:   6%|▋         | 188/2894 [00:12<02:07, 21.24it/s, est. speed input: 9574.07 toks/s, output: 35.37 toks/s]



Processed prompts: 100%|██████████| 2894/2894 [03:04<00:00, 15.68it/s, est. speed input: 11322.40 toks/s, output: 1157.24 toks/s]


## GPT Judge

In [None]:
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI

token_provider = get_bearer_token_provider(
    DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
)
client = AzureOpenAI(
    api_version="2024-02-15-preview",
    azure_endpoint="https://abir-openai.openai.azure.com/",
    azure_ad_token_provider=token_provider
)

def call_gpt_judge(client, **kwargs):
    """Use GPT to evaluate the model's output against the target."""
    generator_model = kwargs.get("generator_model", None)
    output_length = kwargs.get("output_length", 32)
    temperature = kwargs.get("temperature", 0.0)
    system_query = kwargs.get("system_query", None)
    eval_query = kwargs.get("eval_query", None)

    payload = [{"role":"system", "content": system_query},
                    {"role":"user", "content": eval_query}]

    response = client.chat.completions.create(
                model=generator_model,  # change from engine to model
                messages=payload,
                temperature=temperature,
                max_tokens=output_length,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None)
    return response.choices[0].message.content

system_query_evaluator = """You are a helpful AI assistant that evaluates whether the generated response means `compatible` or `incompatible`.
The text may not say `compatible` or `incompatible` explicitly, but you should infer it based on the content of the text. It may say in terms
of whether the products can be worn together or not, or some other ways that imply compatibility or incompatibility.

Return your answer with a single word: `compatible` or `incompatible`.
Do not provide any additional explanation or text.
"""

# Example of how to call the GPT judge
def return_judge_response(results):
    """Return the GPT judge response for each result."""
    gpt_judge_responses = []
    for i in tqdm(range(len(results))):
        eval_query = f"Evaluate the following generated text for compatibility: {results[i]}"
        try:
            response = call_gpt_judge(client,
                                      generator_model="gpt-4o",
                                      output_length=5,
                                      temperature=0.0,
                                      system_query=system_query_evaluator,
                                      eval_query=eval_query)
            gpt_judge_responses.append(response.strip())
        except Exception as e:
            print(f"Error calling GPT judge: {e}")
            gpt_judge_responses.append("error")
    return gpt_judge_responses


In [20]:
gpt_judge_outputs = return_judge_response(results)
# Print the first 5 outputs for verification
for i in range(5):
    print(f"Generated Text: {results[i]}")
    print(f"GPT Judge Response: {gpt_judge_outputs[i]}")
    print("-" * 50)

Generated Text:  incompatible
GPT Judge Response: incompatible
--------------------------------------------------
Generated Text:  compatible
GPT Judge Response: compatible
--------------------------------------------------
Generated Text:  incompatible
GPT Judge Response: incompatible
--------------------------------------------------
Generated Text:  incompatible. The products in this outfit cannot be worn together due to their contrasting styles, colors, and levels of formality. The casual and bold floral bucket bag clashes with the elegant and glamorous gold tall boots, creating a mismatch in overall look and feel. Additionally, the bold and bold red A-line skirts further add to the clash, making it difficult to create a cohesive and harmonious ensemble. Each item is designed for different occasions and seasons, further highlighting their incompatibility when combined.
GPT Judge Response: incompatible
--------------------------------------------------
Generated Text:  incompatible


In [21]:
df_grpo = pd.DataFrame({'gt': targets, 'predicted': gpt_judge_outputs})
df_grpo['yhat'] = df_grpo['predicted'].apply(lambda x: x.split()[0].strip().strip('.').lower())
df_grpo['gt'] = df_grpo['gt'].apply(lambda x: x.strip().strip('.').lower())
df_grpo['acc'] = df_grpo.apply(lambda row: row['gt'].lower() == row['yhat'].lower(), axis=1)
print(f"Accuracy: {df_grpo['acc'].mean()*100:.2f}")


Accuracy: 69.52


In [22]:
# df_grpo['yhat'].value_counts()
pd.crosstab(df_grpo['gt'], df_grpo['yhat'], rownames=['y'], colnames=['yhat'])

yhat,compatible,incompatible
y,Unnamed: 1_level_1,Unnamed: 2_level_1
compatible,817,496
incompatible,386,1195


In [13]:
def extract_solution(solution_str):
    """Extract the equation from the solution string."""
    answer_pattern = r'<answer>(.*?)</answer>'
    match = re.finditer(answer_pattern, solution_str)
    matches = list(match)
    if matches:
        final_answer = matches[-1].group(1).strip()
    else:
        final_answer = None
    return final_answer


In [15]:
cot_sft_test_file = f"{output_string}_cot_sft_test_2894.jsonl"

data_files_cot = {"test": os.path.join(data_dir, cot_sft_test_file)}
test_dataset_cot = load_dataset("json", data_files=data_files_cot)
print(test_dataset_cot)

prompts_cot = [[example['messages'][0]] for example in test_dataset_cot['test']]
targets_cot = [extract_solution(example['messages'][1]['content']) for example in test_dataset_cot['test']]
print(f"Processing {len(prompts_cot)} prompts ...")

if use_lora_adapter:
    results_cot = vllm_inference(prompts, vllm_model=vllm_model, adapter_path=lora_adapters_dir)
else:
    results_cot = vllm_inference(prompts, vllm_model=vllm_model)
# sampling_params = SamplingParams(max_tokens=512, temperature=0.0)
# outputs_cot = vllm_model.chat(prompts, sampling_params)
# results_cot = [o.outputs[0].text for o in outputs_cot]

Generating test split: 2894 examples [00:00, 87388.88 examples/s]




DatasetDict({
    test: Dataset({
        features: ['item_ids_original', 'item_ids_mapped', 'split', 'num_items', 'messages', 'num_tokens'],
        num_rows: 2894
    })
})
Processing 2894 prompts ...


Processed prompts: 100%|██████████| 2894/2894 [03:05<00:00, 15.62it/s, est. speed input: 11280.54 toks/s, output: 1152.96 toks/s]


In [16]:
df_grpo_cot = pd.DataFrame({'gt': targets_cot, 'predicted': results_cot})
df_grpo_cot['yhat'] = df_grpo_cot['predicted'].apply(lambda x: x.split()[0].strip().strip('.').lower())
df_grpo_cot['gt'] = df_grpo_cot['gt'].apply(lambda x: x.strip().strip('.').lower())

df_grpo_cot['acc'] = df_grpo_cot.apply(lambda row: row['gt'].lower() == row['yhat'].lower(), axis=1)
print(f"Accuracy (CoT prompt): {df_grpo_cot['acc'].mean()*100:.2f}")

Accuracy (CoT prompt): 45.16


In [17]:
pd.crosstab(df_grpo_cot['gt'], df_grpo_cot['yhat'], rownames=['y'], colnames=['yhat'])

yhat,compatible,incompatible,the
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
compatible,480,453,380
incompatible,185,827,569


In [18]:
df_grpo['gt'].value_counts(True)

gt
incompatible    0.546303
compatible      0.453697
Name: proportion, dtype: float64

In [None]:
# model_dir = "/home/azureuser/localfiles/TinyZero/checkpoints/compatibility/cp-phi-3.5/actor/global_step_120"
# device_map = 'cuda'
# model_kwargs = dict(
#     use_cache=False,
#     trust_remote_code=True,
#     attn_implementation="flash_attention_2",  # loading the model with flash-attenstion support
#     torch_dtype=torch.bfloat16,
#     device_map=device_map,
# )
# model = AutoModelForCausalLM.from_pretrained(model_dir, **model_kwargs)