# Testing for LLM APIs

Black-box
- GPT 3.5 Turbo
- Gemini 2.5 Flash
- Claude Sonnet 4.5

White-box
- Llama 3.1 8B
- Vicuna 13B
- DeepSeek 3.1
- Mistral 7B


# Black-Box

## GPT 3.5 Turbo

In [None]:
import json

# Location of the GPT 3.5 configuration file, relative to this notebook.
config_path = "../../model_configs/gpt3.5_config.json"

# Parse the configuration; the context manager closes the file handle even if
# decoding fails.
with open(config_path, mode="r", encoding="utf-8") as config_file:
    config = json.loads(config_file.read())

# Sanity check: show only the top-level section names. The values are not
# printed (the recorded output lists an `api_key_info` section, which
# presumably holds credentials -- keep it out of the notebook output).
print(config.keys())

dict_keys(['model_info', 'api_key_info', 'params'])


In [None]:
import sys
import time

# Make the project package importable from this notebook. The membership check
# keeps sys.path free of duplicate entries when the cell is re-run.
REPO_ROOT = '/mnt/ssd/TSF/grow'
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

from src.utils import load_json
from src.models import GPT

# Configuration file for the GPT 3.5 model, relative to this notebook.
CFG = "../../model_configs/gpt3.5_config.json"

if __name__ == "__main__":
    # Smoke test: build the model wrapper from its config, send one trivial
    # prompt, and print the prompt, the response, and the elapsed time.
    cfg = load_json(CFG)
    llm = GPT(cfg)

    # Same wording as the prompt used for the other models in this notebook,
    # so the responses are directly comparable.
    prompt = "What is 1+2? Just answer with a number."

    # perf_counter() is monotonic and high-resolution, so the measured latency
    # cannot be skewed by system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    out = llm.query(prompt)
    dt = time.perf_counter() - t0

    print("=== PROMPT ===")
    print(prompt)
    print("\n=== OUTPUT ===")
    print(out)
    print(f"\n(latency: {dt:.2f}s)")


  from .autonotebook import tqdm as notebook_tqdm


/mnt/ssd/TSF/grow/src/models
=== PROMPT ===
What is 1+2? Just answer with an number.

=== OUTPUT ===
3

(latency: 1.18s)


In [None]:
import sys
import time

# Make the project package importable from this notebook. The membership check
# keeps sys.path free of duplicate entries when the cell is re-run.
REPO_ROOT = '/mnt/ssd/TSF/grow'
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

from src.utils import load_json
from src.models import GPT5

# Configuration file for the GPT5 model wrapper, relative to this notebook.
CFG = "../../model_configs/gpt5_config.json"

if __name__ == "__main__":
    # Smoke test: build the model wrapper from its config, send one trivial
    # prompt, and print the prompt, the response, and the elapsed time.
    cfg = load_json(CFG)
    llm = GPT5(cfg)

    # Same wording as the prompt used for the other models in this notebook,
    # so the responses are directly comparable.
    prompt = "What is 1+2? Just answer with a number."

    # perf_counter() is monotonic and high-resolution, so the measured latency
    # cannot be skewed by system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    out = llm.query(prompt)
    dt = time.perf_counter() - t0

    print("=== PROMPT ===")
    print(prompt)
    print("\n=== OUTPUT ===")
    print(out)
    print(f"\n(latency: {dt:.2f}s)")


=== PROMPT ===
What is 1+2? Just answer with an number.

=== OUTPUT ===
3

(latency: 2.86s)


## Gemini 2.5 Flash

In [None]:
import sys
import time

# Make the project package importable from this notebook. The membership check
# keeps sys.path free of duplicate entries when the cell is re-run.
REPO_ROOT = '/mnt/ssd/TSF/grow'
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

from src.utils import load_json
from src.models import create_model

# Configuration file for the Gemini model, relative to this notebook.
CFG = "../../model_configs/gemini_config.json"

# Smoke test: build the model from its config, send one trivial prompt, and
# print the prompt, the response, and the elapsed time.
cfg = load_json(CFG)
llm = create_model(cfg)

prompt = "What is 1+2? Just answer with a number."

# perf_counter() is monotonic and high-resolution, so the measured latency
# cannot be skewed by system clock adjustments the way time.time() can.
t0 = time.perf_counter()
out = llm.query(prompt)
dt = time.perf_counter() - t0

# A single formatted string per section: passing the header and the value as
# separate print() arguments would insert a separator space after the newline
# and indent the prompt/response by one character.
print(f"=== PROMPT ===\n{prompt}")
print(f"\n=== OUTPUT ===\n{out}")
print(f"\n(latency: {dt:.2f}s)")

  from .autonotebook import tqdm as notebook_tqdm


/mnt/ssd/TSF/grow/src/models
=== PROMPT ===
 What is 1+2? Just answer with a number.

=== OUTPUT ===
 3

(latency: 1.25s)


## Claude 3.7 Sonnet

In [None]:
import sys
import time

# Make the project package importable from this notebook. The membership check
# keeps sys.path free of duplicate entries when the cell is re-run.
REPO_ROOT = '/mnt/ssd/TSF/grow'
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

from src.utils import load_json
from src.models import create_model

# Configuration file for the Claude model, relative to this notebook.
CFG = "../../model_configs/claude_config.json"

# Smoke test: build the model from its config, send one trivial prompt, and
# print the prompt, the response, and the elapsed time.
cfg = load_json(CFG)
llm = create_model(cfg)

prompt = "What is 1+2? Just answer with a number."

# perf_counter() is monotonic and high-resolution, so the measured latency
# cannot be skewed by system clock adjustments the way time.time() can.
t0 = time.perf_counter()
out = llm.query(prompt)
dt = time.perf_counter() - t0

# A single formatted string per section: passing the header and the value as
# separate print() arguments would insert a separator space after the newline
# and indent the prompt/response by one character.
print(f"=== PROMPT ===\n{prompt}")
print(f"\n=== OUTPUT ===\n{out}")
print(f"\n(latency: {dt:.2f}s)")

  from .autonotebook import tqdm as notebook_tqdm


/mnt/ssd/TSF/grow/src/models
=== PROMPT ===
 What is 1+2? Just answer with a number.

=== OUTPUT ===
 3

(latency: 1.67s)


# White-Box

## Llama

In [None]:
import sys
import time

# Make the project package importable from this notebook. The membership check
# keeps sys.path free of duplicate entries when the cell is re-run.
REPO_ROOT = '/mnt/ssd/TSF/grow'
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

from src.utils import load_json
from src.models import create_model

# Configuration file for the Llama model, relative to this notebook.
CFG = "../../model_configs/llama_config.json"

# Smoke test: build the model from its config, send one trivial prompt, and
# print the prompt, the response, and the elapsed time.
cfg = load_json(CFG)
llm = create_model(cfg)

prompt = "What is 1+2? Just answer with a number."

# Only the query is timed; model construction above (which, per the recorded
# output, loads checkpoint shards) is excluded. perf_counter() is monotonic
# and high-resolution, unlike time.time().
t0 = time.perf_counter()
out = llm.query(prompt)
dt = time.perf_counter() - t0

# A single formatted string per section: passing the header and the value as
# separate print() arguments would insert a separator space after the newline
# and indent the prompt/response by one character.
print(f"=== PROMPT ===\n{prompt}")
print(f"\n=== OUTPUT ===\n{out}")
print(f"\n(latency: {dt:.2f}s)")

  from .autonotebook import tqdm as notebook_tqdm


/mnt/ssd/TSF/grow/src/models


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.06s/it]


=== PROMPT ===
 What is 1+2? Just answer with a number.

=== OUTPUT ===
 3

(latency: 0.95s)


## Vicuna 13B v 1.5

In [None]:
import sys
import time

# Make the project package importable from this notebook. The membership check
# keeps sys.path free of duplicate entries when the cell is re-run.
REPO_ROOT = '/mnt/ssd/TSF/grow'
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

from src.utils import load_json
from src.models import create_model

# Configuration file for the Vicuna model, relative to this notebook.
CFG = "../../model_configs/vicuna_config.json"

# Smoke test: build the model from its config, send one trivial prompt, and
# print the prompt, the response, and the elapsed time.
cfg = load_json(CFG)
llm = create_model(cfg)

prompt = "What is 1+2? Just answer with a number."

# Only the query is timed; model construction above (which, per the recorded
# output, loads checkpoint shards) is excluded. perf_counter() is monotonic
# and high-resolution, unlike time.time().
t0 = time.perf_counter()
out = llm.query(prompt)
dt = time.perf_counter() - t0

# A single formatted string per section: passing the header and the value as
# separate print() arguments would insert a separator space after the newline
# and indent the prompt/response by one character.
print(f"=== PROMPT ===\n{prompt}")
print(f"\n=== OUTPUT ===\n{out}")
print(f"\n(latency: {dt:.2f}s)")

  from .autonotebook import tqdm as notebook_tqdm


/mnt/ssd/TSF/grow/src/models


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 22.17it/s]
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


=== PROMPT ===
 What is 1+2? Just answer with a number.

=== OUTPUT ===
 1+2 is equal to 3.

(latency: 1.38s)


## Mistral

In [None]:
import sys
import time

# Make the project package importable from this notebook. The membership check
# keeps sys.path free of duplicate entries when the cell is re-run.
REPO_ROOT = '/mnt/ssd/TSF/grow'
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

from src.utils import load_json
from src.models import create_model

# Configuration file for the Mistral model, relative to this notebook.
CFG = "../../model_configs/mistral_config.json"

# Smoke test: build the model from its config, send one trivial prompt, and
# print the prompt, the response, and the elapsed time.
cfg = load_json(CFG)
llm = create_model(cfg)

prompt = "What is 1+2? Just answer with a number."

# Only the query is timed; model construction above (which, per the recorded
# output, loads checkpoint shards) is excluded. perf_counter() is monotonic
# and high-resolution, unlike time.time().
t0 = time.perf_counter()
out = llm.query(prompt)
dt = time.perf_counter() - t0

# A single formatted string per section: passing the header and the value as
# separate print() arguments would insert a separator space after the newline
# and indent the prompt/response by one character.
print(f"=== PROMPT ===\n{prompt}")
print(f"\n=== OUTPUT ===\n{out}")
print(f"\n(latency: {dt:.2f}s)")

  from .autonotebook import tqdm as notebook_tqdm


/mnt/ssd/TSF/grow/src/models


Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.17s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  2.79it/s]
Some parameters are on the meta device because they were offloaded to the cpu.


=== PROMPT ===
 What is 1+2? Just answer with a number.

=== OUTPUT ===
 3

(latency: 6.72s)
