### Convert to Hugging Face to PEFT Format

In [None]:
from peft import LoraConfig, get_peft_model
import json
from safetensors.torch import save_file, load_file

# Load params.json and convert to PEFT format
with open('/home/ttsai/mulkooo/sj_Trading/experiment/checkpoints/checkpoint_000300/consolidated/params.json', 'r') as f:
    params = json.load(f)

adapter_config = {
    "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
    "bias": "none",
    "peft_type": "LORA",
    "task_type": "CAUSAL_LM",
    "r": params["lora"].get("rank", 32),
    "lora_alpha": params["lora"].get("alpha", 64),
    "lora_dropout": params["lora"].get("dropout", 0.1),
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
}

# Save as adapter_config.json
with open('adapter_config.json', 'w') as f:
    json.dump(adapter_config, f, indent=2)

# Copy converted config file to sj_Trading/experiment/checkpoints/checkpoint_0003000/consolidated

### Try to inference using converted LoRA config

In [None]:
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    enable_lora=True,
    tokenizer_mode="mistral",
    dtype="bfloat16",
    gpu_memory_utilization=0.90,
    max_model_len=2048,
    max_num_seqs=64,
    tensor_parallel_size=2,
)

INFO 06-17 17:28:43 [config.py:823] This model supports multiple tasks: {'generate', 'reward', 'score', 'classify', 'embed'}. Defaulting to 'generate'.
INFO 06-17 17:28:44 [config.py:1946] Defaulting to use mp for distributed inference
INFO 06-17 17:28:44 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 06-17 17:28:44 [core.py:455] Waiting for init message from front-end.
INFO 06-17 17:28:44 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='mistralai/Mistral-7B-Instruct-v0.3', speculative_config=None, tokenizer='mistralai/Mistral-7B-Instruct-v0.3', skip_tokenizer_init=False, tokenizer_mode=mistral, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


[1;36m(VllmWorker rank=0 pid=274721)[0;0m INFO 06-17 17:28:48 [default_loader.py:272] Loading weights took 1.50 seconds
[1;36m(VllmWorker rank=0 pid=274721)[0;0m INFO 06-17 17:28:48 [punica_selector.py:19] Using PunicaWrapperGPU.
[1;36m(VllmWorker rank=0 pid=274721)[0;0m INFO 06-17 17:28:48 [gpu_model_runner.py:1624] Model loading took 6.8228 GiB and 2.438754 seconds
[1;36m(VllmWorker rank=1 pid=274722)[0;0m INFO 06-17 17:28:49 [default_loader.py:272] Loading weights took 1.98 seconds
[1;36m(VllmWorker rank=1 pid=274722)[0;0m INFO 06-17 17:28:49 [punica_selector.py:19] Using PunicaWrapperGPU.
[1;36m(VllmWorker rank=1 pid=274722)[0;0m INFO 06-17 17:28:49 [gpu_model_runner.py:1624] Model loading took 6.8228 GiB and 3.371904 seconds
[1;36m(VllmWorker rank=1 pid=274722)[0;0m INFO 06-17 17:28:55 [backends.py:462] Using cache directory: /home/ttsai/.cache/vllm/torch_compile_cache/022cfb20bf/rank_1_0 for vLLM's torch.compile
[1;36m(VllmWorker rank=1 pid=274722)[0;0m INFO 06-17

[1;36m(VllmWorker rank=1 pid=274722)[0;0m [rank1]:W0617 17:28:56.221000 274722 torch/_inductor/utils.py:1250] [0/0] Not enough SMs to use max_autotune_gemm mode
[1;36m(VllmWorker rank=0 pid=274721)[0;0m [rank0]:W0617 17:28:56.241000 274721 torch/_inductor/utils.py:1250] [0/0] Not enough SMs to use max_autotune_gemm mode


[1;36m(VllmWorker rank=1 pid=274722)[0;0m INFO 06-17 17:28:57 [backends.py:161] Cache the graph of shape None for later use
[1;36m(VllmWorker rank=0 pid=274721)[0;0m INFO 06-17 17:28:57 [backends.py:161] Cache the graph of shape None for later use
[1;36m(VllmWorker rank=1 pid=274722)[0;0m INFO 06-17 17:29:15 [backends.py:173] Compiling a graph for general shape takes 20.19 s
[1;36m(VllmWorker rank=0 pid=274721)[0;0m INFO 06-17 17:29:15 [backends.py:173] Compiling a graph for general shape takes 20.18 s
[1;36m(VllmWorker rank=0 pid=274721)[0;0m INFO 06-17 17:29:39 [monitor.py:34] torch.compile takes 25.74 s in total
[1;36m(VllmWorker rank=1 pid=274722)[0;0m INFO 06-17 17:29:39 [monitor.py:34] torch.compile takes 25.73 s in total
[1;36m(VllmWorker rank=0 pid=274721)[0;0m INFO 06-17 17:29:40 [gpu_worker.py:227] Available KV cache memory: 6.49 GiB
[1;36m(VllmWorker rank=1 pid=274722)[0;0m INFO 06-17 17:29:40 [gpu_worker.py:227] Available KV cache memory: 6.49 GiB
INFO 06-17

[1;36m(VllmWorker rank=1 pid=274722)[0;0m [1;36m(VllmWorker rank=0 pid=274721)[0;0m ERROR 06-17 17:30:29 [multiproc_executor.py:527] WorkerProc hit an exception.
ERROR 06-17 17:30:29 [multiproc_executor.py:527] WorkerProc hit an exception.
[1;36m(VllmWorker rank=0 pid=274721)[0;0m ERROR 06-17 17:30:29 [multiproc_executor.py:527] Traceback (most recent call last):
[1;36m(VllmWorker rank=1 pid=274722)[0;0m [1;36m(VllmWorker rank=0 pid=274721)[0;0m ERROR 06-17 17:30:29 [multiproc_executor.py:527] Traceback (most recent call last):
ERROR 06-17 17:30:29 [multiproc_executor.py:527]   File "/home/ttsai/mulkooo/sj_Trading/.venv/lib/python3.11/site-packages/vllm/v1/executor/multiproc_executor.py", line 522, in worker_busy_loop
[1;36m(VllmWorker rank=0 pid=274721)[0;0m [1;36m(VllmWorker rank=1 pid=274722)[0;0m ERROR 06-17 17:30:29 [multiproc_executor.py:527]     output = func(*args, **kwargs)
[1;36m(VllmWorker rank=0 pid=274721)[0;0m ERROR 06-17 17:30:29 [multiproc_executor.py:52

Process EngineCore_0:
Traceback (most recent call last):
  File "/home/ttsai/miniconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/ttsai/miniconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ttsai/mulkooo/sj_Trading/.venv/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 519, in run_engine_core
    raise e
  File "/home/ttsai/mulkooo/sj_Trading/.venv/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 508, in run_engine_core
    engine_core.run_busy_loop()
  File "/home/ttsai/mulkooo/sj_Trading/.venv/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 535, in run_busy_loop
    self._process_engine_step()
  File "/home/ttsai/mulkooo/sj_Trading/.venv/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 560, in _process_engine_step
    outputs, model_executed = self.step_fn()
                              ^^^^^^^^^^^^^^
  File "/h

In [20]:
# Define sampling parameters
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.95,
    max_tokens=256
)

# Create a LoRA request with the PEFT adapter
lora_request = LoRARequest(
    "financial_adapter", # Just human readable
    1,
    lora_path="/home/ttsai/mulkooo/sj_Trading/adapter"
)

In [21]:
# Genearate with the LoRA adapter
outputs = llm.generate(
    ["Write a financial analysis of Tesla."],
    sampling_params,
    lora_request=lora_request
)

# Print the generated text
for output in outputs:
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text}")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.

In [28]:
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Load base model
base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, "/home/ttsai/mulkooo/sj_Trading/adapter")

# Merge LoRA weights into base model
merged_model = model.merge_and_unload()

# Save merged model
merged_model.save_pretrained("/home/ttsai/mulkooo/sj_Trading/merge")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [39]:
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="/home/ttsai/mulkooo/sj_Trading/merge",
    tokenizer_mode="mistral",
    dtype="bfloat16",
    gpu_memory_utilization=0.90,
    max_model_len=2048,
    max_num_seqs=64,
    tensor_parallel_size=2,
)

INFO 06-17 17:58:31 [config.py:823] This model supports multiple tasks: {'generate', 'reward', 'score', 'classify', 'embed'}. Defaulting to 'generate'.
INFO 06-17 17:58:31 [config.py:3268] Downcasting torch.float32 to torch.bfloat16.
INFO 06-17 17:58:31 [config.py:1946] Defaulting to use mp for distributed inference
INFO 06-17 17:58:31 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 06-17 17:58:33 [__init__.py:244] Automatically detected platform cuda.
INFO 06-17 17:58:35 [core.py:455] Waiting for init message from front-end.
INFO 06-17 17:58:35 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='/home/ttsai/mulkooo/sj_Trading/merge', speculative_config=None, tokenizer='/home/ttsai/mulkooo/sj_Trading/merge', skip_tokenizer_init=False, tokenizer_mode=mistral, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUT

Loading safetensors checkpoint shards:   0% Completed | 0/6 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  17% Completed | 1/6 [00:00<00:02,  1.75it/s]
Loading safetensors checkpoint shards:  33% Completed | 2/6 [00:01<00:02,  1.72it/s]
Loading safetensors checkpoint shards:  50% Completed | 3/6 [00:01<00:01,  1.72it/s]
Loading safetensors checkpoint shards:  67% Completed | 4/6 [00:02<00:01,  1.80it/s]
Loading safetensors checkpoint shards:  83% Completed | 5/6 [00:02<00:00,  1.74it/s]
Loading safetensors checkpoint shards: 100% Completed | 6/6 [00:03<00:00,  1.72it/s]
Loading safetensors checkpoint shards: 100% Completed | 6/6 [00:03<00:00,  1.74it/s]
[1;36m(VllmWorker rank=0 pid=284014)[0;0m 


[1;36m(VllmWorker rank=0 pid=284014)[0;0m INFO 06-17 17:58:42 [default_loader.py:272] Loading weights took 3.49 seconds
[1;36m(VllmWorker rank=0 pid=284014)[0;0m INFO 06-17 17:58:42 [gpu_model_runner.py:1624] Model loading took 6.7584 GiB and 3.569238 seconds
[1;36m(VllmWorker rank=1 pid=284015)[0;0m INFO 06-17 17:58:43 [default_loader.py:272] Loading weights took 3.92 seconds
[1;36m(VllmWorker rank=1 pid=284015)[0;0m INFO 06-17 17:58:43 [gpu_model_runner.py:1624] Model loading took 6.7584 GiB and 4.000864 seconds
[1;36m(VllmWorker rank=1 pid=284015)[0;0m INFO 06-17 17:58:46 [backends.py:462] Using cache directory: /home/ttsai/.cache/vllm/torch_compile_cache/34a6c50b67/rank_1_0 for vLLM's torch.compile
[1;36m(VllmWorker rank=1 pid=284015)[0;0m INFO 06-17 17:58:46 [backends.py:472] Dynamo bytecode transform time: 3.39 s
[1;36m(VllmWorker rank=0 pid=284014)[0;0m INFO 06-17 17:58:46 [backends.py:462] Using cache directory: /home/ttsai/.cache/vllm/torch_compile_cache/34a6c50b

In [38]:
# Define sampling parameters
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.95,
    max_tokens=1024
)

In [40]:
# Genearate with the LoRA adapter
outputs = llm.generate(
    ["Write a financial analysis of Tesla."],
    sampling_params
)

# Print the generated text
for output in outputs:
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text}")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Generated text: 

Tesla, Inc. (TSLA) is an American electric vehicle (EV) and clean energy company that has disrupted the traditional automotive industry with its innovative technology and commitment to sustainability. As of 2021, Tesla is the world's most valuable automaker by market capitalization, and its stock has been notorious for its volatility.

Financial Analysis:

1. Revenue: In 2020, Tesla reported total revenues of $31.5 billion, an increase of 36% compared to 2019. The growth was driven by a 48% increase in vehicle deliveries to 509,738 units, as well as a 45% increase in regulatory credits to $1.2 billion.

2. Gross Margin: Tesla's gross margin improved significantly in 2020, reaching 27.3%, compared to 21.7% in 2019. The improvement was due to a higher mix of Model 3 and Model Y sales, which have higher margins, as well as cost-cutting measures and economies of scale.

3. Operating Expenses: Total operating expenses increased by 24% in 2020, primarily due to higher resea

In [41]:
import gc
import torch

# If using distributed parallelism, import destroy functions
from vllm.distributed.parallel_state import (
    destroy_model_parallel, destroy_distributed_environment
)

# Delete model parallel/distributed environments
destroy_model_parallel()
destroy_distributed_environment()

# Delete the LLM object
del llm

# Clean up Python and GPU memory
gc.collect()
torch.cuda.empty_cache()
