## Configuration for fine-tuning and vLLM inference

In [1]:
import os
from huggingface_hub import login
from dotenv import load_dotenv

# Pull settings from a local .env file (via python-dotenv) into the process
# environment so the access token never has to be written into the notebook.
load_dotenv()

# Read the Hugging Face token from the environment and authenticate with it.
# Alternative: run `huggingface-cli login` once in a terminal instead.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
from vllm import LLM, SamplingParams
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

###-- Critical Environment Memo --###
# CUDA: 12.9
# Driver version: 575.57.08
# torch version: 2.7.1 + cu128
###-------------------------------###

# Seed for sampling. With temperature > 0 decoding is stochastic, so without a
# fixed seed the completion shown below cannot be regenerated on a fresh
# Restart & Run All. NOTE(review): exact token-level repeatability may still
# depend on hardware and batching -- see the vLLM reproducibility notes.
SEED = 42

# Load the model
# Builds the vLLM engine for Mistral-7B-Instruct-v0.3 in bfloat16.
llm = LLM(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    tokenizer_mode="mistral",
    dtype="bfloat16",
    gpu_memory_utilization=0.90,
    max_model_len=2048,
    max_num_seqs=64,
    tensor_parallel_size=2,  # two worker ranks (rank=0 / rank=1 in the engine log)
)

# Define sampling parameters
# Nucleus sampling with a 512-token cap on each completion; `seed` pins the
# per-request random generator so repeated runs draw the same samples.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.95,
    max_tokens=512,
    seed=SEED,
)

INFO 06-17 17:55:09 [config.py:823] This model supports multiple tasks: {'classify', 'generate', 'score', 'embed', 'reward'}. Defaulting to 'generate'.
INFO 06-17 17:55:09 [config.py:1946] Defaulting to use mp for distributed inference
INFO 06-17 17:55:09 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 06-17 17:55:10 [core.py:455] Waiting for init message from front-end.
INFO 06-17 17:55:10 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='mistralai/Mistral-7B-Instruct-v0.3', speculative_config=None, tokenizer='mistralai/Mistral-7B-Instruct-v0.3', skip_tokenizer_init=False, tokenizer_mode=mistral, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


[1;36m(VllmWorker rank=1 pid=282555)[0;0m INFO 06-17 17:55:13 [weight_utils.py:308] Time spent downloading weights for mistralai/Mistral-7B-Instruct-v0.3: 0.746532 seconds
[1;36m(VllmWorker rank=0 pid=282554)[0;0m INFO 06-17 17:55:14 [default_loader.py:272] Loading weights took 2.07 seconds
[1;36m(VllmWorker rank=0 pid=282554)[0;0m INFO 06-17 17:55:15 [gpu_model_runner.py:1624] Model loading took 6.7584 GiB and 3.347486 seconds
[1;36m(VllmWorker rank=1 pid=282555)[0;0m INFO 06-17 17:55:16 [default_loader.py:272] Loading weights took 2.26 seconds
[1;36m(VllmWorker rank=1 pid=282555)[0;0m INFO 06-17 17:55:16 [gpu_model_runner.py:1624] Model loading took 6.7584 GiB and 4.563219 seconds
[1;36m(VllmWorker rank=1 pid=282555)[0;0m INFO 06-17 17:55:20 [backends.py:462] Using cache directory: /home/ttsai/.cache/vllm/torch_compile_cache/49088ecba2/rank_1_0 for vLLM's torch.compile
[1;36m(VllmWorker rank=1 pid=282555)[0;0m INFO 06-17 17:55:20 [backends.py:472] Dynamo bytecode transf

[1;36m(VllmWorker rank=1 pid=282555)[0;0m [rank1]:W0617 17:55:20.751000 282555 torch/_inductor/utils.py:1250] [0/0] Not enough SMs to use max_autotune_gemm mode
[1;36m(VllmWorker rank=0 pid=282554)[0;0m [rank0]:W0617 17:55:20.827000 282554 torch/_inductor/utils.py:1250] [0/0] Not enough SMs to use max_autotune_gemm mode


[1;36m(VllmWorker rank=1 pid=282555)[0;0m INFO 06-17 17:55:21 [backends.py:161] Cache the graph of shape None for later use
[1;36m(VllmWorker rank=0 pid=282554)[0;0m INFO 06-17 17:55:21 [backends.py:161] Cache the graph of shape None for later use
[1;36m(VllmWorker rank=1 pid=282555)[0;0m INFO 06-17 17:55:33 [backends.py:173] Compiling a graph for general shape takes 12.60 s
[1;36m(VllmWorker rank=0 pid=282554)[0;0m INFO 06-17 17:55:33 [backends.py:173] Compiling a graph for general shape takes 12.54 s
[1;36m(VllmWorker rank=0 pid=282554)[0;0m INFO 06-17 17:55:45 [monitor.py:34] torch.compile takes 16.07 s in total
[1;36m(VllmWorker rank=1 pid=282555)[0;0m INFO 06-17 17:55:45 [monitor.py:34] torch.compile takes 16.09 s in total
[1;36m(VllmWorker rank=1 pid=282555)[0;0m INFO 06-17 17:55:46 [gpu_worker.py:227] Available KV cache memory: 6.55 GiB
[1;36m(VllmWorker rank=0 pid=282554)[0;0m INFO 06-17 17:55:46 [gpu_worker.py:227] Available KV cache memory: 6.55 GiB
INFO 06-17

In [4]:
# Generate with the model loaded above. No LoRA adapter is involved at this
# point: the call below passes only the prompt list and the sampling settings.
prompts = ["Write a financial analysis of Tesla."]
outputs = llm.generate(prompts, sampling_params=sampling_params)

# Print the first candidate completion returned for each prompt
for output in outputs:
    first_candidate = output.outputs[0]
    generated_text = first_candidate.text
    print(f"Generated text: {generated_text}")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Generated text: 

Tesla, Inc. (TSLA) is an American electric vehicle (EV) and clean energy company that has disrupted the traditional automotive industry with its innovative technology and commitment to sustainability. As of 2021, Tesla is the world's most valuable automaker by market capitalization, and its stock has been notorious for its volatility.

Financial Analysis:

1. Revenue: In 2020, Tesla reported total revenues of $31.5 billion, an increase of 36% compared to 2019. The growth was driven by a 48% increase in vehicle deliveries to 509,738 units, as well as a 45% increase in regulatory credits to $1.2 billion.

2. Gross Margin: Tesla's gross margin improved significantly in 2020, reaching 27.3%, compared to 21.7% in 2019. The improvement was due to a higher mix of Model 3 and Model Y sales, which have higher margins, as well as cost-cutting measures and economies of


In [5]:
# Clean up model if necessary
# This cell is safe to run more than once, or before the model cell has run:
# a missing `llm` name is tolerated instead of raising NameError.
import gc
import torch

# If using distributed parallelism, import destroy functions
from vllm.distributed.parallel_state import (
    destroy_model_parallel, destroy_distributed_environment
)

# Delete model parallel/distributed environments
destroy_model_parallel()
destroy_distributed_environment()

# Delete the LLM object (skip quietly when it was already removed)
try:
    del llm
except NameError:
    pass

# Clean up Python and GPU memory
gc.collect()
torch.cuda.empty_cache()