In [1]:
# check gpu memory
import torch

# Check if CUDA is available
print("CUDA available:", torch.cuda.is_available())

# Get the current GPU device
if torch.cuda.is_available():
    print("Current CUDA device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(0))
    
    # Get memory information
    print("\nMemory Usage:")
    print(f"Allocated: {torch.cuda.memory_allocated(0)/1024**3:.2f} GB")
    print(f"Cached: {torch.cuda.memory_reserved(0)/1024**3:.2f} GB")



: 

# Qwen1.5-1.8B-Chat Testing with Vllm

In [None]:
pip install vllm

Collecting vllm
  Downloading vllm-0.8.5.post1-cp38-abi3-manylinux1_x86_64.whl.metadata (14 kB)
Collecting blake3 (from vllm)
  Downloading blake3-1.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting fastapi>=0.115.0 (from fastapi[standard]>=0.115.0->vllm)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken>=0.6.0 (from vllm)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting lm-format-enforcer<0.11,>=0.10.11 (from vllm)
  Downloading lm_format_enforcer-0.10.11-py3-none-any.whl.metadata (17 kB)
Collecting llguidance<0.8.0,>=0.7.9 (from vllm)
  Downloading llguidance-0.7.19-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting outlines==0.1.11 (from vllm)
  Downloading

In [None]:
from vllm import LLM, SamplingParams

INFO 05-08 20:32:39 [__init__.py:239] Automatically detected platform cuda.


In [None]:
import json
import time
import os
import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Function to create the legal system prompt
def create_legal_system_prompt():
    return """You are LegalAssistant, a professional legal advisor specializing in French and Moroccan law.

When answering questions:
- Base your answers strictly on the provided legal context
- Cite specific articles mentioned in the context
- If information is insufficient, state clearly "Based on the provided context, I don't have enough information to answer this question completely" rather than guessing
- Be concise and direct, avoiding unnecessary elaboration
- Use clear language that non-lawyers can understand
- Structure complex answers with numbered points for clarity
- Maintain a professional, helpful tone throughout

Your goal is to provide accurate legal information without hallucination or speculation."""

In [None]:
# Function to load the LLeQA dataset
def load_lleqa_dataset(data_dir="/content/drive/MyDrive/legal-rag-assistant/Notebooks/data/lleqa"):
    """Load the LLeQA dataset from JSON files."""
    questions_dev = json.load(open(os.path.join(data_dir, "questions_dev.json")))
    articles = json.load(open(os.path.join(data_dir, "articles.json")))

    print(f"Loaded {len(questions_dev)} dev questions")
    print(f"Loaded {len(articles)} legal articles")

    return {
        "dev": questions_dev,
        "articles": articles
    }

# Load the dataset
data = load_lleqa_dataset()

# Create article lookup dictionary
article_lookup = {article["id"]: article for article in data["articles"]}

Loaded 201 dev questions
Loaded 27942 legal articles


In [None]:
# Path to the fine-tuned model
MODEL_PATH = "/content/drive/MyDrive/legal-rag-assistant/FineTuned-Qwen2/qwen-legal-assistant/merged_16bit"

# Load the tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

Loading tokenizer...


In [None]:
# Initialize the model with VRAM constraints for GTX 1650
llm = LLM(
    model="/content/drive/MyDrive/legal-rag-assistant/FineTuned-Qwen2/qwen-legal-assistant/merged_16bit",
    tensor_parallel_size=1,        # Use single GPU
    gpu_memory_utilization=0.85,   # Control VRAM usage
    max_model_len=8192,            # Reduced context length for GTX 1650
    trust_remote_code=True,        # Required for Qwen models
)

INFO 05-08 20:33:43 [config.py:717] This model supports multiple tasks: {'score', 'embed', 'classify', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 05-08 20:33:43 [llm_engine.py:240] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/content/drive/MyDrive/legal-rag-assistant/FineTuned-Qwen2/qwen-legal-assistant/merged_16bit', speculative_config=None, tokenizer='/content/drive/MyDrive/legal-rag-assistant/FineTuned-Qwen2/qwen-legal-assistant/merged_16bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityCon

Loading pt checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 05-08 20:34:02 [loader.py:458] Loading weights took 15.74 seconds
INFO 05-08 20:34:02 [model_runner.py:1140] Model loading took 3.4654 GiB and 15.991720 seconds
INFO 05-08 20:34:05 [worker.py:287] Memory profiling takes 2.23 seconds
INFO 05-08 20:34:05 [worker.py:287] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.85) = 12.53GiB
INFO 05-08 20:34:05 [worker.py:287] model weights take 3.47GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 1.42GiB; the rest of the memory reserved for KV Cache is 7.62GiB.
INFO 05-08 20:34:05 [executor_base.py:112] # cuda blocks: 2601, # CPU blocks: 1365
INFO 05-08 20:34:05 [executor_base.py:117] Maximum concurrency for 8192 tokens per request: 5.08x
INFO 05-08 20:34:10 [model_runner.py:1450] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. 

Capturing CUDA graph shapes:   0%|          | 0/35 [00:00<?, ?it/s]

INFO 05-08 20:34:50 [model_runner.py:1592] Graph capturing finished in 41 secs, took 0.16 GiB
INFO 05-08 20:34:50 [llm_engine.py:437] init engine (profile, create kv cache, warmup model) took 48.20 seconds


In [None]:
# Function to test the model with streaming and timing
def test_model_with_vllm(question, context):
    """Generate a response using VLLM with streaming and timing"""
    system_prompt = create_legal_system_prompt()

    prompt = f"# Question: {question}\n\n# Relevant legal context:\n{context}\n\nPlease answer based only on this information."


    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt}
    ]

    # Apply chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
    )

    # Set sampling parameters
    sampling_params = SamplingParams(
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        max_tokens=512,
    )

    # Start timing
    start_time = time.time()
    start_generation = time.time()
    print("\nStarting generation...")

    # Generate with streaming
    output = ""
    first_token_time = None

    for output_obj in llm.generate(text, sampling_params=sampling_params):
        if first_token_time is None:
            first_token_time = time.time() - start_generation

        new_text = output_obj.outputs[0].text
        new_token = new_text[len(output):]
        output = new_text
        print(new_token, end="", flush=True)

    # Calculate metrics
    total_time = time.time() - start_time

    # Print timing stats
    print(f"\n\n==== Generation Stats ====")
    print(f"Time to first token: {first_token_time:.2f} seconds")
    print(f"Total generation time: {total_time:.2f} seconds")
    print(f"Approximate tokens generated: {len(output.split())}")
    print(f"Approximate tokens per second: {len(output.split())/total_time:.2f}")

    return output


In [None]:
# Select a question from the dev set (e.g., first question)
sample_question = data["dev"][10]

# Get relevant articles for the question
relevant_articles = []
for article_id in sample_question["article_ids"]:
    if article_id in article_lookup:
        article_text = article_lookup[article_id]["article"]
        article_ref = article_lookup[article_id].get("reference", f"Article ID: {article_id}")
        relevant_articles.append(f"{article_ref}\n{article_text}")

# Combine relevant article text (limit to first 2 articles to avoid context length issues)
context = "\n\n".join(relevant_articles[:2])

In [None]:
# Print question and context
print("\n===== TEST QUESTION =====")
print(sample_question["question"])

print("\n===== CONTEXT (EXCERPT) =====")
print(context[:500] + "..." if len(context) > 500 else context)

# Generate response with VLLM and timing
print("\n===== MODEL RESPONSE WITH VLLM STREAMING =====")
response = test_model_with_vllm(sample_question["question"], context)


===== TEST QUESTION =====
Que se passe-t-il s'il n'y a pas d'état des lieux à Bruxelles ?

===== CONTEXT (EXCERPT) =====
Art. 1730, Code civil (Titre VIII, Chapitre II, Section I)
§ 1. Les parties dressent impérativement un état des lieux détaillé contradictoirement et à frais communs. Cet état des lieux est dressé, soit au cours de la période où les locaux sont inoccupés, soit au cours du premier mois d'occupation. Il est annexé au contrat de bail écrit, au sens de l'article 1erbis du livre III, titre VIII, chapitre II, section 2 et sera également soumis à enregistrement.A défaut d'accord entre les parties, le ju...

===== MODEL RESPONSE WITH VLLM STREAMING =====

Starting generation...


Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

assistant
# Answer: Le propriétaire doit déposer une déclaration de réception de l’état des lieux, qui est à charge de la location. Si vous n'y faites pas la déclaration, le juge de paix peut vous demander d'en déposer une. Le propriétaire doit établir que : les lieux sont dans l'état que vous les avez reconnus, ou qu'ils ne sont pas encore mis à votre disposition (par exemple, il y a des dégâts ou des dégradations qui ont été conservées); les biens sont présents et fonctionnent correctement; les charges sont présentes et fonctionnent correctement. En cas de conflit, vous pouvez introduire une procédure en justice. Le juge de paix désigne un expert pour établir le état des lieux. Il peut établir des conclusions contradictoirement et à frais communs. Il peut également établir des conclusions contradictoirement et à frais partagés (ou à frais partagés). Il peut également établir des conclusions contradictoirement et à frais étrangers. Il peut également établir des conclusions contradicto