In [2]:
import torch

# Sanity check before loading any model: report whether PyTorch sees a CUDA device.
cuda_available = torch.cuda.is_available()
print(cuda_available)

True


In [7]:
import llama_cpp

# Load the quantised Qwen2.5-7B-Instruct GGUF model for the stateless demo below.
llm = llama_cpp.Llama(
    verbose=False,          # keep llama.cpp's loading logs out of the cell output
    n_ctx=1024,             # context length (tokens) allocated for this instance
    n_gpu_layers=-1,        # -1: offload all layers to the GPU
    model_path="/home/ubuntu/ai-engineering/models/qwen2.5-7b-instruct-q5_k_m-00001-of-00002.gguf",
)

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


In [8]:
def chat_completion(prompt: str) -> str:
    """Send one stateless prompt to the model and return the reply text.

    Every call builds a fresh two-message conversation (a fixed system
    instruction plus ``prompt``); nothing from earlier calls is included.

    Args:
        prompt: The user's message.

    Returns:
        The ``content`` of the first choice in the model's response.
    """
    conversation = [
        {"role": "system", "content": "Answer in two concise sentences."},
        {"role": "user", "content": prompt},
    ]

    response = llm.create_chat_completion(
        messages=conversation,
        temperature=0.7,
        top_p=0.9,
        stop=["<|im_end|>", "<|endoftext|>"],
        max_tokens=2048,
    )

    first_choice = response["choices"][0]
    return first_choice["message"]["content"]

In [9]:
# Two independent calls: chat_completion sends only the system prompt and the
# given prompt, so the second question is asked without the first as context.
for question in ("What is coffee?", "Is it good for your health?"):
    print(chat_completion(prompt=question))

Coffee is a brewed beverage prepared from roasted coffee beans, which are the seeds of berries from the Coffea plant.
It depends on the context; generally, a balanced diet and regular exercise are good for health. Specific health impacts vary based on individual circumstances and habits.


In [13]:
# --- Token budget configuration (all sizes are in tokens) ---
CONTEXT_WINDOW = 4096          # total context length given to the model
MAX_GENERATION_TOKENS = 1024   # portion set aside for the model's reply
SAFETY_MARGIN = 32             # headroom for stop tokens and anything uncounted

# --- Conversation settings ---
K_TURNS = 10                   # maximum number of two-way exchanges remembered
SYSTEM_PROMPT = "Answer in two concise sentences."

# Whatever is left after reserving the reply and the margin is available
# for the prompt (system message + history + current user message).
PROMPT_BUDGET = CONTEXT_WINDOW - (MAX_GENERATION_TOKENS + SAFETY_MARGIN)

In [15]:
import llama_cpp
from collections import deque 

# Reload the model with the larger context size used by the memory-aware chat loop.
llm = llama_cpp.Llama(
    verbose=False,
    n_gpu_layers=-1,
    n_ctx=CONTEXT_WINDOW,   # must match the constant the prompt budget is derived from
    model_path="/home/ubuntu/ai-engineering/models/qwen2.5-7b-instruct-q5_k_m-00001-of-00002.gguf",
)

llama_context: n_ctx_per_seq (4096) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


In [16]:
def n_tokens(text: str) -> int:
    """Count the tokens ``text`` produces under the loaded model's tokenizer.

    The text is encoded to bytes and passed to ``llm.tokenize`` with
    ``special=True``; the length of the resulting token list is returned.
    """
    encoded = text.encode()
    token_ids = llm.tokenize(encoded, special=True)
    return len(token_ids)

In [17]:
def make_msg(role: str, content: str) -> dict:
    """Build a chat message dict annotated with the token count of its content.

    Returns a dict with the keys ``role``, ``content`` and ``n_tokens``
    (the latter computed by ``n_tokens(content)``).
    """
    message = dict(role=role, content=content)
    message["n_tokens"] = n_tokens(content)
    return message

In [19]:
# Rolling conversation memory: each exchange contributes two messages
# (user + assistant), so the deque is capped at twice K_TURNS entries.
history = deque([], maxlen=2 * K_TURNS)
print(history)

deque([], maxlen=20)


In [21]:
def build_window(current_user_msg: dict) -> list[dict]:
    """
    Assemble [system] + slice(history) + [current user] <= PROMPT_BUDGET.

    Past messages are considered newest-first and kept while their token
    counts fit in what remains of PROMPT_BUDGET after the system prompt and
    the current user message; the first message that does not fit ends the
    scan, so the kept slice is always contiguous. The kept messages are
    returned in chronological order.

    Args:
        current_user_msg: Message dict (as produced by ``make_msg``) for the
            turn being answered. It may or may not already be the last entry
            of ``history``.

    Returns:
        The list of message dicts ready for the model.

    NOTE(review): token counts cover message content only; any tokens the
    chat template adds around each message are not counted here and are
    presumably what SAFETY_MARGIN is meant to absorb -- confirm.
    """
    # Tokenize the system prompt once and reuse its count for the budget.
    system_msg = make_msg("system", SYSTEM_PROMPT)
    used = system_msg["n_tokens"] + current_user_msg["n_tokens"]

    past = list(history)
    # Skip the current turn only if it is really the last history entry, so
    # calling this before appending does not discard the latest reply.
    if past and past[-1] == current_user_msg:
        past.pop()

    kept = []
    for msg in reversed(past):          # newest to oldest
        if used + msg["n_tokens"] > PROMPT_BUDGET:
            break                       # window is full
        kept.append(msg)
        used += msg["n_tokens"]
    kept.reverse()                      # restore chronological order

    return [system_msg, *kept, current_user_msg]

In [22]:
# Sampling / stopping options forwarded to every create_chat_completion call.
GEN_PARAMS = {
    "max_tokens": MAX_GENERATION_TOKENS,
    "temperature": 0.7,
    "top_p": 0.9,
    "stop": ["<|im_end|>", "<|endoftext|>"],
}

def answer(user_text: str) -> str:
    """Answer ``user_text`` using the rolling conversation history.

    The user turn is recorded in ``history``, a prompt window is built with
    ``build_window``, the model is queried with ``GEN_PARAMS``, and the
    assistant reply is recorded and returned.

    If building the window or querying the model fails (including a kernel
    interrupt during generation), ``history`` is restored to its state before
    the call, so no unanswered user turn is left behind, and the error is
    re-raised unchanged.

    Args:
        user_text: The user's message.

    Returns:
        The assistant's reply text.
    """
    # 1. Log incoming turn
    user_msg = make_msg("user", user_text)
    # A full bounded deque silently drops its oldest entry on append;
    # remember that entry so a failed call can be undone completely.
    evicted = history[0] if len(history) == history.maxlen else None
    history.append(user_msg)

    try:
        # 2. Craft prompt window
        prompt_window = build_window(user_msg)

        # 3. Ask the model. Only role/content are sent: "n_tokens" is local
        #    bookkeeping and not part of a chat message.
        response = llm.create_chat_completion(
            messages=[
                {"role": msg["role"], "content": msg["content"]}
                for msg in prompt_window
            ],
            **GEN_PARAMS,
        )
        reply = response["choices"][0]["message"]["content"]
    except BaseException:
        # BaseException so that KeyboardInterrupt is rolled back as well.
        history.pop()
        if evicted is not None:
            history.appendleft(evicted)
        raise

    # 4. Store assistant turn
    history.append(make_msg("assistant", reply))
    return reply

In [31]:
# Introduce a fact for the model to remember, then report the query's token count.
query = "My name is Ali, I study software engineering at FAST-NUCES."
reply_text = answer(query)
token_count = n_tokens(query)
print(reply_text)
print(f"No of Tokens: {token_count}")


You study software engineering, and you are at FAST-NUCES.
No of Tokens: 15


In [32]:
# Follow-up question that can only be answered from the previous turn in history.
query = "Where do I study and What do I study"
print(answer(query), f"No of Tokens: {n_tokens(query)}", sep="\n")

You study software engineering, and you are at FAST-NUCES.
No of Tokens: 9


In [33]:
# Scripted conversation that alternates statements with recall questions, in
# the order they are sent. Each turn prints the model's reply and then the
# token count of the query itself.
queries = [
    "My name is Ali, I study software engineering at FAST-NUCES.",
    "I am 19 years old and I graduated from AWS re/Start program.",
    "What is my age and which program did I graduate from",
    "I have experience with cloud computing and servers.",
    "What technologies do I have experience in",
    "I am currently working on a Java-based Transport Management System.",
    "Which programming language am I using for my transport system project",
    "I am building a PDF management system using AWS S3 and FastAPI.",
    "Which cloud service am I using for file storage",
    "I have a powerful PC with an RTX 4090 GPU for AI model deployment.",
    "Which GPU do I use for running AI models locally",
    "What kind of mentorship program am I designing and for whom",
    "I like creating faceless finance and tech videos using T4 GPU servers.",
    "Where do I study and What do I study",
    "I am designing a 12-week AWS mentorship program for cloud technicians.",
]

for query in queries:
    print(answer(query))
    print(f"No of Tokens: {n_tokens(query)}")


You study software engineering, and you are at FAST-NUCES.
No of Tokens: 15
You are 19 years old and graduated from the AWS re/Start program.
No of Tokens: 17
You are 19 years old and graduated from the AWS re/Start program.
No of Tokens: 11
You have experience with cloud computing and servers.
No of Tokens: 9
You have experience with cloud computing and servers.
No of Tokens: 7
You have experience with Java and are currently working on a Transport Management System.
No of Tokens: 12
You are using Java for your transport system project.
No of Tokens: 11
You are building a PDF management system using AWS S3 and FastAPI.
No of Tokens: 15
You are using AWS S3 for file storage.
No of Tokens: 9
You have a powerful PC with an RTX 4090 GPU for AI model deployment.
No of Tokens: 20
You use an RTX 4090 GPU for running AI models locally.
No of Tokens: 10
You are designing a mentorship program for individuals interested in AI and machine learning, likely focusing on practical applications and loc