In [3]:
from generator import generate
from schemas import GenerateRequest, Model

import httpx
from httpx import AsyncClient

# NOTE(review): looks like a placeholder bearer token — confirm no real
# credential is ever committed here; prefer reading it from the environment.
headers = {
    "Authorization": "Bearer token-abc123"
}

# Model to query plus the sampling arguments forwarded with the request.
model = Model(
    model_name="gpt2",
    args={
        "max_tokens": 50,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,})

prompt = "hello, how are you?"
gen_prompt = GenerateRequest(prompt=prompt)

# Top-level `async with` / `await` works because IPython cells support it;
# a plain script would need to wrap this in a coroutine and run it.
async with AsyncClient() as client:
    result = await generate(gen_prompt, model=model, client=client, headers=headers)
    print(result)

model_id='gpt2' output="\n\nA: I'm good.\n\nQ: Did you get any good news or bad news?\n\nA: I didn't get any good news.\n\nQ: How did you get to the hospital?\n\nA:"


In [1]:
from fastapi import HTTPException
import yaml

# Read the YAML config and keep only its "models" section, which
# get_model_url() indexes by model id.
with open("../config.yaml") as f:
    model_config = yaml.safe_load(f)["models"]

def get_model_url(model_id: str) -> str:
    """Return the ``url`` entry configured for ``model_id``.

    Args:
        model_id: Key into the module-level ``model_config`` mapping.

    Raises:
        HTTPException: With status 404 when ``model_id`` is not configured.
    """
    if model_id in model_config:
        return model_config[model_id]["url"]
    raise HTTPException(status_code=404, detail="Model not found")

In [2]:
model_config

{'gpt2': {'name': 'gpt2',
  'url': 'http://localhost:8001',
  'device': 'cuda',
  'priority': 1},
 'qwen': {'name': 'Qwen/Qwen2.5-0.5B-Instruct',
  'url': 'http://localhost:8002',
  'device': 'cpu',
  'VLLM_CPU_OMP_THREADS_BIND': 1,
  'VLLM_CPU_KVCACHE_SPACE': 1},
 'llama': {'name': 'meta-llama/Llama-3.2-1B-Instruct',
  'url': 'http://localhost:8003',
  'device': 'cpu',
  'VLLM_CPU_OMP_THREADS_BIND': 2,
  'VLLM_CPU_KVCACHE_SPACE': 1}}

In [10]:
a = {
  "models_health": [
    {
      "url": "http://localhost:8001",
      "model_id": "gpt2",
      "model_name": "gpt2",
      "status": "healthy"
    },
    {
      "url": "http://localhost:8002",
      "model_id": "qwen",
      "model_name": "Qwen/Qwen2.5-0.5B-Instruct",
      "status": "healthy"
    },
    {
      "url": "http://localhost:8003",
      "model_id": "llama",
      "model_name": "meta-llama/Llama-3.2-1B-Instruct",
      "status": "healthy"
    }
  ]
}

In [14]:
# Peek at the first health entry only.
# NOTE(review): the loop variable rebinds the notebook-level name `model`,
# which other cells also assign — consider renaming it to avoid hidden state.
for model in a["models_health"]:
    print(model)
    break


{'url': 'http://localhost:8001', 'model_id': 'gpt2', 'model_name': 'gpt2', 'status': 'healthy'}


In [17]:
{"models_outputs":[{"model_name":"gpt2","prompt":"How to grow a tree?","output":"\n\nMakes a nice, clean, and comfortable tree.\n\nCan grow in the summer months.\n\nCan be harvested and cut down for sale.\n\nCan be harvested and cut down for sale.\n\nCan be used as a fertilizer and can even be used as an energy source.\n\nCan be used as a fertilizer and can even be used as an energy source.\n\nCan be used in a high-intensity sport such as basketball.\n\nCan be used"},{"model_name":"Qwen/Qwen2.5-0.5B-Instruct","prompt":"How to grow a tree?","output":" Trees are plants, and like all plants, they require water, sunlight, nutrients, warmth, air, etc. All of this is what we put in the soil.\nTo grow trees:\n- Start with seedling or young saplings from your local nursery\n- Water regularly (daily or twice daily)\n- Feed with composted manure or composted horse manure\n- Add fertilizer at flowering time using 20:1 liquid formula\n- Make sure your tree has access to lots of"},{"model_name":"meta-llama/Llama-3.2-1B-Instruct","prompt":"How to grow a tree?","output":" A step by step guide\n\nGrowing a tree can be a rewarding and rewarding experience. Here's a step-by-step guide to help you grow your very own tree.\n\n**Step 1: Choose the Right Tree**\n\n* **Select a tree species**: Look for trees that are suitable for your climate, soil type, and available space. Consider factors like growth rate, maintenance requirements, and disease resistance.\n* **Consider the mature size**: Choose a tree that will grow to a size that fits your available"}]}

{'models_outputs': [{'model_name': 'gpt2',
   'prompt': 'How to grow a tree?',
   'output': '\n\nMakes a nice, clean, and comfortable tree.\n\nCan grow in the summer months.\n\nCan be harvested and cut down for sale.\n\nCan be harvested and cut down for sale.\n\nCan be used as a fertilizer and can even be used as an energy source.\n\nCan be used as a fertilizer and can even be used as an energy source.\n\nCan be used in a high-intensity sport such as basketball.\n\nCan be used'},
  {'model_name': 'Qwen/Qwen2.5-0.5B-Instruct',
   'prompt': 'How to grow a tree?',
   'output': ' Trees are plants, and like all plants, they require water, sunlight, nutrients, warmth, air, etc. All of this is what we put in the soil.\nTo grow trees:\n- Start with seedling or young saplings from your local nursery\n- Water regularly (daily or twice daily)\n- Feed with composted manure or composted horse manure\n- Add fertilizer at flowering time using 20:1 liquid formula\n- Make sure your tree has access to l

In [1]:
import os
os.environ["GGML_VERBOSE"] = "1"

In [2]:
from llama_cpp import Llama

# NOTE(review): absolute, machine-specific model path — consider making it
# configurable.
llm = Llama(
    model_path="/home/krasniuk-ai/model-rag-vote/Qwen_Qwen3-8B-Q4_K_M.gguf",
    n_ctx=1024,
    # Author's note: roughly 2500 MB with this many offloaded layers; it may
    # be possible to offload even more — TODO confirm against free VRAM.
    n_gpu_layers=14,
    offload_kqv=True,
)

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3050 Ti Laptop GPU, compute capability 8.6, VMM: yes
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3050 Ti Laptop GPU) - 2688 MiB free
llama_model_loader: loaded meta data with 32 key-value pairs and 399 tensors from /home/krasniuk-ai/model-rag-vote/Qwen_Qwen3-8B-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen3
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Qwen3 8B
llama_model_loader: - kv   3:                           general.basename str              = Qwen3
llama_model_loader: - k

In [44]:
from transformers import AutoTokenizer

# Load tokenizer from original Qwen
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", trust_remote_code=True)



Exception: data did not match any variant of untagged enum ModelWrapper at line 757479 column 3

In [97]:
from pydantic import BaseModel, Field
from typing import List, Dict, Any
import requests
from pydantic import BaseModel
from datetime import datetime
from abc import ABC, abstractmethod
from typing import Dict, Iterator, List, Optional, Union, Literal


class Message(BaseModel):
    """Base chat message: a display name, a role and the text content."""
    name: str = "message"
    role: Literal["user", "assistant", "system"]
    content: str = Field(default_factory=str)

    def __repr__(self):
        """Render only the message text, wrapped in a newline on each side."""
        return "\n{}\n".format(self.content)
    
class LlmMessage(Message):
    """Message that additionally carries a list of tool calls.

    ``model_dump`` is overridden to return only ``role`` and ``content``;
    item access and iteration are provided on top of the model fields.
    """
    tool_calls: List[Dict] = Field(default_factory=list)

    def __repr__(self):
        """Render only the message text, wrapped in a newline on each side."""
        return "\n{}\n".format(self.content)

    def model_dump(self, **kwargs):
        """Return ``{"role": ..., "content": ...}``.

        The parent dump is still executed with ``kwargs``, but only the
        role/content pair is handed back to the caller.
        """
        dumped = super().model_dump(**kwargs)
        dumped["formatted"] = dict(role=self.role, content=self.content)
        return dumped["formatted"]

    def __getitem__(self, key):
        """Allow ``msg["role"]``-style access to attributes."""
        value = getattr(self, key)
        return value

    def __iter__(self):
        """Yield the ``(key, value)`` pairs of :meth:`model_dump`."""
        for pair in self.model_dump().items():
            yield pair

class UserMessage(Message):
    """Message that additionally carries a ``user_id``.

    ``model_dump`` is overridden to return only ``role`` and ``content``;
    item access and iteration are provided on top of the model fields.
    """
    user_id: str = Field(default_factory=str)

    def __repr__(self):
        """Render only the message text, wrapped in a newline on each side."""
        return "\n{}\n".format(self.content)

    def model_dump(self, **kwargs):
        """Return ``{"role": ..., "content": ...}``.

        The parent dump is still executed with ``kwargs``, but only the
        role/content pair is handed back to the caller.
        """
        dumped = super().model_dump(**kwargs)
        dumped["formatted"] = dict(role=self.role, content=self.content)
        return dumped["formatted"]

    def __getitem__(self, key):
        """Allow ``msg["role"]``-style access to attributes."""
        value = getattr(self, key)
        return value

    def __iter__(self):
        """Yield the ``(key, value)`` pairs of :meth:`model_dump`."""
        for pair in self.model_dump().items():
            yield pair

class SystemMessage(Message):
    """Message subclass used for the system prompt.

    ``model_dump`` is overridden to return only ``role`` and ``content``;
    item access and iteration are provided on top of the model fields.
    """

    def __repr__(self):
        """Render only the message text, wrapped in a newline on each side."""
        return "\n{}\n".format(self.content)

    def model_dump(self, **kwargs):
        """Return ``{"role": ..., "content": ...}``.

        The parent dump is still executed with ``kwargs``, but only the
        role/content pair is handed back to the caller.
        """
        dumped = super().model_dump(**kwargs)
        dumped["formatted"] = dict(role=self.role, content=self.content)
        return dumped["formatted"]

    def __getitem__(self, key):
        """Allow ``msg["role"]``-style access to attributes."""
        value = getattr(self, key)
        return value

    def __iter__(self):
        """Yield the ``(key, value)`` pairs of :meth:`model_dump`."""
        for pair in self.model_dump().items():
            yield pair

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel

# Embedding model setup. `model`, `tokenizer` and `device` are later read as
# notebook globals by RagTool.
# NOTE(review): `model` and `tokenizer` are also assigned in other cells, so
# RagTool picks up whichever assignment ran last — confirm the run order.
path = 'Alibaba-NLP/gte-base-en-v1.5'
device = torch.device('cuda')
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,
    unpad_inputs=True,
    use_memory_efficient_attention=True,
    torch_dtype=torch.float16
).to(device)

  from .autonotebook import tqdm as notebook_tqdm
Override attn_implementation='sdpa' to 'eager' as use_memory_efficient_attention=True


In [235]:
import os
from typing import List, Dict, Any

import requests
from bs4 import BeautifulSoup


class BaseTool(ABC):
    """Abstract base for tools: stores metadata and validates it on creation.

    Subclasses provide ``name``, ``description`` and ``parameters`` as class
    attributes and implement :meth:`call`.
    """
    name: str = ''
    description: str = ''
    parameters: Union[List[dict], dict] = []

    def __init__(self, cfg: Optional[dict] = None):
        """Store ``cfg`` (or an empty dict) and validate the tool metadata.

        Raises:
            ValueError: If ``name`` is empty, or if ``parameters`` is a dict
                that ``is_tool_schema`` rejects.
        """
        self.cfg = cfg if cfg else {}
        owner = self.__class__.__name__
        if not self.name:
            raise ValueError(
                f'You must set {owner}.name, either by @register_tool(name=...) or explicitly setting {owner}.name'
            )
        # Only dict-style parameters are checked against the schema helper.
        if not isinstance(self.parameters, dict):
            return
        schema = {
            'name': self.name,
            'description': self.description,
            'parameters': self.parameters,
        }
        if not is_tool_schema(schema):
            raise ValueError(
                'The parameters, when provided as a dict, must confirm to a valid openai-compatible JSON schema.')

    @abstractmethod
    def call(self, params: Union[str, dict], **kwargs) -> Union[str, list, dict]:
        """Run the tool; every concrete tool must override this.

        Args:
            params: The parameters of the function call.
            kwargs: Additional parameters for calling the tool.

        Returns:
            Whatever the concrete tool produces.
        """
        raise NotImplementedError

    @property
    def function_info(self) -> dict:
        """Name, description and parameters bundled into a single dict."""
        return dict(
            name=self.name,
            description=self.description,
            parameters=self.parameters,
        )


class NewsAPITool(BaseTool):
    """Search NewsAPI's ``/v2/everything`` endpoint and scrape each hit's text.

    ``call`` returns a ``(metadata, texts)`` pair on success and a one-element
    list holding an error dict on failure; ``format_for_display`` accepts
    either shape.
    """
    name: str = 'news_api_search'
    description: str = 'Search for news using NewsAPI. Returns latest news articles with direct links.'

    class Config:
        # NOTE(review): this key is committed to the notebook. The NEWSAPI_KEY
        # environment variable overrides it; the literal is kept only so
        # existing runs keep working and should be rotated and removed.
        api_key: str = os.environ.get("NEWSAPI_KEY", "9efd68b03f504c759af44000a347b287")
        max_retries: int = 2
        endpoint: str = "https://newsapi.org/v2/everything"
        timeout: int = 10

    parameters: dict = {
        'type': 'object',
        'properties': {
            'query': {
                'type': 'string',
                'description': 'Search query (e.g. "Ukraine Russia ceasefire")'
            },
            'language': {
                'type': 'string',
                'description': 'Language code (e.g. "ru", "en")',
                'default': 'en'
            },
            'max_results': {
                'type': 'integer',
                'description': 'Number of results (1-30)',
                'default': 3
            },
            'sort_by': {
                'type': 'string',
                'description': 'Sorting method: "relevancy", "popularity", or "publishedAt"',
                'default': 'publishedAt'
            }
        },
        'required': ['query'],
    }

    def call(self, params: dict, **kwargs) -> Union[tuple, List[Dict[str, Any]]]:
        """Execute a news search via NewsAPI.

        Args:
            params: Dict with a required ``query`` and optional ``language``,
                ``max_results`` and ``sort_by`` entries.
            kwargs: Accepted for compatibility with ``BaseTool.call``; unused.

        Returns:
            ``(formatted_json, formatted_text)`` from ``_format_results`` on
            success, or ``[{'error': ..., 'details': ...}]`` once every
            attempt has failed.

        Raises:
            KeyError: If ``params`` has no ``query`` entry.
        """
        query = params['query']
        language = params.get('language', 'en')
        # Clamp to the documented 1-30 range; int() also tolerates "3".
        max_results = max(1, min(int(params.get('max_results', 3)), 30))
        sort_by = params.get('sort_by', 'publishedAt')

        # Hand the values to requests so they are URL-encoded; interpolating
        # them into the URL breaks on queries containing '&', '#' and similar.
        request_params = {
            'q': query,
            'language': language,
            'pageSize': max_results,
            'sortBy': sort_by,
            'apiKey': self.Config.api_key,
        }

        attempts = self.Config.max_retries + 1
        last_error = None
        for _ in range(attempts):
            try:
                response = requests.get(
                    self.Config.endpoint,
                    params=request_params,
                    timeout=self.Config.timeout,
                )
                response.raise_for_status()
                data = response.json()

                if data['status'] == 'ok':
                    return self._format_results(data['articles'])

                raise Exception(f"API error: {data.get('message', 'Unknown error')}")

            except Exception as e:
                # Remember the failure and try again until attempts run out.
                last_error = e

        return [{
            'error': f"NewsAPI search failed after {attempts} attempts",
            'details': str(last_error)
        }]

    def fetch_full_article_text(self, url: str) -> str:
        """Download ``url`` and return the text of its ``<p>`` elements.

        Paragraphs inside the first ``<article>`` or ``<main>`` element that
        has any are preferred; otherwise every ``<p>`` on the page is used.
        Any exception is turned into a bracketed failure string instead of
        being raised.
        """
        try:
            response = requests.get(url, timeout=10, headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
            })
            soup = BeautifulSoup(response.text, "html.parser")

            # Try main content blocks first.
            for tag in ["article", "main"]:
                container = soup.find(tag)
                if container:
                    paragraphs = container.find_all("p")
                    if paragraphs:
                        return "\n".join(p.get_text(strip=True) for p in paragraphs)

            # Fallback to all <p> tags.
            paragraphs = soup.find_all("p")
            return "\n".join(p.get_text(strip=True) for p in paragraphs)

        except Exception as e:
            return f"[Failed to fetch article text: {e}]"

    def _format_results(self, articles: List[Dict]) -> tuple:
        """Standardize NewsAPI articles and fetch each article's full text.

        Returns:
            ``(formatted_json, formatted_text)``: two lists aligned by index,
            the first holding metadata dicts and the second the scraped text.
        """
        formatted_json = []
        formatted_text = []
        for article in articles:
            formatted_text.append(self.fetch_full_article_text(article["url"]))
            formatted_json.append({
                "title": article["title"],
                "source": article["source"]["name"],
                "url": article["url"],
                "published_at": article["publishedAt"],
                "description": article.get("description"),
                "image_url": article.get("urlToImage")
            })

        return formatted_json, formatted_text

    def format_for_display(self, results) -> str:
        """User-friendly formatting of a ``call`` result.

        Args:
            results: Either the ``(formatted_json, formatted_text)`` tuple
                returned on success, a plain list of metadata dicts, or the
                one-element error list returned on failure.
        """
        # A successful call() yields a tuple; only the metadata is displayed.
        if isinstance(results, tuple):
            results = results[0]

        if results and isinstance(results[0], dict) and 'error' in results[0]:
            return f"Error: {results[0]['error']}\n{results[0].get('details', '')}"

        return '\n\n'.join(
            f"{item['title']}\n"
            f"Source:   {item['source']}\n"
            f"Date:     {item['published_at']}\n"
            f"URL:      {item['url']}\n"
            # The key is always present but may be None, so .get()'s default
            # alone would print "None".
            f"{item.get('description') or 'No description'}"
            for item in results
        )
from langchain.text_splitter import RecursiveCharacterTextSplitter


from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from langchain.text_splitter import RecursiveCharacterTextSplitter

class RagTool(BaseTool):
    """Index scraped article text in Qdrant and retrieve chunks per query.

    Each ``call`` splits the supplied texts into chunks, embeds them with the
    configured model, recreates the collection, uploads the vectors and then
    runs a similarity search for every query.
    """
    name: str = 'rag_search'
    description: str = 'RAG search with NewsAPI data. Returns most relevant chunks.'

    class RagConfig:
        # The embedding model and tokenizer are notebook-level globals.
        embed_model = model
        tokenizer = tokenizer
        dims = 768            # vector size used when the collection is created
        chunk_size = 256
        chunk_overlap = 20
        max_length = 512      # tokenizer truncation length for every input
        top_k = 4             # hits requested per query inside call()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        qdrant_client = QdrantClient(host="localhost", port=6333)
        collection_name = "news_articles"

    parameters: dict = {
        'type': 'object',
        'properties': {
            'queries': {
                # 'list' is not a JSON-Schema type; arrays are spelled 'array'.
                'type': 'array',
                'items': {'type': 'string'},
                'description': 'New generated queries for multi-rag.',
            },
        },
        'required': ['queries'],
    }

    def call(self, args, **kwargs):
        """Rebuild the collection from ``args['page_text']`` and search it.

        Args:
            args: Dict with ``queries`` (iterable of query strings) and
                ``page_text``, a dict whose ``text`` and ``json_data`` lists
                are aligned by index. ``page_text`` is not part of the
                declared schema — presumably injected by the caller; verify.
            kwargs: Accepted for compatibility with ``BaseTool.call``; unused.

        Returns:
            One list of hits (see :meth:`search`) per query, in query order.
        """
        page_data = args['page_text']
        queries = args['queries']
        cfg = self.RagConfig

        docs = []
        for idx, item_content in enumerate(page_data['text']):
            text = self._join_words(item_content)
            # Every chunk inherits the metadata of the article it came from.
            docs.extend(cfg.text_splitter.create_documents(
                [text],
                metadatas=[page_data['json_data'][idx]]
            ))

        embeddings = self._embed_documents(docs)

        # The collection is dropped and rebuilt on every call.
        cfg.qdrant_client.recreate_collection(
            collection_name=cfg.collection_name,
            vectors_config=VectorParams(
                size=cfg.dims,
                distance=Distance.COSINE
            )
        )

        # Skip the upload entirely when there is nothing to index.
        if embeddings:
            cfg.qdrant_client.upsert(
                collection_name=cfg.collection_name,
                points=[
                    {
                        "id": i,
                        "vector": emb.tolist(),
                        "payload": {"text": doc.page_content, **doc.metadata}
                    }
                    for i, (emb, doc) in enumerate(embeddings)
                ]
            )

        return [self.search(query=query, top_k=cfg.top_k) for query in queries]

    def search(self, query: str, top_k: int = 5):
        """Perform a vector similarity search for a given query string.

        Returns:
            A list of dicts with ``score``, ``text`` and ``meta`` (the payload
            without its ``text`` entry), one per hit.
        """
        # 1. Embed the query.
        query_vector = self._embed_text(query)

        # 2. Search through Qdrant.
        results = self.RagConfig.qdrant_client.search(
            collection_name=self.RagConfig.collection_name,
            query_vector=query_vector,
            limit=top_k,
            with_payload=True
        )

        # 3. Return the text plus the remaining metadata.
        return [
            {
                "score": hit.score,
                "text": hit.payload.get("text"),
                "meta": {k: v for k, v in hit.payload.items() if k != "text"}
            }
            for hit in results
        ]

    def _embed_text(self, text):
        """Embed one string and return the vector as a NumPy array.

        Uses ``pooler_output`` when the model provides one, otherwise the
        mean of ``last_hidden_state`` over dimension 1.
        """
        inputs = self.RagConfig.tokenizer(
            text,
            truncation=True,
            max_length=self.RagConfig.max_length,
            padding=True,
            return_tensors='pt'
        ).to(device)

        with torch.inference_mode():
            output = self.RagConfig.embed_model(**inputs)

        pooled = getattr(output, "pooler_output", None)
        if pooled is None:
            pooled = output.last_hidden_state.mean(dim=1)
        return pooled.squeeze(0).cpu().numpy()

    def _embed_documents(self, docs):
        """Return ``(embedding, doc)`` pairs for every document in ``docs``."""
        return [(self._embed_text(doc.page_content), doc) for doc in docs]

    def _join_words(self, sequence):
        """Join a list of strings with spaces; return anything else unchanged."""
        if isinstance(sequence, list):
            return " ".join(sequence)
        return sequence

In [93]:
asd = RagTool()
ff = asd.call(test_message)

In [95]:
len(ff)

83

In [96]:
ff[2]

Document(metadata={'title': 'Banning teens from social media will not fix everything. Here’s why', 'source': 'The-independent.com', 'url': 'https://www.the-independent.com/life-style/health-and-families/social-media-ban-teenager-addiction-b2752278.html', 'published_at': '2025-05-16T09:35:10Z', 'description': 'Jasleen Chhabra, Vita Pilkington and Zac Seidler on the complexities of a social media ban for teenagers', 'image_url': 'https://static.the-independent.com/2025/02/25/15/11/iStock-2180811155.jpeg?trim=0,180,0,180&width=1200&height=800&crop=1200:800'}, page_content='Platforms such as TikTok, Snapchat and Instagram are where young people connect with friends and online communities, explore and express their identities, seek information, and find support for mental health struggles.')

In [236]:
import json
import re
from typing import Optional, Dict

def extract_tool_call(response: str) -> Optional[Dict[str, Any]]:
    """Pull a tool call out of a model response.

    Only the text after the last ``</think>`` marker is considered (the whole
    response when the marker is absent). That text must be a JSON object with
    both ``name`` and ``arguments`` keys.

    Returns:
        ``{"name": ..., "arguments": ...}`` when such an object is found,
        otherwise ``None``.
    """
    _, _, tail = response.rpartition('</think>')
    candidate = tail.strip()

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        return None

    if not isinstance(parsed, dict):
        return None
    if "name" not in parsed or "arguments" not in parsed:
        return None
    return {key: parsed[key] for key in ("name", "arguments")}

In [237]:
from typing import Dict, Iterator, List, Optional, Tuple, Union, Literal
def is_tool_schema(obj: dict) -> bool:
    """Check that ``obj`` has the shape of a tool/function description.

    Requirements: ``obj`` is a dict with string ``name`` and ``description``
    entries and a ``parameters`` dict whose ``type`` is ``'object'``, whose
    ``properties`` is a dict and whose ``required`` is a list.
    """
    if not isinstance(obj, dict):
        return False

    # A missing key and a non-string value are rejected alike.
    if not all(isinstance(obj.get(field), str) for field in ('name', 'description')):
        return False

    schema = obj.get('parameters')
    return (
        isinstance(schema, dict)
        and schema.get('type') == 'object'
        and isinstance(schema.get('properties'), dict)
        and isinstance(schema.get('required'), list)
    )
    
# Notebook-level tool instances that execute_tools() dispatches to:
# `gs` handles "news_api_search", `rt` handles "rag_search".
gs = NewsAPITool()
import json
rt = RagTool()

def execute_tools(tool_call) -> dict:
    """Run the tool named in ``tool_call`` and wrap the outcome in a dict.

    Args:
        tool_call: Mapping with ``name`` (tool identifier) and ``arguments``
            (a dict, or a JSON string that decodes to one).

    Returns:
        A dict with ``tool`` and ``status`` keys. On success it also carries
        ``json_data``/``text`` (news search) or ``data`` (RAG search); on
        failure it carries a ``message``. Exceptions raised by a tool are
        reported this way rather than propagated.
    """
    tool_name = tool_call["name"]
    args = tool_call["arguments"]

    if isinstance(args, str):
        try:
            args = json.loads(args)
        except Exception as e:
            # Stop here: calling the tool with the unparsed string would only
            # replace this message with a less useful downstream error.
            return {
                "tool": tool_name,
                "status": "error",
                "message": f"Failed to parse arguments: {e}"
            }

    try:
        if tool_name == "news_api_search":
            outcome = gs.call(args)
            # When every attempt fails, NewsAPITool.call returns a list with a
            # single error dict instead of the (json_data, page_text) pair.
            if isinstance(outcome, list):
                failure = outcome[0] if outcome else {}
                return {
                    "tool": tool_name,
                    "status": "error",
                    "message": f"{failure.get('error', 'news_api_search failed')}: {failure.get('details', '')}"
                }
            json_data, page_text = outcome
            return {
                "tool": tool_name,
                "status": "success",
                "json_data": json_data,
                "text": page_text,
            }

        if tool_name == "rag_search":
            return {
                "tool": tool_name,
                "status": "success",
                "data": rt.call(args),
            }

        return {
            "tool": tool_name,
            "status": "error",
            "message": f"Unknown tool: {tool_name}"
        }
    except Exception as e:
        return {
            "tool": tool_name,
            "status": "error",
            "message": str(e)
        }

In [238]:
# System prompt passed as the first message of Agent.run(): it asks the model
# to answer with a `news_api_search` tool-call JSON for news-related queries.
SYSTEM_PROMPT = """Generate ONLY this JSON structure ONLY when query is related for news searches, otherwise dont use it:
{
  "name": "news_api_search",
  "arguments": {
    "query": "optimized_search_query",
    "language": "en",
    "max_results": 3
  }
}"""

In [239]:
class AgentMemory(BaseModel):
    """Message buffer for one agent together with its size limit."""
    # Design note: logically this object would be better stored as an item in
    # the session store, so that different memories can have different configs.
    memory_buffer: List[Message] = Field(default_factory=list)
    max_memory_size: int = 10

class AgentSessionTracker(BaseModel):
    """Track one agent session: its ids, its memory and a session store.

    Messages are appended through :meth:`add_message`, which keeps the buffer
    within ``agent_memory.max_memory_size`` by discarding the oldest entries.
    """
    session_id: str = Field(..., description="Unique session ID")
    user_id: str = Field(default_factory=str)
    session_store: Dict[str, List] = Field(default_factory=dict)

    # Memory (buffer + size limit) owned by this session.
    agent_memory: AgentMemory = Field(default_factory=AgentMemory)

    @property
    def memory_buffer(self) -> List[Message]:
        """The live message list, created on demand if it was set to None."""
        if self.agent_memory.memory_buffer is None:
            print(f"[Session: {self.session_id}] Memory buffer was empty. Creating a new one.")
            self.agent_memory.memory_buffer = []
        return self.agent_memory.memory_buffer

    def _check_memory_size(self):
        """Trim the buffer down to ``max_memory_size`` and warn if it shrank.

        The previous version raised ``Warning`` as an exception after removing
        a single message, which aborted ``add_message`` and could leave the
        buffer over its limit. The trim now completes and a warning is issued.
        """
        import warnings

        buffer = self.memory_buffer
        overflow = len(buffer) - self.agent_memory.max_memory_size
        if overflow <= 0:
            return

        # Delete in place so other references to this list (for example one
        # kept in session_store) observe the trimmed buffer too.
        del buffer[:overflow]
        warnings.warn(
            f"[Session: {self.session_id}] Memory buffer size exceeded. "
            f"Removing the oldest message to maintain the limit of {self.agent_memory.max_memory_size}.",
            stacklevel=3,
        )

    def add_message(self, messages: Union[Message, List[Message]]):
        """Append one message or a list of messages, then enforce the limit.

        Raises:
            ValueError: If ``messages`` is neither a ``Message`` nor a list
                made up only of ``Message`` instances.
        """
        if isinstance(messages, list):
            if not all(isinstance(m, Message) for m in messages):
                raise ValueError("All items in the list must be of type Message.")
        elif isinstance(messages, Message):
            messages = [messages]
        else:
            raise ValueError("'messages' must be of type Message or List[Message].")
        self.memory_buffer.extend(messages)
        self._check_memory_size()

    def clean_memory(self):
        """Replace the memory buffer with a new empty list."""
        self.agent_memory.memory_buffer = []

    def get_by_session_id(self, session_id: str) -> List[Message]:
        """Return the buffer stored under ``session_id``.

        When the id is unknown, this tracker's current buffer is registered
        under it first and then returned.
        """
        if session_id not in self.session_store:
            self.session_store[session_id] = self.memory_buffer
        return self.session_store[session_id]

In [240]:
import os
os.environ["GGML_VERBOSE"] = "1"

In [241]:
import uuid
import copy

class Agent:
    """Small tool-calling loop around a chat-completion LLM.

    Each step sends the session memory to ``llm.create_chat_completion``,
    parses the reply with ``extract_tool_call`` and, when a tool is requested,
    runs it through ``execute_tools``. Two tool names are handled:
    ``news_api_search`` and ``rag_search``.
    """

    # Follow-up system prompt asking the model for a ``rag_search`` call with
    # five reformulated queries. Doubled braces are literal braces for
    # ``str.format``; ``{query}`` is the only placeholder.
    _MULTI_QUERY_CALL_PROMPT = """
                    You are an AI language model assistant. Your task is to generate five different versions of the given user query.

                    You must respond with ONLY the following JSON format — do not add explanations, markdown, or natural language:

                    {{
                    "name": "rag_search",
                    "arguments": {{
                        "queries": [
                        "reformulated_query_1",
                        "reformulated_query_2",
                        "reformulated_query_3",
                        "reformulated_query_4",
                        "reformulated_query_5"
                        ]
                    }}
                    }}

                    Original query: {query}
                    """.strip()

    def __init__(self, llm, max_steps=3):
        """Store the LLM handle and the maximum number of loop iterations."""
        self.llm = llm
        self.max_steps = max_steps

    def _build_context(self, memory_buff: List[Message]):
        """Serialise every buffered message with ``model_dump()``."""
        return [msg.model_dump() for msg in memory_buff]

    def run(self, query: str) -> str:
        """Run the agent loop for ``query`` and return the last tool result.

        Returns:
            The value returned by the most recent ``execute_tools`` call.
            If the model answers without requesting a tool before any tool has
            run, the raw model reply is returned instead. ``None`` is returned
            when no step produced a result (e.g. ``max_steps == 0``).
            NOTE(review): the ``-> str`` annotation is kept for compatibility,
            but tool results are indexed like dicts below.
        """
        session_id = str(uuid.uuid4())
        memory_tracker = AgentSessionTracker(session_id=session_id)

        # Messages waiting to be flushed into memory at the start of a step.
        pending = [
            SystemMessage(content=SYSTEM_PROMPT, role="system"),
            UserMessage(content=query, role="user"),
        ]
        # Initialised up front so the early return and the rag_search branch
        # never touch an unbound name.
        tool_results = None

        for step in range(self.max_steps):
            memory_tracker.add_message(pending)
            # Clear after flushing: otherwise the same messages are appended
            # again on every step that does not replace them.
            pending = []
            context = self._build_context(memory_tracker.memory_buffer)

            response = self.llm.create_chat_completion(
                messages=context,
                temperature=0,
                max_tokens=1000,
                response_format={"type": "json_object"},
            )
            content = response['choices'][0]['message']['content']
            print(f"\nStep {step} response:\n{content}")

            tool_call = extract_tool_call(content)
            if not tool_call:
                # No tool requested: hand back what we have, or the plain
                # model answer when no tool has produced anything yet.
                return tool_results if tool_results is not None else content

            print(f"🛠️ Executing tool: {tool_call['name']} with args: {tool_call['arguments']}")

            if tool_call["name"] == "news_api_search":
                tool_results = execute_tools(tool_call)

                if tool_results["status"] == "success":
                    formatted = f"Article content:\n{tool_results['json_data']}"

                    # Prepare the prompt for rag_search: feed the article back
                    # and ask the model for reformulated queries.
                    pending = [
                        LlmMessage(content=content, role="assistant", tool_calls=[tool_results]),
                        UserMessage(content="Tool execution results:\n" + formatted, role="user"),
                        SystemMessage(
                            content=self._MULTI_QUERY_CALL_PROMPT.format(query=query),
                            role="system",
                        ),
                    ]

            elif tool_call["name"] == "rag_search":
                # Copy so the parsed tool call is not mutated; the previous
                # tool output (None if there was none) is passed as page_text.
                rag_call = copy.deepcopy(tool_call)
                rag_call['arguments']['page_text'] = tool_results
                tool_results = execute_tools(rag_call)

        return tool_results

In [242]:
ags = Agent(llm=llm, max_steps=2)

In [243]:
query = "give me the latest insights from crypto's and stock's world"   

### Tool analyzer 
Add a TOOL analyzer and a TOOL tracker.
We need to track tool calls for the sake of logging.
When the model needs to analyze answers from the buffer, it should be able to detect a tool message easily, just by the key 'Role'.

In [244]:
mbmb = ags.run(query)

Llama.generate: 85 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =     857.19 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    5095.82 ms /    37 runs   (  137.72 ms per token,     7.26 tokens per second)
llama_perf_context_print:       total time =    5988.82 ms /    38 tokens



Step 0 response:
{"name": "news_api_search", "arguments": {"query": "cryptocurrency and stock market latest insights", "language": "en", "max_results": 3}}

🛠️ Executing tool: news_api_search with args: {'query': 'cryptocurrency and stock market latest insights', 'language': 'en', 'max_results': 3}


Llama.generate: 122 prefix-match hit, remaining 690 prompt tokens to eval
llama_perf_context_print:        load time =     857.19 ms
llama_perf_context_print: prompt eval time =    1843.17 ms /   690 tokens (    2.67 ms per token,   374.35 tokens per second)
llama_perf_context_print:        eval time =    7818.14 ms /    60 runs   (  130.30 ms per token,     7.67 tokens per second)
llama_perf_context_print:       total time =   11032.06 ms /   750 tokens



Step 1 response:
{"name": "rag_search", "arguments": {"queries": ["latest cryptocurrency and stock market insights", "recent updates on crypto and stock trends", "current developments in crypto and stock markets", "newest news about cryptocurrencies and stocks", "up-to-date analysis of crypto and stock world"]}}

🛠️ Executing tool: rag_search with args: {'queries': ['latest cryptocurrency and stock market insights', 'recent updates on crypto and stock trends', 'current developments in crypto and stock markets', 'newest news about cryptocurrencies and stocks', 'up-to-date analysis of crypto and stock world']}


  self.RagConfig.qdrant_client.recreate_collection(
  results = self.RagConfig.qdrant_client.search(


In [37]:
test_message = LlmMessage(content="Hello", role="assistant", tool_calls=[mbmb])

In [251]:
mbmb['data'][0]

[{'score': 0.56748736,
  'text': 'Google Finance supports real-time data for popular cryptocurrencies like Bitcoin, Ethereum, and Solana, along with major fiat currencies like the USD, EUR, and GBP.',
  'meta': {'title': 'How Google Finance Can Help You Manage Your Investments in 2025',
   'source': 'Typeforyou.org',
   'url': 'https://typeforyou.org/google-finance/',
   'published_at': '2025-05-11T16:00:08Z',
   'description': 'In today’s dynamic financial landscape, staying on top of your investments is more important than ever. With markets constantly shifting and new asset classes like...\nThe post How Google Finance Can Help You Manage Your Investments in 2025 first appeared on T…',
   'image_url': 'https://typeforyou.org/wp-content/uploads/2025/05/Screenshot-2025-05-11-180531.png'}},
 {'score': 0.5485753,
  'text': 'In today’s dynamic financial landscape, staying on top of your investments is more important than ever. With markets constantly shifting and new asset classes like cr

In [None]:
mbmb['js']

KeyError: 'json_data'

In [64]:
radi_boga

[[Document(metadata={}, page_content='B'),
  Document(metadata={}, page_content='r'),
  Document(metadata={}, page_content='u'),
  Document(metadata={}, page_content='c'),
  Document(metadata={}, page_content='e'),
  Document(metadata={}, page_content='S'),
  Document(metadata={}, page_content='p'),
  Document(metadata={}, page_content='r'),
  Document(metadata={}, page_content='i'),
  Document(metadata={}, page_content='n'),
  Document(metadata={}, page_content='g'),
  Document(metadata={}, page_content='s'),
  Document(metadata={}, page_content='t'),
  Document(metadata={}, page_content='e'),
  Document(metadata={}, page_content='e'),
  Document(metadata={}, page_content='n'),
  Document(metadata={}, page_content='&'),
  Document(metadata={}, page_content='a'),
  Document(metadata={}, page_content='m'),
  Document(metadata={}, page_content='p'),
  Document(metadata={}, page_content=';'),
  Document(metadata={}, page_content='T'),
  Document(metadata={}, page_content='h'),
  Document(

## TOOL IDEA
Code a tool that takes only the hot key moments from the news page and then gives the user the connection between those hot takes and the actual page.


### Check notes about storing memory

In [None]:
# Request a streamed chat completion. `messages` is not defined in this cell,
# so it must come from an earlier one. With stream=True the result is consumed
# chunk by chunk by the printing loop in the next cell.
response = llm.create_chat_completion(
    messages=messages,
    temperature=0.1,
    max_tokens=400,
    stream=True,
)

In [None]:
# Echo the streamed completion as it arrives: every chunk that carries a
# non-empty 'content' delta is printed without a trailing newline.
for chunk in response:
    # Guard first: a chunk without any choice carries nothing to print.
    if 'choices' not in chunk or len(chunk['choices']) == 0:
        print("No valid response received.")
        continue

    content = chunk['choices'][0]['delta'].get('content', '')
    if content:
        print(content, end='', flush=True)

In [None]:
# Sample messages used to probe the chat template: an assistant greeting,
# a user question, and a system instruction describing the tool-call JSON.
sss = LlmMessage(
    role="assistant",
    content="I am an AI assistant. How can I help you?",
)
sss2 = UserMessage(
    role="user",
    content="What is the latest update on stock market trends?",
)
# System instruction: the model should emit the news_api_search JSON only for
# news-related queries.
sss3 = SystemMessage(
    role="system",
    content="Generate ONLY this JSON structure ONLY when query is related for news searches, otherwise dont use it: {\"name\": \"news_api_search\", \"arguments\": {\"query\": \"optimized_search_query\", \"language\": \"en\", \"max_results\": 3}}"
)

In [None]:
SystemMessage(content=SYSTEM_PROMPT, role="system").model_dump()

{'role': 'system',
 'content': 'Generate ONLY this JSON structure ONLY when query is related for news searches, otherwise dont use it:\n{\n  "name": "news_api_search",\n  "arguments": {\n    "query": "optimized_search_query",\n    "language": "en",\n    "max_results": 3\n  }\n}'}

In [None]:
message_1 = {"role": "system", "content": SYSTEM_PROMPT}
message_2 = {"role": "user", "content": "What is the latest update on stock market trends?"}

In [None]:
# Streamed completion over the three sample messages, sent in the order
# system -> assistant -> user. The stream is consumed by a later printing loop.
response = llm.create_chat_completion(
    messages=[sss3.model_dump(), sss.model_dump(), sss2.model_dump()],
    temperature=0.1,
    max_tokens=400,
    stream=True,
)

In [None]:
print(tokenizer.apply_chat_template([sss3.model_dump(), sss.model_dump(), sss2.model_dump()], tokenize=False))

<|im_start|>system
Generate ONLY this JSON structure ONLY when query is related for news searches, otherwise dont use it: {"name": "news_api_search", "arguments": {"query": "optimized_search_query", "language": "en", "max_results": 3}}<|im_end|>
<|im_start|>assistant
I am an AI assistant. How can I help you?<|im_end|>
<|im_start|>user
What is the latest update on stock market trends?<|im_end|>



In [None]:
# Print the streamed reply incrementally; chunks lacking choices are reported
# and skipped.
for chunk in response:
    if 'choices' not in chunk or len(chunk['choices']) == 0:
        print("No valid response received.")
        continue

    content = chunk['choices'][0]['delta'].get('content', '')
    if content:
        print(content, end='', flush=True)

Llama.generate: 21 prefix-match hit, remaining 70 prompt tokens to eval


<think>
Okay, the user is asking about the latest update on stock market trends. I need to figure out how to respond. Let me check the instructions again. The user mentioned generating a JSON structure only when the query is related to news searches. Otherwise, don't use it.

So, the query here is about stock market trends. That's a news-related topic. Therefore, I should generate the JSON structure. The JSON should have the name "news_api_search" with arguments including the query, language set to English, and max_results as 3. 

Wait, the user's query is "latest update on stock market trends." I need to make sure the optimized_search_query is correctly formatted. Maybe use keywords like "stock market trends latest" to get the most relevant results. Also, check that the language is "en" and max_results is 3. 

I should structure the JSON exactly as specified. Let me double-check the syntax to avoid any errors. The keys should be in quotes, and the structure should be correct. Alright,

llama_perf_context_print:        load time =     859.90 ms
llama_perf_context_print: prompt eval time =     734.98 ms /    70 tokens (   10.50 ms per token,    95.24 tokens per second)
llama_perf_context_print:        eval time =   38535.33 ms /   253 runs   (  152.31 ms per token,     6.57 tokens per second)
llama_perf_context_print:       total time =   39683.94 ms /   323 tokens
