In [1]:
# By "chat model" we mean an LLM that operates on chat messages.
import os 
import json
from openai import OpenAI

import threading

from langchain_openai import ChatOpenAI 
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate

from pydantic import BaseModel, Field
from typing import Literal , Optional

## check performance in agent workflow 
from typing import TypedDict, Annotated, List, Dict, Optional
from langchain_core.messages import BaseMessage, AnyMessage, ToolMessage,HumanMessage, AIMessage, SystemMessage
from langgraph.graph import add_messages , START, END , StateGraph
from IPython.display import Image, display 
from langgraph.checkpoint.memory import MemorySaver

from googleapiclient.discovery import build
from urllib.parse import urlparse

# prepare model for embeddings  
from langchain_openai import OpenAIEmbeddings 
from langchain_core.tools import tool, StructuredTool


from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 
import numpy.typing as npt

from tavily import TavilyClient 

import subprocess
from IPython.display import Image, display


## API credentials
# Every key is read from the process environment; a variable that is not set
# yields None.
# NOTE: do not print these values -- notebook outputs are saved and shared.

# OpenAI
api_key_var = os.getenv("OPENAI_API_KEY")

# Tavily
tavily_api = os.getenv("TAVILY_API_KEY")

# Google
api_google = os.getenv('GOOGLE_API_KEY')

## Search engine identifier
# NOTE(review): presumably the Google Programmable Search Engine id -- confirm.
SEARCH_ENGINE_ID = '26dda816634bd4044'





In [2]:
# Tavily search endpoint reference:
# https://docs.tavily.com/documentation/api-reference/endpoint/search

tavily_client = TavilyClient(tavily_api)
query = "Lukoil money laundering"

# Keyword arguments forwarded to `tavily_client.search`.
search_parameters = dict(
    # The topic is meant to be chosen by the LLM from: news, finance, general.
    topic="news",
    search_depth="basic",
    chunks_per_source=1,
    max_results=3,
    # Must be 'day', 'week', 'month', 'year' (or 'd', 'w', 'm', 'y');
    # passing 3 produces "BadRequestError: Invalid time range."
    time_range='y',
    # include_answer=True would add an LLM-generated summary of the search
    # results; it is skipped here.
    include_images=False,
    # Not used here: include_domains, exclude_domains, country.
)

response = tavily_client.search(query=query, **search_parameters)


In [3]:
response

{'query': 'Lukoil money laundering',
 'follow_up_questions': None,
 'answer': None,
 'images': [],
 'results': [{'url': 'https://www.reuters.com/business/energy/russias-lukoil-says-it-intends-sell-international-assets-due-western-sanctions-2025-10-27/',
   'title': "Russia's Lukoil says it plans to sell international assets due to Western sanctions - Reuters",
   'score': 0.4600226,
   'published_date': 'Mon, 27 Oct 2025 19:50:16 GMT',
   'content': "MOSCOW, Oct 27 (Reuters) - Russia's second-largest oil producer, Lukoil (LKOH.MM), opens new tab, said on Monday it would sell its international assets following sanctions over Ukraine announced last week by the United States. On October 22, U.S. President Donald Trump imposed Ukraine-related sanctions on Russia's largest oil companies, Lukoil and Rosneft (ROSN.MM), opens new tab. REUTERS/Maxim Shemetov Purchase Licensing Rights, opens new tab Our Standards: The Thomson Reuters Trust Principles., opens new tab *   About Reuters, opens new 

In [4]:
response.get("results")  # list with dictionaries
# Next step: extract the domain from each result's 'url'.

[{'url': 'https://www.reuters.com/business/energy/russias-lukoil-says-it-intends-sell-international-assets-due-western-sanctions-2025-10-27/',
  'title': "Russia's Lukoil says it plans to sell international assets due to Western sanctions - Reuters",
  'score': 0.4600226,
  'published_date': 'Mon, 27 Oct 2025 19:50:16 GMT',
  'content': "MOSCOW, Oct 27 (Reuters) - Russia's second-largest oil producer, Lukoil (LKOH.MM), opens new tab, said on Monday it would sell its international assets following sanctions over Ukraine announced last week by the United States. On October 22, U.S. President Donald Trump imposed Ukraine-related sanctions on Russia's largest oil companies, Lukoil and Rosneft (ROSN.MM), opens new tab. REUTERS/Maxim Shemetov Purchase Licensing Rights, opens new tab Our Standards: The Thomson Reuters Trust Principles., opens new tab *   About Reuters, opens new tab *   Reuters News Agency, opens new tab *   Reuters and AI, opens new tab *   Reuters Leadership, opens new tab 

In [6]:
from urllib.parse import urlparse

# Sample article URL used to try out the domain extraction.
url = "https://www.themoscowtimes.com/2025/11/07/gunvor-pulls-22b-lukoil-deal-after-us-labels-company-kremlin-puppet-a91071"

# The base URL is the scheme and the network location joined by "://".
parsed = urlparse(url)
base_url = "://".join((parsed.scheme, parsed.netloc))
# base_url is now "https://www.themoscowtimes.com"

def extract_domain(url: str) -> str:
    """Return the base URL (``scheme://netloc``) of ``url``.

    Parameters
    ----------
    url : str
        Absolute URL, e.g. ``"https://www.example.com/some/page"``.

    Returns
    -------
    str
        ``"<scheme>://<netloc>"`` for an absolute URL. An empty string is
        returned when ``url`` is missing (``None`` / empty), is not a string,
        cannot be parsed, or has no scheme or no network location, so that
        callers never receive placeholders such as ``"://"``.
    """
    # Search results may lack a URL; urlparse(None) would otherwise yield
    # bytes components that format as "b''://b''".
    if not url or not isinstance(url, str):
        return ""

    try:
        parsed = urlparse(url.strip())
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
        return ""

    # Without both parts there is no meaningful base URL to report.
    if not parsed.scheme or not parsed.netloc:
        return ""

    return f"{parsed.scheme}://{parsed.netloc}"

extract_domain(url)


'https://www.themoscowtimes.com'

In [7]:
## The output of every search engine must follow the LinkCollection pattern,
## because it is passed on to the Tavily extraction node.
## We therefore also need to trace which search functions were used for a search,
## so an additional class attribute is needed to track which search functions were triggered.

# Record describing a single search hit and the data later attached to it.
# Only `displayLink` and `link` are required; every other field has a default.
# (An earlier one-field stub of this class, which was shadowed immediately by
# this definition, has been removed.)
class TavilyLinkCollection(BaseModel):
    # Short form of the result's address (usually the domain).
    displayLink: str = Field(description="The display URL shown in search results (usually domain name)")
    # Full address of the result.
    link: str = Field(description="The full URL of the search result")
    # Text extracted from `link`; empty until it is filled in.
    raw_content: str = Field(default="", description="Content extracted from URL")
    # Summary of `raw_content`; empty until it is filled in.
    summary: str = Field(default="", description="Summary of content extracted from URL")
    # Single claim category assigned to the article; defaults to "other".
    claim_type: str = Field(default="other", description="Each article must fit specific only 1 claim type")
    # Publication date as text, or None when it has not been determined.
    date_published: Optional[str] = Field(
        default=None,
        description="Publication date extracted from content (format: YYYY-MM-DD, or 'Unknown' if not found)",
    )
       


In [None]:
# common error BadRequestError: Max 20 URLs are allowed.
# https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#try-it 

# Pydantic schema describing the arguments of a Tavily search request.
# Only `query` is required; every other field falls back to its default.
class TavilySearchSchema(BaseModel):
    # Free-text search query.
    query: str = Field(description="Search query")

    # Search category.
    topic: Literal['news', 'finance', 'general'] = Field(
        default='general',
        description="Search topic: 'news' for news articles, 'finance' for financial data, 'general' for web search",
    )

    # How far back results may date; long and one-letter forms are both accepted.
    time_range: Literal['day', 'week', 'month', 'year', 'd', 'w', 'm', 'y'] = Field(
        default='month',
        description="Time range filter: 'day'/'d', 'week'/'w', 'month'/'m', 'year'/'y'",
    )

    # Depth of the search.
    search_depth: Literal['basic', 'advanced'] = Field(
        default='basic',
        description="Search depth: 'basic' for quick results, 'advanced' for comprehensive search",
    )

    # Number of content snippets per source.
    chunks_per_source: int = Field(
        default=1,
        description="Number of content chunks to extract per source",
    )

    # Upper bound on the number of returned results.
    max_results: int = Field(
        default=10,
        description="Maximum number of search results to return",
    )

    # Whether an AI-generated answer summary is requested.
    include_answer: bool = Field(
        default=False,
        description="Whether to include AI-generated answer summary",
    )

    # Interface language code.
    hl: str = Field(
        default="en",
        description="Interface language: ro, en, ru, fr, de",
    )


## define function for tool schema
def tavily_search(  # only `query` is required; defaults mirror TavilySearchSchema
            query: str,
            hl: str = "en",
            topic: str = "general",
            time_range: str = "month",
            search_depth: str = "basic",
            chunks_per_source: int = 1,
            max_results: int = 10,
            include_answer: bool = False
        ) -> List[Dict[str, str]]:

    """
    Search Information using Tavily Search tool.

    This function performs search using Tavily search engine specifically designed for AI agents.
    Agent Customizable Parameters can be adjusted by agent.

    Parameters:
    -----------
    include_answer : boolean
        Include an LLM-generated answer to the provided query. Always False.
    chunks_per_source : int
        Chunks are short content snippets (maximum 500 characters each) pulled directly from the source. Always equal to 1.
    max_results : int
        The maximum number of search results to return. Always equal to 10.

    Agent Customizable Parameters:
    -----------------------------
    query : str
        The search query to execute with Tavily. Query is passed in HumanMessage

    search_depth : str (default: "basic")
        The depth of the search. advanced search is tailored to
        retrieve the most relevant sources and content snippets for your query,
        while basic search provides generic content snippets from each source.

    topic : str (default: "general")
        The category of the search.
        Available options: general, news, finance

    time_range : str (default: "month")
        The time range back from the current date to filter results based on
        publish date or last updated date.
        Available options: day, week, month, year, d, w, m, y

    hl : str, always included (default: "en")
        Language the query is translated into before searching.
        Examples: "en", "ru", "de", "fr", "ro"

    Returns:
    --------
    list
        List of dictionaries, one per search result that has a URL, with keys:
        - 'query': the translated query that was sent to Tavily
        - 'link': Result URL
        - 'displayLink' : domain
        The list is empty when the search fails or returns nothing.
    """

    print("Executing tavily_search_payload ...")

    tavily_client = TavilyClient(tavily_api)

    # The query language must be aligned with the requested language `hl`.
    translated_query = translate_query_for_search(query, hl, llm_translation_or_terms)

    all_results = []

    try:
        # `hl` is not forwarded: it is not among Tavily's documented search
        # parameters and is only used for the query translation above.
        search_response = tavily_client.search(
            query=translated_query,
            topic=topic,
            time_range=time_range,
            search_depth=search_depth,
            chunks_per_source=chunks_per_source,
            max_results=max_results,
            include_answer=include_answer,
        )

        # Keep only the fields that later steps need from each result.
        for item in search_response.get("results") or []:
            url = item.get("url")
            if not url:
                # A result without a URL can be neither linked nor extracted.
                continue
            all_results.append({
                'query': translated_query,
                'link': url,
                'displayLink': extract_domain(url),
            })

    except Exception as e:
        # Best effort: a failed search is reported and yields the results
        # collected so far (possibly none) instead of aborting the caller.
        print(f"Error getting results: {e}")

    print("Executing tavily_search_payload DONE ")

    return all_results

