# **Relevance Classifier**

**Model Requirement**: Llama 3.2 3B

**Task:** Design a prompt that evaluates if a given research paper title and abstract is relevant to the user query on a scale of 1-4 (low to high).

**Instructions:**
1. Define clear criteria for each relevance level (1-4)
2. Consider paper title, abstract, and user question as input.
3. Provide structured output with justification.
4. You can take random title, abstract and queries from any internet sources.

In [1]:
%%capture
# for openai api structures
!pip install openai
!pip install chromadb


In [2]:
# custom exception handler
import traceback
import os
import sys
async def handle_exception(exception : Exception):
    exception_type = type(exception).__name__
    exception_message = str(exception)
    exception_traceback = traceback.extract_tb(exception.__traceback__)
    line_number = exception_traceback[-1].lineno
    print(f"Exception Type: {exception_type}")
    print(f"Exception Message: {exception_message}")
    print(f"Line Number: {line_number}")
    print("Full Traceback:")
    print("".join(traceback.format_tb(exception.__traceback__)))

    return {'error': str(exception)}


Retry on rate limiting due to limited concurrent requests which will eventually solve future production bug

In [3]:
from typing import List, Dict, Union, Any, Optional,AsyncGenerator,Tuple
from openai import AsyncOpenAI
import json
import re

llama3b = 'meta-llama/Llama-3.2-3B-Instruct'
hf_base_url = 'https://api-inference.huggingface.co/v1/'

# meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo
# "https://api.deepinfra.com/v1/openai"


class GenResponse:
    # default model ID
    DEFAULT_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

    def __init__(self, api_key: str,base_url : str, openai:bool = False):
        if not api_key or not isinstance(api_key, str):
            raise ValueError("API key is required and must be a string.")

        if (not base_url or not isinstance(base_url, str)) and not openai:
            raise ValueError("Base Url is required and must be a string.")

        self.key = api_key
        self.client = AsyncOpenAI(
            api_key=self.key,
            base_url = base_url,
        ) if not openai else AsyncOpenAI(api_key= api_key)

    async def get_response(
        self,
        query: str,
        system: str,
        chat_history: Optional[List[Dict]] = None,
        model_id: Optional[str] = DEFAULT_MODEL_ID,
        temperature: float = 0.5,
        max_tokens: int = 500,
        top_p: float = 0.95,
        top_k: int = 500,
        frequency_penalty: float = 0,
        presence_penalty: float = 0,
        stop: Optional[List[str]] = None,
        response_format: Optional[Dict] = None,
        tools: Optional[List[Dict]] = None,
    ) -> Union[str, Dict[str, Any]]:

        # required inputs
        if not query or not isinstance(query, str):
            raise ValueError("Query is required and must be a string.")
        if not system or not isinstance(system, str):
            raise ValueError("System is required and must be a string.")
        if chat_history is not None and not isinstance(chat_history, list):
            raise TypeError("Chat history must be a list of dictionaries.")

        # chat history if not provided
        if chat_history is None:
            chat_history = []


        messages = [
            {"role": "system", "content": system},
            *chat_history,
            {"role": "user", "content": query},
        ]

        try:

            response = await self.client.chat.completions.create(
                model=model_id,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
                response_format=response_format,
                messages=messages,
                tools=tools,
            )

            return response



        except Exception as e:
          error = await handle_exception(e)
          return error

    async def get_stream_response(
        self,
        query: str,
        system: str,
        chat_history: Optional[List[Dict]] = None,
        model_id: Optional[str] = DEFAULT_MODEL_ID,
        temperature: float = 0.5,
        max_tokens: int = 500,
        top_p: float = 0.95,
        top_k: int = 1000,
        stop: Optional[List[str]] = None,
        response_format: Optional[Dict] = None,
    ) -> AsyncGenerator:

        # required inputs
        if not query or not isinstance(query, str):
            raise ValueError("Query is required and must be a string.")
        if not system or not isinstance(system, str):
            raise ValueError("System is required and must be a string.")
        if chat_history is not None and not isinstance(chat_history, list):
            raise TypeError("Chat history must be a list of dictionaries.")

        # chat history if not provided
        if chat_history is None:
            chat_history = []


        messages = [
            {"role": "system", "content": system},
            *chat_history,
            {"role": "user", "content": query},
        ]

        try:

            response = await self.client.chat.completions.create(
                model=model_id,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
                messages=messages,
                stream=True,
            )

            full_content = ''
            async for event in response:
              if event.choices[0].finish_reason:
                print(event.choices[0].finish_reason)
                yield full_content

              else:
                print(event.choices[0].delta.content)
                full_content += event.choices[0].delta.content

                yield event.choices[0].delta.content


        except Exception as e:
          error = await handle_exception(e)
          yield error

    async def format_sse(self,data: str, event: Optional[str] = None) -> str:
      """Formats the data into a Server-Sent Events (SSE) format.

      Args:
        data (str): The data to be formatted.
        event (Optional[str]): The event type (e.g., 'event' or 'body'). Defaults to None.

      Returns:
        str: The formatted SSE data.

      Example:
        >>> await format_sse("Hello, world!", "text")
        'event: text\ndata: "Hello, world!"\n\n'
         """
      formatted_data = f'data: {json.dumps(data)}\n\n'

      if event:
        formatted_data = f'event: {event}\n{formatted_data}'

      return formatted_data

    async def extract_xml(self,text: str, tag: str) -> str:
      """
      Extracts the content of the specified XML tag from the given text. Used for parsing structured responses
      Args:
        text (str): The text containing the XML.
        tag (str): The XML tag to extract content from.
      Returns:
        str: The content of the specified XML tag, or an empty string if the tag is not found.
      """
      match = re.search(f'<{tag}>(.*?)</{tag}>', text, re.DOTALL)
      return match.group(1) if match else None

    async def get_json(self, text: str) -> Dict:
      """
      Extracts the JSON content from the given text.
      Args:
        text (str): The text containing the JSON.
      Returns:
        Dict: The JSON content as a Python dictionary.
      """
      try:
        json_content = json.loads(text)
        return json_content
      except Exception as e:
        error = await handle_exception(e)
        return error

**Research paper Relevance Classifier**

1. **Relevance** - As Relevance have 4 levels. I will go with the minimum relevance or irrelevant to Most relevant (title and Abstract is semantically,keywords,aspect wise ,etc showing most relevance)

2. **Prompt**- As llama 3.2 model has been trained on scientific data .**Zero Shot** prompting should work.


In [4]:
from pydantic import BaseModel, Field

class RCOutput(BaseModel):
    analysis: str = Field(
        description="A detailed analysis of the relevance criteria."
    )
    justification: str = Field(
        description="A brief explanation of why the paper received the given score, based on the alignment of the title, abstract, and user query."
    )
    relevance_level: int = Field(
        description=" A score between 1 and 4 indicating the relevance of the paper to the user query."
    )

output_json = RCOutput.model_json_schema()
print(output_json)

{'properties': {'analysis': {'description': 'A detailed analysis of the relevance criteria.', 'title': 'Analysis', 'type': 'string'}, 'justification': {'description': 'A brief explanation of why the paper received the given score, based on the alignment of the title, abstract, and user query.', 'title': 'Justification', 'type': 'string'}, 'relevance_level': {'description': ' A score between 1 and 4 indicating the relevance of the paper to the user query.', 'title': 'Relevance Level', 'type': 'integer'}}, 'required': ['analysis', 'justification', 'relevance_level'], 'title': 'RCOutput', 'type': 'object'}


In [6]:
# Role prompting, or persona prompting,
# SimToM (Simulated Theory of Mind)
# RaR,R2e

zero_shot_system = """<role>
You are a expert research paper relevance classifier who will always give output in json format. Before classification you will analyze title and abstract of the research paper in multi aspect relative to user query\
.</role>
<task>
Evaluate  and reason the relevance of a given research paper's **title** and **abstract** to a user query on a scale of 1-4 (low to high).
</task>

<inputs>

<title>
{title}
</title>

<abstract>
{abstract}
</abstract>

</inputs>


<guidelines>
Let's think step by step-
1. Rephrase and expand the user query if ambiguous for reasoning.
2. Break user query in multi aspect, analyze all scientific keywords or their semantic meaning,topic  and what actually user is looking for.
3. Analyze title's alignmnet with user query. provide a score between 0 to 1.
4. Analyze abstrcat's alignmnet with user query. provide a score between 0 to 1.
5. Finally provide a relevance_level between 0 to 1 by following this:
<relevance_level>
1 (Low Relevance): The paper title and abstract have no direct connection to the user query. The topic, keywords, and context do not align.
2 (Moderate Relevance): The paper title or abstract contains some related keywords or concepts, but the connection is weak or tangential. The paper may touch on a broader topic without addressing the specific query.
3 (High Relevance): The paper title and abstract align well with the user query. The topic, keywords, and context are closely related, and the paper likely provides useful insights or answers to the query.
4 (Very High Relevance): The paper title and abstract are highly aligned with the user query. The topic, keywords, and context are directly relevant, and the paper is likely to provide a comprehensive answer or solution to the query.
</relevance_level>
</guidelines>

You will always give output in this valid Json format only which will be consume by automated json parsing system:
{{
    "analysis": "Your analysis as per guidelines",
    "justification": "A brief explanation of why the paper received the given score, based on the alignment of the title, abstract, and user query",
    "relevance_level": "A score between 1 and 4 indicating the relevance of the paper to the user query."
}}
"""

async def generate_prompt(title,abstract):
  return zero_shot_system.format(
      title = title,
      abstract = abstract,
  )

In [7]:
paper = ("""Tab-CoT: Zero-shot Tabular Chain of Thought""","""The chain-of-though (CoT) prompting methods
were successful in various natural language processing (NLP) tasks thanks to their ability to
unveil the underlying complex reasoning processes. Such reasoning processes typically exhibit implicitly structured steps. Recent efforts
also started investigating methods to encourage
more explicitly structured reasoning procedures
to be captured (Zhou et al., 2022). In this work,
we propose Tab-CoT, a novel tabular-format
CoT prompting method, which allows the complex reasoning process to be explicitly modelled in a highly structured manner. Despite
its simplicity, we show that our approach is capable of performing reasoning across multiple
dimensions (i.e., both rows and columns). We
demonstrate our approach’s strong zero-shot
and few-shot capabilities through extensive experiments on a range of reasoning tasks.""")


In [8]:
system = await generate_prompt(
    title = paper[0],
    abstract = paper[1],

)
print(system)

<role>
You are a expert research paper relevance classifier who will always give output in json format. Before classification you will analyze title and abstract of the research paper in multi aspect relative to user query.</role>
<task>
Evaluate  and reason the relevance of a given research paper's **title** and **abstract** to a user query on a scale of 1-4 (low to high).
</task>

<inputs>

<title>
Tab-CoT: Zero-shot Tabular Chain of Thought
</title>

<abstract>
The chain-of-though (CoT) prompting methods
were successful in various natural language processing (NLP) tasks thanks to their ability to
unveil the underlying complex reasoning processes. Such reasoning processes typically exhibit implicitly structured steps. Recent efforts
also started investigating methods to encourage
more explicitly structured reasoning procedures
to be captured (Zhou et al., 2022). In this work,
we propose Tab-CoT, a novel tabular-format
CoT prompting method, which allows the complex reasoning process t

In [9]:
from google.colab import userdata
DEEPINFRA_TOKEN = userdata.get('DEEP_INFRA_KEY')
HF_TOKEN = userdata.get('HF_TOKEN')
DEEP_INFRA_BASE_URL = 'https://api.deepinfra.com/v1/openai'

In [10]:
# llm handling class
model = GenResponse(api_key=DEEPINFRA_TOKEN,base_url = DEEP_INFRA_BASE_URL)

async def classify_paper(
    query: str,
    system: str,
    title :str,
    abstract: str,
    llm_handler : GenResponse = model
    )->Dict:

    # system formatting
    system = system.format(
      title = title,
      abstract = abstract,
    )
    response = await llm_handler.get_response(
        system = system,
        query = f"<query>{query}</query>",
        chat_history = [],
        temperature = .5,
        max_tokens = 1000,
        response_format = {"type": "json"}
        )

    if isinstance(response,Dict):
      return response
    # json validate
    json_response = await llm_handler.get_json(response.choices[0].message.content)
    # try with feedback
    if 'error' in json_response:
       print("Feedback")
       feedback = f"This is json parsing error in your last response :\n{json_response['error']}.\nLast Response :\n{response.choices[0].message.content}.\n Correct it and make sure all the keys and values are present in valid json format."
       chat_history = [
           {"role": "system", "content": system},
          {"role": "user", "content": f"<query>{query}</query>"},
          {"role": "assistant", "content": response.choices[0].message.content}
           ]
       response = await llm_handler.get_response(
          system = system,
          query = f"<feedback>{feedback}</feedback>",
          chat_history = chat_history,
          temperature = .5,
          max_tokens = 1000,
          response_format = {"type": "json"}
          )
       json_response = await llm_handler.get_json(response.choices[0].message.content)
    return json_response


In [11]:

model = GenResponse(api_key=DEEPINFRA_TOKEN,base_url = DEEP_INFRA_BASE_URL)
await classify_paper(
    query = 'what is tabular chain of thought',
    system = zero_shot_system,
    title = paper[0],
    abstract = paper[1],
    llm_handler = model
)

{'analysis': "The user query 'what is tabular chain of thought' can be rephrased as 'Tabular Chain of Thought: A method for explicit reasoning in tabular formats'. The query breaks down into multi-aspects: 'Tabular', 'Chain of Thought', and 'explicit reasoning'. The scientific keywords 'Tabular CoT' and 'explicit reasoning' are present in the paper title. The topic of the query is related to Natural Language Processing (NLP) and Reasoning. The context of the query is inquiring about a specific method or approach. The paper title and abstract are aligned with the query as they discuss a novel method 'Tab-CoT' for explicit reasoning in tabular formats.",
 'justification': "The query is closely related to the paper title and abstract as they discuss a novel method 'Tab-CoT' for explicit reasoning in tabular formats. The query is not ambiguous and directly inquires about the topic of the paper. The alignment between the query and the paper is high due to the presence of similar keywords an

In [12]:

tokens = model.get_stream_response(
    query='hi',
    system = " ",
    temperature = .5,
    max_tokens = 100,
)
async for token in tokens:
  print(token)



How
How
 are
 are
 you
 you
 today
 today
?
?


stop
How are you today?


# **RAG**

In [13]:
import google.generativeai as genai
MODEL_ID = 'models/gemini-1.5-flash'
GEMINI_KEY = "AIzaSyBGi3Z_7Ns_DtSAG_LWJwe2C1YkXaMG7AI"
GEMINI_BASE_URL = 'https://generativelanguage.googleapis.com/v1beta/openai/'

In [14]:
model = GenResponse(api_key=GEMINI_KEY,base_url = GEMINI_BASE_URL)
tokens = model.get_stream_response(
    query='hi',
    system = " ",
    model_id = 'gemini-1.5-flash',
    temperature = .5,
    max_tokens = 100,
)
async for token in tokens:
  print(token)

Hi
Hi
 there! How can I help you today?

 there! How can I help you today?



In [15]:
genai.configure(api_key=GEMINI_KEY)
def embedding_model(user_message : str) -> List[float]:
  embedding = genai.embed_content(
        model="models/text-embedding-004",
        content=user_message
        )
  return embedding['embedding']


In [16]:
import chromadb
import pandas as pd

max_results = 10
train_data_path = 'train_data.xlsx'
chromadb_path = 'chromadb'

class RetrievalPipeline(object):
    """
    Manages all the fuction related to retrival and vector database
    """
    class MyEmbeddingFunction(chromadb.EmbeddingFunction):
        """
        This is a custom function that generates embeddings for text data using the given model.
        """
        def __call__(self, Docs: chromadb.Documents) -> chromadb.Embeddings:
            """
            This function generates embeddings for a list of text documents using the given model.
            Args:
                Docs (chromadb.Documents): A list of text documents.
            Returns:
                chromadb.Embeddings: A list of embeddings (numerical representations) for the input text documents.
            """
            embeddings = [embedding_model(chunk) for chunk in Docs]
            return embeddings

    def __init__(
            self,
            chromadb_path:str = chromadb_path,
            train_data_path:str = train_data_path,
            collection_name : str = 'contexts'
        ) -> None:

        self.client = chromadb.PersistentClient(path=chromadb_path)
        self.training_data_path = train_data_path
        self.collection_name = collection_name

    def train(self, train:bool = False) -> None:
        if train or not self.client.list_collections():

            self.client.get_settings().allow_reset=True

            self.client.reset()
            print("All the collections has been removed")

            excel_data = pd.read_excel(self.training_data_path)
            print(excel_data.shape)

            print(f"Starting training for {self.collection_name}")

            collection = self.client.create_collection(
                name= self.collection_name,
                embedding_function=self.MyEmbeddingFunction(),
                metadata={"hnsw:space": "cosine"}
                )
            print("Collection has been created")
            collection.add(
                documents=excel_data['chunks'].to_list(),
                ids=excel_data.index.astype(str).to_list()

                )
            print("Data has been loaded succesfully")



    async def retrieve_chunks(
            self,
            user_message : str,
            )->Tuple[str,Dict]:
        """
        """
        try:
            collection_name = self.collection_name

            vectordb = self.client.get_collection(collection_name,embedding_function=self.MyEmbeddingFunction())
            results = vectordb.query(query_texts = user_message, n_results = max_results)

            chunks = results['documents'][0]
            return chunks

        except Exception as e:
          error = await handle_exception(e)
          return error

In [17]:
async def generative_prompt(all_chunks : List[str],system :str) -> str:
    """This function will return final system_prompt for bot,which can be directly used without any modification.

    Args:
        all_chunks (List[str]): List of chunks retrieved from vector database

    Returns:
        str: Final System Bot Prompt
    """
    if not isinstance(all_chunks,list):
        raise ValueError("all_chunks must be a string")

    if len(all_chunks)>0:
        knowledge_source = ''
        for i,doc in enumerate(all_chunks):
            knowledge_source += f"content_{i}: {doc} \n\n"
    else:
        knowledge_source = "NO DATA"

    system_prompt = system.format(knowledge_source = knowledge_source)
    return system_prompt

In [19]:
EXPAND_QUERY_PROMPT = """You are expert in contextual query rephraser for better similarity search retrieval for research paper content chatbot.

Task
- Break current query in different segments if user is looking for more topic.
- Rephrase and complete the current query by generating alternative versions using previous conversation to make complete contextual query .
- Alternate queries should be related to current query if it is not completely different.
- The alternative queries should not change the actual semantic meaning of current user query.
- Alternate queries should not be more than 2.

Inputs
prev_conversation:
{prev_conv}

current query: {query}

Return a json response with a single key `rephrased_query` ,value as a list of generated alternate query as string-
{{
    "rephrased_query" :List[str] (list of alternative queries.)
}}
You can not return anything apart from List of generated queries which should be parsed by python.
"""

In [20]:
async def get_expanded_query(
    user_message: str,
    chat_history: List[Dict],
    llm_handler : GenResponse
) -> str:
    """
    Asynchronously generates an expanded query based on the user message and chat history.

    It constructs a prompt using the provided parameters, sends it to the LLM, and processes the response to
    generate an expanded query. If the LLM fails to generate a valid response, the original user
    message is returned as the fallback.

    Args:
        user_message (str): The user's input message.
        chat_history (List[Dict]): The history of the conversation as a list of dictionaries.

    Returns:
        str: The expanded query in the format ``Ques : expanded_query``. If the LLM fails or
             returns an invalid response, the original user message is returned as the fallback.

    Raises:
        Any exceptions raised by the LLM or JSON parsing are caught and logged, but the function
        does not raise them further. Instead, it returns an empty string or the original message.
    """
    try:
        # Construct the prompt for query expansion

        chats = ''
        for chat in chat_history:
          if chat['role'] == 'user':
            chats += f"user: {chat['content']}\n"
          elif chat['role'] == 'assistant':
            chats += f"Bot: {chat['content']}\n"
        prompt = EXPAND_QUERY_PROMPT.format(
            prev_conv = chats,
            query = user_message
        )

        # Call the LLM to generate the expanded query
        response_text = await llm_handler.get_response(
            system = " ",
            model_id = 'gemini-1.5-flash',
            query = prompt,
            chat_history = [],
            temperature = .5,
            max_tokens = 300,
            response_format = {"type": "json_object"}
        )
        # Process the rephrased query
        if not isinstance(response_text,Dict):
          # Parse the LLM response
          response_text = await llm_handler.get_json(response_text.choices[0].message.content)
          response_text = response_text['rephrased_query']
          print(response_text)
          if isinstance(response_text, list):
            # If the response is a non-empty list, use the first item
            if response_text:
                  user_mes_expanded = response_text[0]
            else:
                  user_mes_expanded = user_message
          else:
                # If the response is a string, use it directly
                user_mes_expanded = response_text
        else:
            # If no rephrased query is returned, use the original message
            user_mes_expanded = user_message

    except Exception as e:
      error = await handle_exception(e)
      print(error)
      # Fallback to an empty string if an exception occurs
      user_mes_expanded = ""

    print("Query Expanded:\n", user_mes_expanded)

    return user_mes_expanded

In [44]:
# like <p>This is a statement with a reference<sup><a href="#ref1">[1]</a></sup>.</p>
bot_prompt = """-Role--
You are a question-answering chatbot for research paper. Your job is to answer user queries strictly using the content provided in the delimited by <ctx></ctx>). You cannot use any external knowledge or information outside the provided context.

<ctx>
{knowledge_source}
</ctx>

--Response--
1. Always and only respond in HTML tag formatted which should render on website smoothly. Do not use markdown, plain text, or any non-HTML formatting. \
Use semantic HTML tags like <h1> to <h6> for headings, <p> for paragraphs, <ul> and <li> for lists, <strong> for bold, <em> for italics, \
<mark> for highlights, <code> for inline code, <pre> for code blocks, <a> for links, and <table> for tabular data. \
Ensure the HTML is well-formed, valid, and self-contained (no external CSS or JavaScript)
2. Cite relevant chunks from the context using ^[Context_n] notation. For multiple citations, use ^[Context_1] ^[Context_2], etc.
3. Respond in customizable response lengths:
   - Concise
   - Medium
   - Detailed
4. Respond in customizable Format:
  - use <ul> and <li> for bullets points and <p> for paragraph
5. For questions unrelated to the given context, return : "Sorry I dont have information about this,Please ask related to your research paper only." or politely deny and guide the user back to the given context.

--Task--
1. Answer questions using only the content from the provided context. Do not rely on external knowledge.
2. Keep responses concise, crisp, and to the point unless the user requests a detailed answer.
3. Always cite relevant chunks from the context using ^[Context_n] notation.
4. If the user asks for a specific format (bullet points or paragraphs), adhere to their request.
"""

In [45]:
import asyncio

async def retrieval_with_query_expansion(
    user_message: str,
    prev_conversation: List[Dict],
    retrieval: RetrievalPipeline,
    llm_handler : GenResponse
) -> tuple:
    """
    Performs retrieval with query expansion to enhance search results.

    This function generates an expanded query using the user's message and conversation history,
    retrieves relevant chunks from a knowledge base using both the expanded and original queries,
    and constructs a system prompt based on the retrieved chunks.

    Args:
        user_message (str): The user's input message.
        prev_conversation (List[Dict]): The history of the conversation as a list of dictionaries.
        retrieval (RetrievalPipeline): The retrieval pipeline used to fetch chunks.

    Returns:
        tuple: A tuple containing:
            - all_chunks (List[str]): A list of unique retrieved chunks, limited to a maximum of 5.
            - expanded_query (str): The expanded query generated from the user message and chat history.
            - system_prompt (str): The final system prompt constructed from the retrieved chunks.

    Raises:
        Exception: If an error occurs during retrieval, query expansion, or prompt generation.
    """
    try:
        # Generate expanded query using the user message and conversation history
        expanded_query = await get_expanded_query(
            user_message=user_message,
            chat_history=prev_conversation,
            llm_handler = llm_handler
        )

        # Retrieve chunks using the expanded query if it is not empty
        if len(expanded_query)>0:
            retrieval_task1 = asyncio.create_task(
                retrieval.retrieve_chunks(
                    user_message=expanded_query,
                )
            )
        else:
            retrieval_task1 = None

        # Retrieve chunks using the original user message
        retrieval_task2 = asyncio.create_task(
            retrieval.retrieve_chunks(
                user_message=user_message,
            )
        )

        # Gather results from both retrieval tasks
        retrieval_results = await asyncio.gather(retrieval_task1, retrieval_task2)

        # Process results from the expanded query retrieval
        retrieved_chunks1 = []
        if retrieval_task1:
            semantic_search_result1 = retrieval_results[0]
            if 'error' not in semantic_search_result1:
                retrieved_chunks1 = semantic_search_result1

        # Process results from the original query retrieval
        semantic_search_result2 = retrieval_results[1]
        if 'error' not in semantic_search_result2:
            retrieved_chunks2 = semantic_search_result2
        else:
            retrieved_chunks2 = []

        # Combine and deduplicate chunks from both retrievals
        all_chunks = retrieved_chunks1.copy()
        for chunk in retrieved_chunks2:
            if chunk not in all_chunks:
                all_chunks.append(chunk)

        # Limit the number of chunks to 5
        if len(all_chunks) > max_results:
            all_chunks = all_chunks[:max_results]

        # Generate the final system prompt using the retrieved chunks
        system_prompt = await generative_prompt(all_chunks=all_chunks,system = bot_prompt)

        return all_chunks, expanded_query, system_prompt

    except Exception as e:
        error = await handle_exception(e)
        print(error)
        return error, error, error

## Chunks
   1. **Document based chunking** which will be beeter for research paper chunks to capture complete section content


In [36]:
%%capture
# !pip install -qU 'docling-core[chunking]' sentence-transformers transformers
!pip install docling
!pip install chonkie[all]

In [37]:
from docling.document_converter import DocumentConverter

DOC_SOURCE = "/content/SLLM.pdf"

doc = DocumentConverter().convert(source=DOC_SOURCE).document
text = doc.export_to_markdown()

In [25]:
print(doc.export_to_markdown())

## WHISMA: A SPEECH-LLM TO PERFORM ZERO-SHOT SPOKEN LANGUAGE UNDERSTANDING

Mohan Li, Cong-Thanh Do, Simon Keizer, Youmna Farag, Svetlana Stoyanchev, Rama Doddipatla

Cambridge Research Laboratory, Toshiba Europe Ltd, Cambridge, UK

## ABSTRACT

Speech large language models (speech-LLMs) integrate speech and text-based foundation models to provide a unified framework for handling a wide range of downstream tasks. In this paper, we introduce WHISMA, a speech-LLM tailored for spoken language understanding (SLU) that demonstrates robust performance in various zero-shot settings. WHISMA combines the speech encoder from Whisper with the Llama-3 LLM, and is fine-tuned in a parameter-efficient manner on a comprehensive collection of SLU-related datasets. Our experiments show that WHISMA significantly improves the zero-shot slot filling performance on the SLURP benchmark, achieving a relative gain of 26.6% compared to the current state-of-the-art model. Furthermore, to evaluate WHISMA's genera

In [26]:
text = doc.export_to_markdown()

In [27]:
# from chonkie import SemanticChunker

# # Basic initialization with default parameters
# chunker = SemanticChunker(
#     embedding_model="minishlab/potion-base-8M",  # Default model
#     threshold=0.5,                               # Similarity threshold (0-1) or (1-100) or "auto"
#     chunk_size=512,                              # Maximum tokens per chunk
#     min_sentences=1                              # Initial sentences per chunk
# )
# chunks = chunker.chunk(text)

# for chunk in chunks:
#     print(f"Chunk text: {chunk.text}")
#     print(f"Token count: {chunk.token_count}")
#     print(f"Number of sentences: {len(chunk.sentences)}")

In [28]:
from chonkie import LateChunker
import pandas as pd

def process_and_save_chunks(text: str, df_path: str) -> pd.DataFrame:
    """
    Processes the input text using LateChunker, saves the chunks into a DataFrame,
    and exports the DataFrame to an Excel file.

    Args:
        text (str): The input text to be processed.
        df_path (str): The file path where the DataFrame will be saved as an Excel file.

    Returns:
        pd.DataFrame: A DataFrame containing the chunks in the 'chunks' column.

    Raises:
        Exception: If an error occurs during processing or saving.
    """
    try:
        # Initialize the LateChunker
        chunker = LateChunker(
            embedding_model="all-MiniLM-L6-v2",
            mode="sentence",
            chunk_size=512,
            min_sentences_per_chunk=1,
            min_characters_per_sentence=12,
            delim=['\\n', '##']
        )

        # Generate chunks
        chunks = chunker(text)

        # Create a DataFrame
        df = pd.DataFrame({"chunks": chunks})

        # Save the DataFrame
        df.to_excel(df_path, index=False)

        print(f"DataFrame saved successfully at {df_path}")
        return df

    except Exception as e:
        error = handle_exception(e)
        print(error)
        return error
process_and_save_chunks(text, "train_data.xlsx")

Token indices sequence length is longer than the specified maximum sequence length for this model (307 > 256). Running this sequence through the model will result in indexing errors


DataFrame saved successfully at train_data.xlsx


Unnamed: 0,chunks
0,"(#, #, , W, H, I, S, M, A, :, , A, , S, P, ..."
1,"( , 1, ., , I, N, T, R, O, D, U, C, T, I, O, ..."
2,"( , 2, ., , R, E, L, A, T, E, D, , W, O, R, ..."
3,"( , 3, ., , M, E, T, H, O, D, \n, \n, I, n, ..."
4,"( , 3, ., 1, ., , M, o, d, e, l, , a, r, c, ..."
5,"( , 3, ., 2, ., , T, r, a, i, n, i, n, g, , ..."
6,"( , 3, ., 3, ., , T, r, a, i, n, i, n, g, , ..."
7,"( , 4, ., , E, X, P, E, R, I, M, E, N, T, S, ..."
8,"( , 4, ., 2, ., , E, v, a, l, u, a, t, i, o, ..."
9,"( , 4, ., 3, ., , M, a, i, n, , r, e, s, u, ..."


## Generation

In [29]:
model = GenResponse(api_key=GEMINI_KEY,base_url = GEMINI_BASE_URL)

In [30]:
retrieval = RetrievalPipeline(
    chromadb_path = 'chromadb',
    train_data_path = 'train_data.xlsx',
    collection_name = 'contexts'
)

retrieval.train(train = True)

All the collections has been removed
(12, 1)
Starting training for contexts
Collection has been created
Data has been loaded succesfully


In [31]:
await get_expanded_query(
    user_message = 'what is tabular chain of thought',
    chat_history = [],
    llm_handler = model
)

['Explain the concept of tabular chain of thought reasoning.', 'What are the characteristics and applications of tabular chain of thought?']
Query Expanded:
 Explain the concept of tabular chain of thought reasoning.


'Explain the concept of tabular chain of thought reasoning.'

In [46]:
chunks,rephrased_query,system = await retrieval_with_query_expansion(
    user_message = 'what is tabular chain of thought',
    prev_conversation = [],
    retrieval = retrieval,
    llm_handler = model
)
print(system)

['Explain the concept of tabular chain of thought reasoning.', 'What are tabular chain of thought methods and how do they work?']
Query Expanded:
 Explain the concept of tabular chain of thought reasoning.
-Role--
You are a question-answering chatbot for research paper. Your job is to answer user queries strictly using the content provided in the delimited by <ctx></ctx>). You cannot use any external knowledge or information outside the provided context.

<ctx>
content_0:  3.3. Training strategy

The training examples are organised according to Llama-3's standard prompt template, as outlined in Fig. 1. To enhance the robustness of WHISMA in handling diverse instructions during inference, we devise 10 distinct prompts for each task in ASR, IC, and SF. These prompts are randomly selected for each training example during fine-tuning. For the remaining tasks, we directly employ the provided question or instruction from the data as the text prompt.

Unlike ZS-Whisper-SLU [25], which tackles

In [33]:
await model.get_response(
    system = system,
    model_id = 'gemini-1.5-flash',
    query = 'what is whisma, in detail,bullets points',
    chat_history = [],
    temperature = .5,
    max_tokens = 2000
)

ChatCompletion(id=None, choices=[Choice(finish_reason=None, index=0, logprobs=None, message=ChatCompletionMessage(content='```html\n<h1>WHISMA: A Detailed Overview</h1>\n<ul>\n  <li><strong>What it is:</strong> WHISMA is a speech-LLM (speech large language model) designed to enhance zero-shot spoken language understanding (SLU) performance across various domains.^[content_1] ^[content_6]</li>\n  <li><strong>Architecture:</strong> It combines the Whisper large-v2 model as the speech encoder and the Llama-3 8B-Instruct model as the text decoder.  A trainable modality aligner connects these components. Low-rank adaptation (LoRA) is used on Llama-3 to handle speech modality inputs.^[content_4]</li>\n  <li><strong>Training:</strong> WHISMA is fine-tuned using approximately 2000 hours of speech data covering ASR, IC, SF, SQA, and SQIT/SIT tasks.  A training strategy incorporates an auxiliary ASR step before SLU via speech chain-of-thought (SCoT) or multi-round (MR) inference, maintaining its

In [47]:
model = GenResponse(api_key=GEMINI_KEY,base_url = GEMINI_BASE_URL)
chat_history = []

async def chat_loop():
    while True:
        # Get user input
        user_message = input("You: ")
        if user_message.lower() in ["exit", "quit"]:
            print("Exiting chat...")
            break

        # Add user message to chat history
        chat_history.append({"role": "user", "content": user_message})

        # Run retrieval and query expansion pipeline
        chunks, rephrased_query, system = await retrieval_with_query_expansion(
            user_message=user_message,
            prev_conversation=chat_history,
            retrieval = retrieval,
            llm_handler=model
        )
        print(f"System Prompt: {system}")

        # Get response from the model
        response = await model.get_response(
            system=system,
            model_id='gemini-1.5-flash',
            query=user_message,
            chat_history=chat_history,
            temperature=0.5,
            max_tokens=2000
        )
        if isinstance(response,Dict):
          raise(response)
        response = response.choices[0].message.content.replace("```html", "").replace("```", "")
        # Add assistant response to chat history
        chat_history.append({"role": "assistant", "content": response})

        # Print the response
        print(f"You: {user_message}")
        print(f"Assistant: {response}")

# Run the chat loop in the notebook
await chat_loop()

You: Explain Intro section in details list all the key points ,suggest something
['Provide a detailed explanation of the introduction section, including key points and suggestions for improvement.', 'Explain the introduction section in detail, listing all key points and offering recommendations.']
Query Expanded:
 Provide a detailed explanation of the introduction section, including key points and suggestions for improvement.
System Prompt: -Role--
You are a question-answering chatbot for research paper. Your job is to answer user queries strictly using the content provided in the delimited by <ctx></ctx>). You cannot use any external knowledge or information outside the provided context.

<ctx>
content_0:  1. INTRODUCTION

Traditional speech processing techniques typically depend on specialised models tailored to individual tasks. These models, trained with limited data and constrained architectures, often face difficulties in generalising to new domains and applications. However, rec

In [None]:

# await retrieval.retrieve_chunks('what is tabular chain of thought')

# **Problem 3: Tool Selection System**

Model Requirement: gpt-4o-Mini (OpenAI key will be provided)
Task: Develop a tool selection system based on Typeset.io's toolkit.

Instructions:

● Use tools listed on our website https://typeset.io left sidebar

● For every user query, LLM should select one of the tools from the above list.

● Each tool should also have its own parameters that the LLM will select from the user query.

● You can decide the parameters required for each tool.

● Create appropriate tool descriptions and use cases

● Handle edge cases and ambiguous queries

E.g.

User query: I want to find topics related to RLHF for LLM finetuning.

Output: Tool: Topic Finder

Params: Search query: RLHF for LLM finetuning

In [None]:
def interact_with_pdf(pdf_file_url: str = None,
                      operation: str = None,
                      query: str = None,
                      section: str = None,
                      highlighted_text: str = None,
                      language: str = "English",
                      note: str = None,
                      output_format: str = "plain_text",
                      citation_style: str = "APA",
                      related_paper_limit: int = 5) -> dict:
    """
    Interact with a PDF file to perform various operations such as answering questions,
    summarizing sections, explaining highlighted text, recommending related papers,
    or taking notes.

    Parameters:
    -----------
    pdf_file_url : str
        The URL or file path to the PDF. This is required for all operations.
    operation : str
        The operation to perform on the PDF. Options include:
        - "get_citation_answers": Get answers backed by citations.
        - "get_summary": Provide a section-wise or overall summary.
        - "highlight_explanation": Simplify complex highlighted text.
        - "get_related_papers": Recommend papers related to highlighted text.
        - "take_notes": Save notes for future reference.
    query : str, optional
        A specific question or search query to extract information. Required for "get_citation_answers".
    section : str, optional
        Target a specific section of the PDF, e.g., "Introduction" or "Conclusion". Default is the entire document.
    highlighted_text : str, optional
        Specific text from the PDF for explanations or related paper recommendations.
        Required for "highlight_explanation" and "get_related_papers".
    language : str, optional
        The language for the response. Default is "English". Supports 75+ languages.
    note : str, optional
        Text to save as a note in the "take_notes" operation.
    output_format : str, optional
        The format of the output. Options: "plain_text", "json", "markdown". Default is "plain_text".
    citation_style : str, optional
        Citation style for answers. Options: "APA", "MLA", "Chicago", "Harvard". Default is "APA".
    related_paper_limit : int, optional
        Maximum number of related papers to recommend. Default is 5.

    Returns:
    --------
    dict
        A dictionary containing the result of the requested operation. Keys and values depend on the operation.
        For example:
        - "get_citation_answers": { "answers": [...], "citations": [...] }
        - "get_summary": { "summary": { "Introduction": "...", "Conclusion": "..." } }
        - "highlight_explanation": { "simplified_text": "..." }
        - "get_related_papers": { "papers": [...] }
        - "take_notes": { "status": "Note saved successfully." }

    Raises:
    -------
    ValueError:
        If required parameters for the chosen operation are missing or invalid.

    """
    pass

tool1 = {
  "type": "function",
  "function": {
    "name": "chat_with_pdf",
    "description": """This tool is for Interact with a PDF file to perform various operations such as answering questions, summarizing sections, explaining highlighted text, recommending related papers, or taking notes.""",
    "parameters": {
      "type": "object",
      "properties": {
        "pdf_file_url": {
          "type": "string",
          "description": "The URL or path to the PDF file. Required to perform any operation on the PDF."
        },
        "query": {
          "type": "string",
          "description": "A question or search query to find specific answers or sections in the PDF. Can be left blank if performing a general operation."
        },
        "operation": {
          "type": "string",
          "enum": ["get_citation_answers", "get_summary", "highlight_explanation", "get_related_papers", "take_notes"],
          "description": "The specific operation to perform on the PDF, such as answering questions, summarizing, or finding related papers."
        },
        "section": {
          "type": "string",
          "description": "The specific section of the PDF to target (e.g., 'Introduction', 'Methods', 'Conclusion'). Leave blank for the entire document."
        },
        "highlighted_text": {
          "type": "string",
          "description": "Specific text from the PDF to get a simplified explanation or find related papers. Required for 'highlight_explanation' and 'get_related_papers' operations."
        },
        "language": {
          "type": "string",
          "description": "The language in which the response should be provided. Default is 'English'. Supports 75+ languages.",
          "default": "English"
        },
        "note": {
          "type": "string",
          "description": "Text for the note to be added. Used in the 'take_notes' operation to store custom annotations."
        },
        "output_format": {
          "type": "string",
          "enum": ["plain_text", "json", "markdown"],
          "description": "The desired format for the output of the operation.",
          "default": "plain_text"
        },
        "citation_style": {
          "type": "string",
          "enum": ["APA", "MLA", "Chicago", "Harvard"],
          "description": "The citation style for answers backed by citations. Used in 'get_citation_answers' operation.",
          "default": "APA"
        },
        "related_paper_limit": {
          "type": "integer",
          "description": "The maximum number of related papers to fetch. Only used in 'get_related_papers' operation.",
          "default": 5
        }
      },
      # "required": ["pdf_file_url"]
    }
  },
  "strict" :True
}
# tool1

In [None]:
def ai_writer_tool(operation: str, query: str = None, citation_source: str = None, note_content: str = None,
                   file_format: str = None, output_format: str = "text", language: str = "English") -> str:
    """
    AIWriter Tool: An intelligent assistant for writing research papers within typeset notebook with features like citation discovery, text autocompletion,
    note management, and paper export, ensuring a seamless academic writing experience.

    Parameters:
    ----------
    operation : str
        The type of operation to perform. Supported operations include:
        - 'find_citations': Discover and add citations from a vast database of 280M+ research papers.
        - 'autocomplete': Get intelligent suggestions to complete your writing.
        - 'save_notes': Save notes within the tool's ecosystem to organize your thoughts.
        - 'export_paper': Export your finished research paper with formatting intact.

    query : str, optional
        The specific query for the operation. For 'find_citations' or 'autocomplete', provide a topic, phrase, or context
        related to your work. This parameter is required for these operations.

    citation_source : str, optional
        The database or source to use for finding citations. Defaults to SciSpace if not specified.

    note_content : str, optional
        The content of the note to save. This parameter is required for the 'save_notes' operation.

    file_format : str, optional
        The desired export format for the research paper. Required for the 'export_paper' operation. Supported formats:
        - 'PDF'
        - 'DOCX'

    output_format : str, optional, default = "text"
        The format for the tool's response output. Supported formats include:
        - 'text': Plain text (default)
        - 'json': JSON structure
        - 'markdown': Markdown-formatted output

    language : str, optional, default = "English"
        The language for autocompletion suggestions or saved notes. Default is English, but supports a wide range of languages.

    Returns:
    -------
    str
        A response based on the operation performed. This may include citations, autocompleted text, confirmation of saved notes,
        or a link to the exported paper.

    """
    pass
tool2 = {
    "type": "function",
    "function": {
        "name": "ai_writer",
        "description": """An AI-powered tool to assist in writing research papers with confidence by providing citation discovery, text autocompletion, note management, and export functionality.
User's can create new notebook, edit old and do a lot of ai powered assistant within notebook in realtime.""",
        "parameters": {
            "type": "object",
            "properties": {
                "operation": {
                    "type": "string",
                    "description": "The type of operation to perform. Options include 'find_citations', 'autocomplete', 'save_notes', 'export_paper'.",
                    "enum": ["find_citations", "autocomplete", "save_notes", "export_paper"]
                },
                "query": {
                    "type": "string",
                    "description": "The specific query for citation discovery or text completion. For example, a topic, phrase, or context for writing."
                },
                "citation_source": {
                    "type": "string",
                    "description": "Optional. A specific citation database or source for finding references. If omitted, the default SciSpace database is used.",
                    "default": "SciSpace"
                },
                "note_content": {
                    "type": "string",
                    "description": "Optional. Content of the note to save. Required for 'save_notes' operation.",
                },
                "file_format": {
                    "type": "string",
                    "description": "Optional. Format to export the paper in. Required for 'export_paper' operation. Supported formats: 'PDF', 'DOCX'.",
                    "enum": ["PDF", "DOCX"]
                },
                "output_format": {
                    "type": "string",
                    "description": "Optional. Format of the response output. Options include 'text', 'json', 'markdown'. Default is 'text'.",
                    "enum": ["text", "json", "markdown"],
                    "default": "text"
                },
                "language": {
                    "type": "string",
                    "description": "Optional. Language for autocomplete suggestions or saved notes. Default is English.",
                    "default": "English"
                }
            },
            "required": ["operation"]
        }
    }
}


In [None]:
def literature_review_tool(
    query: str,
    review_type: str,
    filters: dict = None,
    custom_columns: list = None,
    output_format: str = "text",
    language: str = "English"
) -> str:
    """
    Literature Review Tool: A powerful assistant for discovering and reviewing research papers using AI. Provides semantic similarity-based answers, concise reviews, and customizable features.

    Parameters:
    ----------
    query : str
        The research topic or keywords to find relevant papers.

    language : str, optional, default = "English"
        The language of the review output. Default is English.

    Returns:
    """
    pass

tool3 = {
    "type": "function",
    "function": {
        "name": "literature_review",
        "description": "Discover new research papers and perform a quick AI-powered literature survey with semantic similarity, concise reviews, and customizable features in your own language.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The research topic or keywords or concise search query for discovering and reviewing relevant papers."
                },
                "language": {
                    "type": "string",
                    "description": "Language for the review output. Default is English.",
                    "default": "English"
                }
            },
            "required": ["query"]
        }
    }
}

In [None]:
def find_topics_tool(
    query: str,
    topic_type: str,
    export_format: str = "None",
    language: str = "English",
    source_inclusion: bool = True
) -> str:
    """
    Find Topics Tool: An AI-powered tool to dive deeper into research papers and extract insightful topics with grounded answers and explanations.

    Parameters:
    ----------
    query : str
        The research topic or keywords for extracting and summarizing related topics.

    language : str, optional, default = "English"
        The language for the output. Default is English.

    source_inclusion : bool, optional, default = True
        Whether to include sources for the topics in the output.

    Returns:
    -------
    str
        Extracted topics or summaries in the specified format.

    """
    pass
tool4 = {
    "type": "function",
    "function": {
        "name": "find_topics",
        "description": "Extract insightful topics from research papers and get grounded, summarized answers for top semantically similar topics in multiple languages.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The research topic or keywords to extract and summarize related topics."
                },
            },
            "required": ["query"]
        }
    }
}


In [None]:
def extract_data_tool(
    query: str,
    export_format: str = "None",
    language: str = "English",
    source_inclusion: bool = True,
    papers: list = None
) -> str:
    """
    Extract Data Tool: Extract summaries, conclusions, and findings from multiple research papers and provide them in a structured format.

    Parameters:
    ----------
    query : str
        The research topic or keywords for extracting data from papers.

    export_format : str, optional, default = "None"
        The format for exporting the results (e.g., "CSV", "Excel", "RIS", etc.).

    language : str, optional, default = "English"
        The language for the output. Default is English.

    source_inclusion : bool, optional, default = True
        Whether to include citations and sources for the data in the output.

    papers : list, optional, default = None
        A list of PDF papers to extract data from.

    Returns:
    -------
    str
        The extracted data or an exported file (depending on the format).

    """
    pass
tool5 = {
    "type": "function",
    "function": {
        "name": "extract_data",
        "description": "This Tool Extract summaries, conclusions, and findings from multiple research papers and provide them in a structured format. Features :Semantic Search, Extract & Compare information,Citation-backed insights\
        multiple Language support, Paper Summary, Export in multiple formats",
        "parameters": {
            "type": "object",
            "properties": {
                "files": {
                    "type": "list",
                    "description": "Optional,The list of files url path"
                },
            }
        }
    }
}


In [None]:
def paraphraser_tool(
    input_text: str,
    tone: str,
    language: str = "English",
    include_original: bool = False
) -> str:
    """
    Paraphraser Tool: Makes academic writing clear and original by paraphrasing input text in various tones or personas.

    Parameters:
    ----------
    input_text : str
        The text to be paraphrased.

    tone : str
        The desired tone or persona for paraphrasing, such as 'Academic', 'Fluent', 'Formal', 'Creative', etc.

    Returns:
    -------
    str
        The paraphrased text, optionally including the original text for comparison.
    """
    pass
tool6 = {
    "type": "function",
    "function": {
        "name": "paraphraser",
        "description": "Paraphrase input text into different tones or personas with grammatical correctness in multiple languages.",
        "parameters": {
            "type": "object",
            "properties": {
                "input_text": {
                    "type": "string",
                    "description": "The text to be paraphrased."
                },
                "tone": {
                    "type": "string",
                    "description": "The tone or persona for paraphrasing, such as 'Academic', 'Fluent', 'Formal', 'Creative', etc."
                }
            },
        }
    }
}


In [None]:
def citation_generator_tool(
    input_data: str,
    citation_style: str,
    export_format: str = "BibTeX"
) -> str:
    """
    Citation Generator Tool: Generate citations in various formats from a title or URL and export them.

    Parameters:
    ----------
    input_data : str
        The title or URL of the source to generate the citation.

    citation_style : str
        The desired citation format, e.g., 'APA', 'MLA', or any supported style.

    export_format : str, optional, default = "BibTeX"
        The format for exporting the citation, e.g., 'BibTeX', 'RIS', or 'Plain Text'.

    Returns:
    -------
    str
        The generated citation in the specified format.
    """
    pass

tool7  = {
    "type": "function",
    "function": {
        "name": "citation_generator",
        "description": "Generate citations in various formats (APA, MLA, and 2300+ styles) from a title or URL and export them in BibTeX format.",
        "parameters": {
            "type": "object",
            "properties": {
                "input_data": {
                    "type": "string",
                    "description": "The title or URL of the source to generate the citation."
                },
            },
            "required": ["input_data"]
        }
    }
}


In [None]:
def academic_ai_detector_tool(
    input_data: str,
) -> dict:
    """
    Academic AI Detector Tool: Identify AI-generated content in scholarly documents or text input.

    Parameters:
    ----------
    input_data : str
        The input data to analyze for AI-generated content. Can be a URL to a PDF file or plain text.

    Returns:
    -------
    dict
        A dictionary with detection results, including likelihood scores for AI-generated content.
    """
    pass


tool8 =  {
    "type": "function",
    "function": {
        "name": "academic_ai_detector",
        "description": "Detect AI-generated content (e.g., GPT-4, ChatGPT, Jasper) in scholarly documents or text input.",
        "parameters": {
            "type": "object",
            "properties": {
                "input_data": {
                    "type": "string",
                    "description": "The input data to analyze for AI-generated content. Can be a URL to a PDF file or plain text."
                },
            },
        }
    }
}


In [None]:
def research_pdf_to_video(
    pdf_url: str,
) -> dict:
    """
    Research PDF to Video Tool: Converts research PDFs into engaging videos with features like voice-over, subtitles, and transitions.

    Parameters:
    ----------
    pdf_url : str
        The URL or file path of the research PDF to convert into a video.
    Returns:
    -------
    dict
        A dictionary containing the status of the conversion process and the download link for the generated video.
    """
    pass

tool9 = {
    "type": "function",
    "function": {
        "name": "pdf_to_video",
        "description": "Convert research PDFs into engaging videos with voice-over, subtitles, and transitions.",
        "parameters": {
            "type": "object",
            "properties": {
                "pdf_url": {
                    "type": "string",
                    "description": "The URL or file path of the research PDF to convert into a video."
                },

            },
            "required": ["pdf_url"]
        }
    }
}


In [None]:
# Tools
tools = [tool1,tool2,tool3,tool4,tool5,tool6,tool7,tool8,tool9]

tools_str = ''
for tool in tools:
  tools_str += f"title: {tool['function']['name']}\nDescription: {tool['function']['description']}\nparameters: {tool['function']['parameters']}\n\n"
print(tools_str)

title: chat_with_pdf
Description: This tool is for Interact with a PDF file to perform various operations such as answering questions, summarizing sections, explaining highlighted text, recommending related papers, or taking notes.
parameters: {'type': 'object', 'properties': {'pdf_file_url': {'type': 'string', 'description': 'The URL or path to the PDF file. Required to perform any operation on the PDF.'}, 'query': {'type': 'string', 'description': 'A question or search query to find specific answers or sections in the PDF. Can be left blank if performing a general operation.'}, 'operation': {'type': 'string', 'enum': ['get_citation_answers', 'get_summary', 'highlight_explanation', 'get_related_papers', 'take_notes'], 'description': 'The specific operation to perform on the PDF, such as answering questions, summarizing, or finding related papers.'}, 'section': {'type': 'string', 'description': "The specific section of the PDF to target (e.g., 'Introduction', 'Methods', 'Conclusion'). 

In [None]:
think_plan_tools_selection ="""You are a tool or action selection system for Typeset.\
Your role involves selecting single tool or action from given tools to the user query.
<tools>
{tools_all}
</tools>
Before Selecting any tool follow this strategy:
Think and planning:
  - Read user input carefully. Break user query into smaller smaller parts and predict intent.
  - Define Selection confidence scale :
       - Certain: If the tool title or description and input parameters (if any required) semantically closely matches up to 100% and there is no ambiguity with other tools.
       - High: If the tool title or description and input parameters (if any required) show a semantic resemblance and there is no ambiguity with other tools.
       - Low: If there is ambiguity with other tools.
       - very Low: If there is little to no match with any title,description.
  - Only select tools (maximum 3) whose selection confidence is High or Certain.
  - Finally select best tool from last 3 tools based on the selection confidence and user intent
  - If you could not selected any tool after applying selection confidence scale then ask further details fas tool cannot be confidently selected.
  - Always extract valid arguments value after removing spelling ,grammatical or any mistakes.
  - Do not return default arguments.

Always return your output in json format like this or empty json:
{{
  "tool":"tool title",
  "arguments":{{
    "argument_name":"valid argument_value extracted from user input",
    }}
}}
"""
think_plan_tools_selection = think_plan_tools_selection.format(tools_all=tools_str)
print(think_plan_tools_selection)

You are a tool or action selection system for Typeset.Your role involves selecting single tool or action from given tools to the user query.
<tools>
title: chat_with_pdf
Description: This tool is for Interact with a PDF file to perform various operations such as answering questions, summarizing sections, explaining highlighted text, recommending related papers, or taking notes.
parameters: {'type': 'object', 'properties': {'pdf_file_url': {'type': 'string', 'description': 'The URL or path to the PDF file. Required to perform any operation on the PDF.'}, 'query': {'type': 'string', 'description': 'A question or search query to find specific answers or sections in the PDF. Can be left blank if performing a general operation.'}, 'operation': {'type': 'string', 'enum': ['get_citation_answers', 'get_summary', 'highlight_explanation', 'get_related_papers', 'take_notes'], 'description': 'The specific operation to perform on the PDF, such as answering questions, summarizing, or finding related

In [None]:
OPENAI_KEY = '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'
tool_model = GenResponse(
    OPENAI_KEY,
    base_url = 'https://api.openai.com/v1',
    openai=True
)

In [None]:
queries = [
    # Get Citation-Backed Answers
    "What are the main conclusions of this study? [PDF: https://example.com/sample-paper1.pdf]",
    "What methods were used in the research? [PDF: https://example.com/sample-paper2.pdf, Citation Style: MLA]",
    "How does this research compare to similar studies? [PDF: https://example.com/sample-paper3.pdf, Output Format: JSON, Language: French]",

    # Get Summary
    "Summarize this paper section by section. [PDF: https://example.com/sample-paper4.pdf]",
    "Provide a summary of the Introduction section. [PDF: https://example.com/sample-paper5.pdf, Section: Introduction]",
    "Summarize the Conclusion section in Portuguese. [PDF: https://example.com/sample-paper6.pdf, Section: Conclusion, Language: Portuguese, Output Format: Markdown]",

    # Explain Highlighted Text
    "Explain this sentence: 'Quantum entanglement enables instantaneous state changes.' [PDF: https://example.com/sample-paper7.pdf]",
    "Explain this highlighted text: 'The Transformer architecture revolutionized natural language processing.' [PDF: https://example.com/sample-paper8.pdf, Language: Spanish]",
    "Explain: 'The integration of blockchain with IoT enhances security.' in markdown format. [PDF: https://example.com/sample-paper9.pdf, Output Format: Markdown, Language: Italian]",

    # Get Related Papers
    "Find related papers to this idea: 'Deep learning approaches in image recognition.' [PDF: https://example.com/sample-paper10.pdf]",
    "Recommend 3 papers related to 'Self-attention mechanism in neural networks.' [PDF: https://example.com/sample-paper11.pdf, Related Paper Limit: 3]",
    "Are there any related papers to 'Support vector machines in classification tasks' with no limit? [PDF: https://example.com/sample-paper12.pdf]",

    # Take Notes
    "Add this note: 'Important: Explore alternative algorithms for better performance.' [PDF: https://example.com/sample-paper13.pdf]",
    "Save this note: 'Consider the scalability issues mentioned in the discussion section.' [PDF: https://example.com/sample-paper14.pdf, Output Format: JSON]",
    "Add this note in Japanese: 'Review the statistical methods used in the analysis.' [PDF: https://example.com/sample-paper15.pdf, Language: Japanese, Output Format: JSON]",

    # Test for Invalid or Edge Cases
    "What future directions are proposed in the paper? [PDF: https://example.com/sample-paper16.pdf, Citation Style: Harvard]",
    "Summarize the Abstract section of this paper. [PDF: https://example.com/sample-paper17.pdf, Section: Abstract]",
    "Can you summarize this paper? [PDF: https://example.com/sample-paper18.pdf]",
    "Explain this complex sentence with a lot of technical terms. [PDF: https://example.com/sample-paper19.pdf]",
    "Perform an operation to answer this query: 'This should fail.' [PDF: https://example.com/sample-paper20.pdf, Operation: Invalid]",
    "Describe the experimental setup in this paper. [PDF: https://example.com/sample-paper21.pdf, Citation Style: Chicago, Output Format: JSON, Language: Russian]"
]

test_cases = [
    # Test cases for 'find_citations'
    "Find citations for 'Quantum computing advancements'. [operation: 'find_citations', query: 'Quantum computing advancements']",
    "Discover citations on 'Machine learning applications' from PubMed. [operation: 'find_citations', query: 'Machine learning applications', citation_source: 'PubMed']",
    "Find references for 'Impact of climate change'. [operation: 'find_citations', query: 'Impact of climate change', output_format: 'markdown']",

    # Test cases for 'autocomplete'
    "Autocomplete text for 'Deep learning techniques in...'. [operation: 'autocomplete', query: 'Deep learning techniques in...']",
    "Provide autocomplete suggestions for 'Artificial Intelligence in healthcare' in Spanish. [operation: 'autocomplete', query: 'Artificial Intelligence in healthcare', language: 'Spanish']",
    "Generate suggestions for 'Blockchain applications in supply chain' with JSON output. [operation: 'autocomplete', query: 'Blockchain applications in supply chain', output_format: 'json']",

    # Test cases for 'save_notes'
    "Save the note: 'Remember to include related works section.' [operation: 'save_notes', note_content: 'Remember to include related works section.']",
    "Save the note in German: 'Fügen Sie die Quellenangaben hinzu.' [operation: 'save_notes', note_content: 'Fügen Sie die Quellenangaben hinzu.', language: 'German']",
    "Save a note with JSON format: 'Discuss experimental limitations in detail.' [operation: 'save_notes', note_content: 'Discuss experimental limitations in detail.', output_format: 'json']",

    # Test cases for 'export_paper'
    "Export the paper in PDF format. [operation: 'export_paper', file_format: 'PDF']",
    "Export the paper in DOCX with Markdown output. [operation: 'export_paper', file_format: 'DOCX', output_format: 'markdown']",
    "Export the paper to check formatting. [operation: 'export_paper', file_format: 'PDF', output_format: 'json']",

    # Combination of arguments for multiple operations
    "Find citations for 'Neural networks in image processing' and export results in JSON. [operation: 'find_citations', query: 'Neural networks in image processing', output_format: 'json']",
    "Autocomplete 'Reinforcement learning techniques for...' and save the suggestion as a note. [operation: 'autocomplete', query: 'Reinforcement learning techniques for...', output_format: 'text', language: 'French']",
    "Generate citations and export paper in DOCX. [operation: 'find_citations', query: 'AI in education', citation_source: 'Google Scholar', file_format: 'DOCX']",

    # Edge cases
    "Invalid operation test. [operation: 'invalid_operation']",
    "Missing query for citations. [operation: 'find_citations']",
    "Missing note content for saving notes. [operation: 'save_notes']",
    "Export without file format. [operation: 'export_paper']",
    "Provide autocomplete suggestions in an unsupported language. [operation: 'autocomplete', query: 'AI ethics', language: 'Klingon']",
    "Save note with an empty string. [operation: 'save_notes', note_content: '']",
    "Find citations without specifying a database. [operation: 'find_citations', query: 'Effects of vaccination on global health']"
]

queries += test_cases
test_cases = [
    # Concise review
    "Generate a concise review on 'Artificial Intelligence in education'. [query: 'Artificial Intelligence in education', review_type: 'concise_review']",
    "Summarize similar papers on 'Climate change and renewable energy' from 2010 to 2023. [query: 'Climate change and renewable energy', review_type: 'concise_review', filters: {'publication_year': [2010, 2023]}]",
    "Provide a concise review of 'COVID-19 impact on mental health' in French. [query: 'COVID-19 impact on mental health', review_type: 'concise_review', language: 'French']",

    # Semantic similar
    "Find semantically similar papers for 'Advancements in battery technology'. [query: 'Advancements in battery technology', review_type: 'semantic_similar']",
    "Retrieve similar papers on 'Deep learning in NLP' from IEEE journals. [query: 'Deep learning in NLP', review_type: 'semantic_similar', filters: {'journal': 'IEEE'}]",
    "Get similar papers for 'Cancer genomics' authored by 'Dr. John Doe'. [query: 'Cancer genomics', review_type: 'semantic_similar', filters: {'author': 'Dr. John Doe'}]",

    # Custom columns
    "Create a review with custom columns 'Abstract' and 'Citations' for 'Blockchain applications'. [query: 'Blockchain applications', review_type: 'custom_columns', custom_columns: ['Abstract', 'Citations']]",
    "Add custom columns 'Keywords' and 'References' for papers on 'Neural networks in image recognition'. [query: 'Neural networks in image recognition', review_type: 'custom_columns', custom_columns: ['Keywords', 'References']]",
    "Include a custom column 'Highlights' for the topic 'Evolutionary algorithms'. [query: 'Evolutionary algorithms', review_type: 'custom_columns', custom_columns: ['Highlights'], output_format: 'markdown']",

    # Combination queries
    "Find similar papers for 'Robotics in healthcare' and create a concise review. [query: 'Robotics in healthcare', review_type: 'semantic_similar', filters: {'publication_year': [2015, 2023], 'language': 'English'}]",
    "Generate a concise review on 'AI ethics' and save it as JSON. [query: 'AI ethics', review_type: 'concise_review', output_format: 'json']",

    # Edge cases
    "Generate a concise review without a query. [review_type: 'concise_review']",
    "Request custom columns without specifying columns. [query: 'Machine learning in finance', review_type: 'custom_columns']",
    "Request semantic similar papers in an unsupported language. [query: 'Cryptography techniques', review_type: 'semantic_similar', filters: {'language': 'Elvish'}]",
    "Find concise reviews for 'Quantum entanglement' with invalid publication year. [query: 'Quantum entanglement', review_type: 'concise_review', filters: {'publication_year': [2025, 2020]}]"
]
queries += test_cases

test_cases = [
    # Summarized topics
    "Get summarized topics for 'Deep learning in healthcare'.",
    "Summarize topics for 'Climate change adaptation' in French.",
    "Find summarized topics for 'Blockchain in supply chain' without sources.",

    # Explained topics
    "Get explained topics for 'Quantum mechanics'.",
    "List topics for 'AI ethics' and export to CSV. ",
    "Provide explained topics on 'Renewable energy solutions' with sources. ",
    "Explain topics on 'Natural language processing' in Spanish. ",

    # Combination of arguments
    "Summarize topics on 'Genomics research' and export in JSON. ",
    "List topics on 'Robotics in manufacturing' in German and include sources.",

    # Edge cases
    "Request topics without specifying query. ",
    "Invalid topic type test.",
    "Export topics without specifying format. ",
    "Summarize topics for 'Artificial Intelligence' in an unsupported language. ",
    "Explain topics for 'Evolutionary biology' with incorrect source inclusion type. ",
    "Find topics without specifying topic type."
]
queries += test_cases

test_cases = [
    # Summarized data extraction
    "Extract summary for research papers on 'Deep learning in healthcare'.",
    "Get summary of conclusions from 'Climate change adaptation' research papers in French.",
    "Extract summarized findings for 'Blockchain in supply chain' without including sources.",

    # Data extraction with sources
    "Get conclusions from papers on 'Quantum mechanics' with sources included.",
    "Extract findings from research papers on 'AI ethics' and export to CSV.",
    "Provide conclusions and findings for 'Renewable energy solutions' with sources.",
    "Get summarized data from papers on 'Natural language processing' in Spanish.",

    # Combination of arguments
    "Summarize data for 'Genomics research' and export in JSON format.",
    "Extract findings for 'Robotics in manufacturing' in German with sources included.",

    # Edge cases
    "Request data extraction without specifying a query.",
    "Invalid export format specified for data extraction.",
    "Export data without specifying format.",
    "Request summary for 'Artificial Intelligence' in an unsupported language.",
    "Extract conclusions for 'Evolutionary biology' with incorrect source inclusion type.",
    "Extract data without specifying a topic or query."
]
queries += test_cases

test_cases = [
    # Paraphrasing in different tones
    "Paraphrase 'The results were significant and conclusive.' in Academic tone.",
    "Paraphrase 'This is an amazing breakthrough!' in Formal tone.",
    "Paraphrase 'We’re super excited about this!' in Fluent tone.",
    "Paraphrase 'She was sad about the incident.' in Creative tone.",

    # Paraphrasing in different languages
    "Paraphrase 'The weather is pleasant today.' in Academic tone in French.",
    "Paraphrase 'This is a revolutionary idea.' in Creative tone in Spanish.",
    "Paraphrase 'The meeting was productive.' in Formal tone in German.",

    # Including original text in output
    "Paraphrase 'The solution was effective and well-received.' in Fluent tone, include original text.",
    "Paraphrase 'The experiment yielded unexpected results.' in Academic tone, include original text.",

    # Edge cases
    "Paraphrase an empty string in Academic tone.",
    "Paraphrase 'This idea is groundbreaking!' without specifying a tone.",
    "Request paraphrasing in an unsupported language.",
    "Provide an invalid tone for paraphrasing.",
    "Paraphrase with include_original set to an incorrect type."
]
queries += test_cases
test_cases = [
    # Generate citation from title
    "Generate APA citation for 'Deep Learning in Healthcare' and export in BibTeX.",
    "Create MLA citation for 'The Origin of Species' and export in RIS.",
    "Generate Chicago citation for 'Artificial Intelligence and Society'.",
    "Produce Harvard citation for 'Climate Change Adaptation' and export in Plain Text.",

    # Generate citation from URL
    "Generate APA citation for 'https://arxiv.org/abs/1234.5678' and export in BibTeX.",
    "Create MLA citation for 'https://doi.org/10.1016/j.jbi.2021.04.009' and export in RIS.",
    "Produce Chicago citation for 'https://www.example.com/research-paper'.",
    "Generate Harvard citation for 'https://journals.sagepub.com/home/tcs' and export in Plain Text.",

    # Edge cases
    "Request citation for an invalid URL.",
    "Generate citation without specifying citation style.",
    "Request citation for a title that does not exist.",
    "Provide unsupported citation style for generation.",
    "Request export in an unsupported format.",
    "Generate citation for an empty title or URL."
]
queries += test_cases
test_cases = [
    # Detect AI content from text input
    "Detect AI content in text: 'This research explores the impact of AI on society'.",
    "Analyze text with low sensitivity: 'The results are generated by ChatGPT'.",
    "Detect AI content in multilingual text: 'Ce texte est généré par IA'.",
    "Analyze content with high sensitivity: 'Lorem ipsum AI generator test.'.",

    # Detect AI content from PDF URL
    "Analyze AI-generated content in 'https://example.com/scholarly_article.pdf'.",
    "Detect AI content in a PDF with high sensitivity: 'https://example.com/research.pdf'.",
    "Run detection on a scholarly paper PDF with medium sensitivity: 'https://example.com/document.pdf'.",

    # Edge cases
    "Run detection on empty input data.",
    "Provide invalid PDF URL for analysis.",
    "Analyze text with an unsupported sensitivity level.",
    "Run detection on mixed input (text + URL).",
    "Detect AI content in non-English text input.",
    "Request AI detection for corrupted or inaccessible PDF files."
]
queries += test_cases
test_cases = [
    # Basic conversions
    "Convert PDF 'https://example.com/research.pdf' to video with default settings.",
    "Generate video from 'https://example.com/thesis.pdf' with Spanish voice-over.",
    "Create video for 'https://example.com/article.pdf' without subtitles.",
    "Convert PDF 'https://example.com/review.pdf' with zoom transitions and output as AVI.",

    # Advanced configurations
    "Generate video from 'https://example.com/study.pdf' with French voice-over and slide transitions.",
    "Create a video for 'https://example.com/paper.pdf' in MP4 with no subtitles and English narration.",
    "Export video for 'https://example.com/research_paper.pdf' with MOV format and fade transitions.",

    # Edge cases
    "Handle invalid PDF URL input for video conversion.",
    "Request video generation with unsupported voice-over language.",
    "Attempt conversion with missing PDF URL input.",
    "Specify an unsupported output video format.",
    "Request video with unsupported transition style.",
    "Handle corrupted PDF input during conversion."
]
queries += test_cases

In [None]:
for query in queries:
  print(f"Query: {query}")
  tool_sel = await tool_model.get_response(
      system = think_plan_tools_selection,
      chat_history = [],
      query = query,
      model_id = 'gpt-4o-mini',
      temperature = 0.7,
      max_tokens = 2000,
      # tools = tools,
      response_format= {"type":"json_object"}
    )
  if isinstance(tool_sel,Dict):
    raise(tool_sel)
  else:
    if tool_sel.choices[0].finish_reason == 'tool_calls':
      print(tool_sel.choices[0])
      tools = tool_sel.choices[0].message.function_call
      print(f"params: {tools.arguments}")
      print(f"Tool: {tools.name}")
    print(tool_sel.choices[0].message.content)

Query: What are the main conclusions of this study? [PDF: https://example.com/sample-paper1.pdf]
{
  "tool":"extract_data",
  "arguments":{
    "files":["https://example.com/sample-paper1.pdf"]
  }
}
Query: What methods were used in the research? [PDF: https://example.com/sample-paper2.pdf, Citation Style: MLA]
{
  "tool":"chat_with_pdf",
  "arguments":{
    "pdf_file_url":"https://example.com/sample-paper2.pdf",
    "query":"What methods were used in the research?",
    "operation":"get_citation_answers",
    "citation_style":"MLA"
  }
}
Query: How does this research compare to similar studies? [PDF: https://example.com/sample-paper3.pdf, Output Format: JSON, Language: French]


CancelledError: 