### Semantic Router with Function Calling

In [1]:
! pip install -qU semantic-router==0.0.27

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cubinlinker, which is not installed.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires ptxcompiler, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
keras-cv 0.8.2 requires keras-core, which is not installed.
keras-nlp 0.8.1 requires keras-core, which is not installed.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
cudf 23.8.0 requires cuda-python<12.0a0,>=11.7.1, but you have cuda-python 12.3.0 which is incompatible.
cudf 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.1.4 which is incompatible.
cudf 23.8.0 requires protobuf<5,>=4.21, but you have protobuf 3.20.3 which is incompatibl

In [2]:
from semantic_router import Route
from semantic_router.utils.function_call import get_schema

In [3]:
from datetime import datetime
from zoneinfo import ZoneInfo


def get_time(timezone: str) -> str:
    """Finds the current time in a specific timezone.

    :param timezone: The timezone to find the current time in, should
        be a valid timezone from the IANA Time Zone Database like
        "America/New_York" or "Europe/London".
    :type timezone: str
    :return: The current time in the specified timezone."""
    # NOTE(review): `get_schema(get_time)` is built from this function, so the
    # docstring and signature above are presumably part of what the LLM sees;
    # keep them stable when editing.
    # Resolve the zone first so an unknown name fails before anything is printed.
    zone = ZoneInfo(timezone)
    current = datetime.now(zone)
    print(f"Invoked `get_time` function with timezone: `{timezone}`")
    # 24-hour clock, hours and minutes only.
    return f"{current:%H:%M}"

# Build the function schema for `get_time` using semantic-router's
# `get_schema` helper; the route below passes it as `function_schema`.
time_schema = get_schema(get_time)
# Bare expression: only displayed by the notebook when it is the last
# statement of a cell, so here it has no visible effect.
time_schema
# Route for time-related questions. The utterances are example queries for
# this route; `function_schema` attaches the `get_time` schema to it.
# NOTE(review): the variable name `time` would shadow the stdlib `time` module
# if that module were ever imported in this notebook; it is referenced later
# when the routes list is built, so rename both places together if changed.
time = Route(
    name="get_time",
    utterances=[
        "what is the time in new york city?",
        "what is the time in london?",
        "I live in Rome, what time is it?",
    ],
    function_schema=time_schema,
)

In [4]:
# Static route for news-style questions (no function schema attached).
# The utterances are the reference examples for this route, so spelling
# matters: "Indis" was a typo for "India".
news = Route(
    name="news",
    utterances=[
        "What's breaking news of today?",
        "When is the next general election in India?",
        "Why SriLanka was in news last year?",
        "Please tell me what was making into news headline yesterday?",
    ],
)

In [5]:
# Static route used to catch harmful requests (no function schema attached).
# The utterances are the reference examples for this route, so spelling
# matters: "wihtout" was a typo for "without".
harmful = Route(
    name="harmful",
    utterances=[
        "How to rob a bank?",
        "How can I steal a Airplane from the Airport?",
        "How to poke the person sitting next to me without getting noticed?",
    ],
)

In [6]:
# Smoke-test the tool directly, without any routing or LLM involved; the cell
# displays the returned "HH:MM" string.
get_time("America/New_York")


Invoked `get_time` function with timezone: `America/New_York`


'08:04'

In [7]:
# Collect the three routes defined above; this list is handed to RouteLayer.
routes = [time, news, harmful]

In [8]:
! curl -L "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_0.gguf?download=true" -o ./mistral-7b-instruct-v0.2.Q4_0.gguf
! ls mistral-7b-instruct-v0.2.Q4_0.gguf

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1172  100  1172    0     0   3762      0 --:--:-- --:--:-- --:--:--  3756
100 3918M  100 3918M    0     0   346M      0  0:00:11  0:00:11 --:--:--  323M
mistral-7b-instruct-v0.2.Q4_0.gguf


In [9]:
from semantic_router.encoders import HuggingFaceEncoder

# Instantiate the HuggingFace encoder with its library defaults; it is passed
# to RouteLayer as the `encoder`. On first use this downloads tokenizer and
# model files (see the progress bars in the cell output).
# NOTE(review): the default model name is not visible here — confirm it in the
# semantic-router docs if the exact embedding model matters.
encoder = HuggingFaceEncoder()

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [10]:
! pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.55.tar.gz (36.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.8/36.8 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ | done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25l- \ | / - \ done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ 

In [11]:
from semantic_router import RouteLayer

from llama_cpp import Llama
from semantic_router.llms.llamacpp import LlamaCppLLM

# Load the GGUF model downloaded earlier into llama.cpp.
_llm = Llama(
    model_path="./mistral-7b-instruct-v0.2.Q4_0.gguf",
    n_gpu_layers=-1,  # use -1 when a GPU is available; otherwise comment this line out
    n_ctx=2048,
)
# Turn verbosity off after construction (the loader output above is printed
# while the model is being constructed, before this line runs).
_llm.verbose = False
# Wrap the raw llama.cpp model in semantic-router's LLM adapter; this wrapper
# is what RouteLayer receives as `llm`.
# NOTE(review): max_tokens=None presumably means "no explicit cap" — confirm
# against the LlamaCppLLM documentation.
llm = LlamaCppLLM(name="Mistral-7B-v0.2-Instruct", llm=_llm, max_tokens=None)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from ./mistral-7b-instruct-v0.2.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 l

In [12]:
# Assemble the routing layer from the encoder, the route list and the local
# LLM wrapper defined in the cells above. `rl(query)` is called later and its
# result exposes `.name` and `.function_call`.
rl = RouteLayer(encoder=encoder, routes=routes, llm=llm)

[32m2024-03-03 13:06:09 INFO semantic_router.utils.logger local[0m


In [13]:
def llm_call(question, max_tokens=512):
    """Return the completion text the local llama.cpp model produces for a prompt.

    :param question: Prompt text, passed unchanged to the module-level `_llm`.
    :param max_tokens: Value forwarded as `max_tokens` to `_llm`. Defaults to
        512, the value that was previously hard-coded, so existing callers
        behave exactly as before.
    :return: The ``text`` field of the first entry in the response's
        ``choices`` list.
    """
    output = _llm(question, max_tokens=max_tokens)
    return output['choices'][0]['text']

In [14]:
def steer_outputs(out,qns):
    """Act on a routing decision by printing the matching response.

    :param out: Routing result; its ``name`` selects the branch and, for the
        ``get_time`` route, its ``function_call`` supplies the keyword
        arguments for ``get_time``.
    :param qns: The original question; only used when no known route matched,
        in which case it is sent to ``llm_call``.
    :return: None. All results are printed.
    """
    # Routes that are answered with a fixed message.
    canned_replies = {
        'news': "Calling RAG for News retrieval API",
        'harmful': 'Harmful question. Please refine your question',
    }
    route_name = out.name

    if route_name == 'get_time':
        # Arguments were extracted upstream; unpack them into the tool call.
        current_time = get_time(**out.function_call)
        print(current_time)
    elif route_name in canned_replies:
        print(canned_replies[route_name])
    else:
        # Unknown or missing route: fall back to the plain LLM.
        print('\n Calling LLM')
        answer = llm_call(qns)
        print(answer)
        

In [15]:
# One sample prompt per expected outcome: the time tool, the news route,
# the harmful route, and a question that should match no route.
questions = [
    "what's the time in New York right now?",
    "Why SriLanka was in news last year?",
    "How to rob a house?",
    "What is Llama2?",
]

# Echo each prompt, route it, act on the routing decision, then print a
# separator line.
for qns in questions:
    print(qns)
    out = rl(qns)
    steer_outputs(out, qns)
    print("===============")

from_string grammar:
root ::= object 
object ::= [{] ws object_11 [}] ws 
value ::= object | array | string | number | value_6 ws 
array ::= [[] ws array_15 []] ws 
string ::= ["] string_18 ["] ws 
number ::= number_19 number_25 number_29 ws 
value_6 ::= [t] [r] [u] [e] | [f] [a] [l] [s] [e] | [n] [u] [l] [l] 
ws ::= ws_31 
object_8 ::= string [:] ws value object_10 
object_9 ::= [,] ws string [:] ws value 
object_10 ::= object_9 object_10 | 
object_11 ::= object_8 | 
array_12 ::= value array_14 
array_13 ::= [,] ws value 
array_14 ::= array_13 array_14 | 
array_15 ::= array_12 | 
string_16 ::= [^"\] | [\] string_17 
string_17 ::= ["\/bfnrt] | [u] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 
string_18 ::= string_16 string_18 | 
number_19 ::= number_20 number_21 
number_20 ::= [-] | 
number_21 ::= [0-9] | [1-9] number_22 
number_22 ::= [0-9] number_22 | 
number_23 ::= [.] number_24 
number_24 ::= [0-9] number_24 | [0-9] 
number_25 ::= number_23 | 
number_26 ::= [eE] number_27 number

what's the time in New York right now?


[32m2024-03-03 13:09:39 INFO semantic_router.utils.logger LLM output: {"timezone": "America/New_York"}[0m
[32m2024-03-03 13:09:39 INFO semantic_router.utils.logger Function inputs: {'timezone': 'America/New_York'}[0m


Invoked `get_time` function with timezone: `America/New_York`
08:09
Why SriLanka was in news last year?
Calling RAG for News retrieval API
How to rob a house?
Harmful question. Please refine your question
What is Llama2?

 Calling LLM

Llama2 is a multi-threaded, open-source image recognition model that can be used for various computer vision tasks such as object detection, semantic segmentation, and instance segmentation. It was developed by Facebook AI and is based on the Deit-S/11 model, which is a state-of-the-art vision transformer model.

What makes Llama2 different from other image recognition models?
One of the key differences between Llama2 and other image recognition models is its efficiency. Llama2 is designed to be fast and memory-efficient, making it well-suited for deployment on edge devices or in cloud environments with limited resources. It uses a novel architecture called "batch attendance maps" (BAMs) to process multiple input images in parallel, which significantly r