In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import sys
sys.path.append('../')

from dotenv import load_dotenv, find_dotenv
envs = load_dotenv(find_dotenv(), override=True)

from litellm import batch_completion
from src.database.database_utils import get_weaviate_client
from src.llm.llm_interface import LLM
from src.llm.llm_utils import get_token_count, load_azure_openai
from src.llm.prompt_templates import huberman_system_prompt
from app_features import generate_prompt_series
import os
import re
import tiktoken
from tiktoken import Encoding
from rich import print
import asyncio
import nest_asyncio
nest_asyncio.apply()

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv(), override=True)

### Set Constants
---

### Weaviate

In [10]:
#get weaviate client

weave_client = get_weaviate_client()
collections = weave_client.show_all_collections()
print(collections)

  return self.fget.__get__(instance, owner)()


### LLM

In [11]:
turbo_model = "gpt-3.5-turbo-0125"
claude_model = "claude-3-haiku-20240307"
azure_model = "gpt-35-turbo" #"gpt-4-32k"

In [12]:
claude = LLM(model_name=claude_model, api_key=os.environ['ANTHROPIC_API_KEY'])
turbo = LLM(model_name=turbo_model)
azure = load_azure_openai(model_name=azure_model)

### Retrieval

In [13]:
query = 'What does Cal Newport have to say about avoiding distractions'

results = weave_client.hybrid_search(request=query,
                                     collection_name=collections[2],
                                     return_properties=['content', 'title', 'summary','guest'],
                                     limit=5
                                    )



### Prompt Engineering

In [14]:
assistant_message = generate_prompt_series(query, results[:5])
max_tokens = 500
token_count = get_token_count(assistant_message) + get_token_count(huberman_system_prompt) + max_tokens
print(f'Rough Total Token Count: {token_count}')
# print(assistant_message)

### LLM Call

In [103]:
%%time
token_count = 0

async def gather_tasks():
    tasks = [azure.achat_completion( system_message=huberman_system_prompt,
                                       assistant_message=assistant_message,
                                       temperature=1.0,
                                       max_tokens=max_tokens,
                                       raw_response=True
                                       ) for x in range(3)]
    responses = await asyncio.gather(*tasks)
    return responses

# for i in range(1,16):
#     completion = await 
#     tokens = completion.usage.total_tokens
#     token_count += tokens
#     print(f'{i}.) Running Token Count: {token_count}')
#     print(completion.choices[0].message.content)

CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 13.8 µs


In [104]:
import time
start = time.perf_counter()
responses = asyncio.run(gather_tasks())
end = time.perf_counter() - start
print(f'{round(end,2)} seconds')

In [106]:
%%time
message, cost = azure.chat_completion(huberman_system_prompt, assistant_message, temperature=1.0, return_cost=True)

CPU times: user 40.8 ms, sys: 4.01 ms, total: 44.9 ms
Wall time: 1.34 s


In [111]:
2800/1000 * 0.0005 + (500/1000*0.0015)

0.00215

In [108]:
print(cost)
print(message)

In [56]:
async def async_task(message: list[dict]):
    response = await acompletion(model="gpt-3.5-turbo-1106", messages=message, temperature=1.0)
    return response

In [64]:
%%time
async def gather(prompts: list[str]):
    tasks = []
    for p in prompts:
        messages = [{"role": "system", "content": "You are a highly experienced data annotator.  Your job is to create two questions that can be answered from the provided context."},
                    {"role": "assistant", "content": prompt.format(context=p)}]
        tasks.append(async_task(messages))
    asyncio.gather(*tasks)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 6.91 µs


In [65]:
%%time
responses = asyncio.run(gather(prompts))

CPU times: user 328 µs, sys: 109 µs, total: 437 µs
Wall time: 416 µs


In [63]:
responses

### Mulitple LLM calls single batch

In [5]:
query = 'What does Cal Newport have to say about avoiding distractions'

In [6]:
system_msg = """
You are an AI language model assistant. Your task is to generate {n}
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines.\n\nOriginal question: {question}
""".format(n=3, question=query)

In [7]:
response = azure.chat_completion(system_msg, temperature=1.0, raw_response=False,  api_key=os.environ['AZURE_OPENAI_API_KEY'],
                      api_version=os.environ['AZURE_OPENAI_API_VERSION'],
                      api_base=os.environ['AZURE_OPENAI_ENDPOINT'])

TypeError: litellm.main.completion() got multiple values for keyword argument 'api_key'

In [120]:
response

"1. How does Cal Newport discuss strategies for minimizing distractions?\n2. What are Cal Newport's recommendations for staying focused and avoiding distractions?\n3. What insights does Cal Newport provide on the topic of mitigating distractions and maintaining concentration?"

In [123]:
questions = [
                re.sub(r"^[-\d]+[\).\s]", "", question).strip() for question in response.split('\n')
            ]
questions.append(query)
questions

['How does Cal Newport discuss strategies for minimizing distractions?',
 "What are Cal Newport's recommendations for staying focused and avoiding distractions?",
 'What insights does Cal Newport provide on the topic of mitigating distractions and maintaining concentration?',
 'What does Cal Newport have to say about avoiding distractions']

In [144]:
responses = batch_completion(model=f'azure/{azure_model}', messages=messages, temperature=1.0, max_tokens=500,  api_key=os.environ['AZURE_OPENAI_API_KEY'],
                      api_version=os.environ['AZURE_OPENAI_API_VERSION'],
                      api_base=os.environ['AZURE_OPENAI_ENDPOINT'])

In [129]:
retrievals = [weave_client.hybrid_search(request=query,
                                     collection_name=collections[2],
                                     return_properties=['content', 'title', 'summary','guest'],
                                     limit=3
                                    ) for query in questions]

In [131]:
assist_messages = [generate_prompt_series(q, retrievals[i]) for i, q in enumerate(questions)]

In [138]:
messages = [ [{'role':'system','content':huberman_system_prompt},
               {'role':'assistant', 'content': mess}] for mess in assist_messages]

In [146]:
for response in responses:
    print(response.choices[0].message.content)

In [15]:
cohere_messages = [{'role':'system','content':huberman_system_prompt},
                   {'role':'assistant', 'content': assistant_message}]

In [20]:
cohere = LLM(model_name='command-r-plus', api_key=os.environ['COHERE_API_KEY'])

In [21]:
cohere.chat_completion(system_message=huberman_system_prompt,
                       assistant_message=assistant_message)

'Cal Newport emphasizes the importance of eliminating distractions to optimize cognitive performance and enhance focus and productivity. He suggests that individuals should minimize their engagement with distractions such as social media, smartphones, and excessive emails. Newport also cautions against the addictive nature of these technologies, particularly for young people, and recommends reconsidering unrestricted internet usage. By prioritizing deep work and creating specific protocols for different tasks, individuals can improve their ability to focus and perform at their cognitive best.'