In [1]:
from IPython.display import display, Markdown

In [21]:
from vertexai.preview.generative_models import GenerativeModel

In [50]:
from vertexai import generative_models

# Safety config
SAFETY_CONFIG = {
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_NONE,
    generative_models.HarmCategory.HARM_CATEGORY_UNSPECIFIED: generative_models.HarmBlockThreshold.BLOCK_NONE,
}

In [51]:
def gemini_api(prompt: str, temperature = 0) -> str:
    model = GenerativeModel("gemini-pro", safety_settings=SAFETY_CONFIG)
    response = model.generate_content(
        prompt
    )
    output = response.text
    return output


In [62]:
doctors_notes = """49 y/o Male with chronic macular rash to face & hair, worse in beard, eyebrows & nares.
Itchy, flaky, slightly scaly. Moderate response to OTC steroid cream"""

In [63]:
from pydantic import BaseModel, Field
from typing import List
class Symptom(BaseModel):
    symptom: str = Field(description="Symptom that a patient is experiencing")
    affected_area: str= Field(description="What part of the body the symptom is affecting")

class Medication(BaseModel):
    medication: str = Field(description="Name of the medication the patient is taking")
    response: str = Field(description="How the patient is responding to the medication")

class PatientInfo(BaseModel):
    gender: str = Field(description="Patient's gender")
    age: int = Field(description="Patient's age")
    symptoms: List[Symptom] = Field(description="Symptoms that the patient is currently experiencing. Each symptom should be classified into  separate item in the list.")
    current_meds: List[Medication] = Field(description="Medications the patient is currently taking and their response")

In [64]:
from guardrails.validators import ValidRange, ValidChoices

class Symptom(BaseModel):
    symptom: str = Field(description="Symptom that a patient is experiencing")
    affected_area: str= Field(
        description="What part of the body the symptom is affecting",
        #(2)!
        validators=[ValidChoices(choices=['head', 'neck', 'chest'], on_fail='reask')]
    ) 

class Medication(BaseModel):
    medication: str = Field(description="Name of the medication the patient is taking")
    response: str = Field(description="How the patient is responding to the medication")

class PatientInfo(BaseModel):
    gender: str = Field(description="Patient's gender")
    age: int = Field(
        description="Patient's age",
        #(1)!
        validators=[ValidRange(min=0, max=100)]
    )
    symptoms: List[Symptom] = Field(description="Symptoms that the patient is currently experiencing. Each symptom should be classified into  separate item in the list.")
    current_meds: List[Medication] = Field(description="Medications the patient is currently taking and their response")

In [65]:
prompt = """
Given the following doctor's notes about a patient,
please extract a dictionary that contains the patient's information.  <!-- (1)! -->

${doctors_notes}  <!-- (2)! -->

${gr.complete_json_suffix_v2}  <!-- (3)! -->
"""

In [66]:
from pydantic import BaseModel, Field
from typing import List
from guardrails.validators import ValidRange, ValidChoices

prompt = """
Given the following doctor's notes about a patient, please extract a dictionary that contains the patient's information.

${doctors_notes}

${gr.complete_json_suffix_v2}
"""

class Symptom(BaseModel):
    symptom: str = Field(description="Symptom that a patient is experiencing")
    affected_area: str = Field(description="What part of the body the symptom is affecting", validators=[ValidChoices(choices=['head', 'neck', 'chest'], on_fail="reask")])

class Medication(BaseModel):
    medication: str = Field(description="Name of the medication the patient is taking")
    response: str = Field(description="How the patient is responding to the medication")


class PatientInfo(BaseModel):
    gender: str = Field(description="Patient's gender")
    age: int = Field(validators=[ValidRange(min=0, max=100, on_fail="fix")])
    symptoms: List[Symptom] = Field(description="Symptoms that the patient is currently experiencing. Each symptom should be classified into a separate item in the list.")
    current_meds: List[Medication] = Field(description="Medications the patient is currently taking and their response")

In [67]:
import guardrails as gd

# From pydantic:
guard = gd.Guard.from_pydantic(output_class=PatientInfo, prompt=prompt)

In [68]:
# Function that takes the prompt as a string and returns the LLM output as string
def my_llm_api(prompt: str, **kwargs) -> str:
    """Custom LLM API wrapper.

    Args:
        prompt (str): The prompt to be passed to the LLM API
        **kwargs: Any additional arguments to be passed to the LLM API

    Returns:
        str: The output of the LLM API
    """

    output = gemini_api(prompt)

    return output

In [73]:
from searcharray import SearchArray
import pandas as pd
import numpy as np

In [74]:
chat_transcript = [
  "Hi this is Doug, I'd like to complain about the weather",
  "Doug, this is Tom, support for Earth's Climate, how can we help?",
  "Tom, can I speak to your manager?",
  "Hi, this is Sue, Tom's boss. What can I do for you?",
  "I'd like to complain about the ski conditions in West Virginia"
]

msgs = pd.DataFrame({"name": ["Doug", "Doug", "Tom", "Sue", "Doug"],
                     "msg": chat_transcript})
msgs

Unnamed: 0,name,msg
0,Doug,"Hi this is Doug, I'd like to complain about th..."
1,Doug,"Doug, this is Tom, support for Earth's Climate..."
2,Tom,"Tom, can I speak to your manager?"
3,Sue,"Hi, this is Sue, Tom's boss. What can I do for..."
4,Doug,I'd like to complain about the ski conditions ...


In [75]:
msgs['msg_tokenized'] = SearchArray.index(msgs['msg'])
msgs

Unnamed: 0,name,msg,msg_tokenized
0,Doug,"Hi this is Doug, I'd like to complain about th...","Terms({'complain', 'weather', 'this', 'about',..."
1,Doug,"Doug, this is Tom, support for Earth's Climate...","Terms({'help?', 'this', 'can', 'we', 'Tom,', '..."
2,Tom,"Tom, can I speak to your manager?","Terms({'can', 'to', 'Tom,', 'manager?', 'I', '..."
3,Sue,"Hi, this is Sue, Tom's boss. What can I do for...","Terms({'this', 'can', 'boss.', 'do', 'Hi,', ""T..."
4,Doug,I'd like to complain about the ski conditions ...,"Terms({'complain', 'to', 'Virginia', 'the', 'W..."


In [81]:
msgs['score'] = msgs['msg_tokenized'].array.score("weather")
msgs.sort_values('score', ascending=False)

Unnamed: 0,name,msg,msg_tokenized,score
0,Doug,"Hi this is Doug, I'd like to complain about th...","Terms({'complain', 'weather', 'this', 'about',...",0.620554
1,Doug,"Doug, this is Tom, support for Earth's Climate...","Terms({'help?', 'this', 'can', 'we', 'Tom,', '...",0.0
2,Tom,"Tom, can I speak to your manager?","Terms({'can', 'to', 'Tom,', 'manager?', 'I', '...",0.0
3,Sue,"Hi, this is Sue, Tom's boss. What can I do for...","Terms({'this', 'can', 'boss.', 'do', 'Hi,', ""T...",0.0
4,Doug,I'd like to complain about the ski conditions ...,"Terms({'complain', 'to', 'Virginia', 'the', 'W...",0.0


In [1]:
from vectorengine import VectorEngine

In [2]:
ec_store = VectorEngine("/Users/arjun/Documents/github/research-agent/docs/sop-docs/euroclear", collection_name="ec_sop")

Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 0
Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Add of existing embedding ID: 4
Add of existing embedding ID: 5
Add of existing embedding ID: 6
Add of existing embedding ID: 7
Add of existing embedding ID: 8
Add of existing embedding ID: 9
Add of existing embedding ID: 0
Add of e

In [3]:
ec_store.doc_df

Unnamed: 0,id,file_path,title,content,content_vector,keywords,vector_id
0,1,/Users/arjun/Documents/github/research-agent/d...,external-settlement,## External settlement\n19/09/2022\n\n### What...,"[-0.014352154918015003, -0.008465911261737347,...",- external-settlement\n- Euroclear client\n- c...,0
1,2,/Users/arjun/Documents/github/research-agent/d...,transaction-lifecycle,## What is the lifecycle of transactions?\n\nT...,"[-0.018114324659109116, -0.008524619974195957,...",- transaction-lifecycle\n- Input\n- Validation...,1
2,3,/Users/arjun/Documents/github/research-agent/d...,bridge-settlement,## Bridge settlement\n\n### What is a Bridge s...,"[-0.013597985729575157, 0.005742959678173065, ...",- Bridge settlement\n- Euroclear Bank\n- Clear...,2
3,4,/Users/arjun/Documents/github/research-agent/d...,status-reporting,"## Unmatched, unsettled, alleged reporting\n21...","[-0.02043256163597107, -0.014711612835526466, ...",- status-reporting\n- unmatched\n- unsettled\n...,3
4,5,/Users/arjun/Documents/github/research-agent/d...,internal-settlement,## What is an internal settlement transaction?...,"[-0.03241455554962158, 0.045177578926086426, -...",internal-settlement\nEuroclear Bank\npayment c...,4
5,6,/Users/arjun/Documents/github/research-agent/d...,australia-market,## Australia - Market basics\n31/01/2024\n\n##...,"[-0.005053657107055187, -0.01641596108675003, ...",- Australia\n- Market basics\n- Equities servi...,5
6,7,/Users/arjun/Documents/github/research-agent/d...,united-kingdon-market,## United Kingdom - Market basics\n04/11/2021\...,"[-0.007458867039531469, -0.010519517585635185,...",- United Kingdom\n- Market basics\n- Direct li...,6
7,8,/Users/arjun/Documents/github/research-agent/d...,china-market,## China - Market basics\n10/11/2023\n\n### Wh...,"[0.00048036323278211057, -0.017352476716041565...",- china-market\n- Direct account in SHCH\n- In...,7
8,9,/Users/arjun/Documents/github/research-agent/d...,hong-kong-market,## Hong Kong - Market basics\n17/11/2023\n\n##...,"[-0.0014984990702942014, -0.02539912983775139,...",- Hong Kong\n- Market basics\n- Safekeeping\n-...,8
9,10,/Users/arjun/Documents/github/research-agent/d...,united-states-market,## United States - Market basics\n\n### Safeke...,"[0.004938406404107809, -0.007323066238313913, ...",- United States\n- Market basics\n- Safekeepin...,9


In [6]:
fusion = ec_store.rank_fusion("china", 4, 4)

<class 'list'>
['china']


                 with a different tokenizer.

                 Also. This is slow.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['bm25_ranking'].fillna(default_bm25_rank, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['dense_ranking'].fillna(default_dense_rank, inplace=True)
The behavior wi

In [7]:
fusion

Unnamed: 0,id,file_path,title_bm25,content_bm25,content_vector,keywords_bm25,vector_id,clean_content,bm25_tokenized,bm25_score,bm25_ranking,score,title_dense,content_dense,keywords_dense,dense_ranking,rank_score
1,2,/Users/arjun/Documents/github/research-agent/d...,transaction-lifecycle,## What is the lifecycle of transactions?\n\nT...,"[-0.018114324659109116, -0.008524619974195957,...",- transaction-lifecycle\n- Input\n- Validation...,1.0,is lifecycle transactions there are possibly s...,"Terms({'confirmation', 'unsettled', '11', '000...",0.0,4.0,,,,,5.0,4.444444
3,7,,,,,,,,Terms(set()),,5.0,0.617657,united-kingdon-market,## United Kingdom - Market basics\n04/11/2021\...,- United Kingdom\n- Market basics\n- Direct li...,4.0,4.444444
0,1,/Users/arjun/Documents/github/research-agent/d...,external-settlement,## External settlement\n19/09/2022\n\n### What...,"[-0.014352154918015003, -0.008465911261737347,...",- external-settlement\n- Euroclear client\n- c...,0.0,external settlement 19 09 2022 is external set...,"Terms({'custodian', 'linked', 'nederland', 're...",0.0,3.0,,,,,5.0,3.75
5,9,/Users/arjun/Documents/github/research-agent/d...,hong-kong-market,## Hong Kong - Market basics\n17/11/2023\n\n##...,"[-0.0014984990702942014, -0.02539912983775139,...",- Hong Kong\n- Market basics\n- Safekeeping\n-...,8.0,hong kong market basics 17 11 2023 safekeeping...,"Terms({'difference', 'delivery', 'info', 'auth...",0.66042,2.0,0.725239,hong-kong-market,## Hong Kong - Market basics\n17/11/2023\n\n##...,- Hong Kong\n- Market basics\n- Safekeeping\n-...,2.0,2.0


In [9]:
out = ec_store.search_df("china", 4)

In [10]:
out

Unnamed: 0,id,score,title,content,keywords
9,6,0.755118,united-states-market,## United States - Market basics\n\n### Safeke...,- United States\n- Market basics\n- Safekeepin...
8,9,0.725239,hong-kong-market,## Hong Kong - Market basics\n17/11/2023\n\n##...,- Hong Kong\n- Market basics\n- Safekeeping\n-...
7,8,0.663331,china-market,## China - Market basics\n10/11/2023\n\n### Wh...,- china-market\n- Direct account in SHCH\n- In...
6,7,0.617657,united-kingdon-market,## United Kingdom - Market basics\n04/11/2021\...,- United Kingdom\n- Market basics\n- Direct li...


In [11]:
bmout = ec_store.bm25_rank("china", out, k=4)

<class 'list'>
['china']


In [12]:
bmout

Unnamed: 0,id,score,title,content,keywords,clean_content,bm25_tokenized,bm25_score
7,8,0.663331,china-market,## China - Market basics\n10/11/2023\n\n### Wh...,- china-market\n- Direct account in SHCH\n- In...,china market basics 10 11 2023 what's specific...,"Terms({'description', 'difference', 'delivery'...",0.539762
8,9,0.725239,hong-kong-market,## Hong Kong - Market basics\n17/11/2023\n\n##...,- Hong Kong\n- Market basics\n- Safekeeping\n-...,hong kong market basics 17 11 2023 safekeeping...,"Terms({'difference', 'delivery', 'info', 'auth...",0.312336
9,6,0.755118,united-states-market,## United States - Market basics\n\n### Safeke...,- United States\n- Market basics\n- Safekeepin...,united states market basics safekeeping govern...,"Terms({'difference', 'delivery', 'info', '1bd'...",0.0
6,7,0.617657,united-kingdon-market,## United Kingdom - Market basics\n04/11/2021\...,- United Kingdom\n- Market basics\n- Direct li...,united kingdom market basics 04 11 2021 what's...,"Terms({'request', 'description', 'difference',...",0.0


In [None]:
# do bm25 first then vector? nope. maybe just bm25