## Section 1: Fetch Drugs Detail from FDA and Save it into one JSON file
### Step 1: Mount drive and import (this one use google drive)

In [None]:
import requests
import zipfile
import io
import os
import json
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


### Step 2: Def the download function

In [None]:
def download_and_extract_json(urls, output_dir="json_files"):
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    json_files = []

    for url in urls:
        try:
            # Download the zip file
            response = requests.get(url)
            response.raise_for_status()  # Check for HTTP errors

            # Unzip the downloaded file
            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
                for file_name in z.namelist():
                    if file_name.endswith('.json'):
                        # Extract the JSON file
                        json_content = z.read(file_name)
                        json_data = json.loads(json_content)

                        # Save the JSON file
                        json_path = os.path.join(output_dir, file_name)
                        with open(json_path, 'w') as json_file:
                            json.dump(json_data, json_file, indent=4)
                        json_files.append(json_path)

                        print(f"Downloaded and extracted: {file_name}")

        except requests.exceptions.RequestException as e:
            print(f"Failed to download {url}: {e}")
        except zipfile.BadZipFile as e:
            print(f"Failed to unzip file from {url}: {e}")
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON from {url}: {e}")

    return json_files


### Step 3: Start Download

In [None]:
# Example usage
urls = [
    'https://download.open.fda.gov/drug/label/drug-label-0001-of-0012.json.zip',
    'https://download.open.fda.gov/drug/label/drug-label-0002-of-0012.json.zip',
    'https://download.open.fda.gov/drug/label/drug-label-0003-of-0012.json.zip',
    'https://download.open.fda.gov/drug/label/drug-label-0004-of-0012.json.zip',
    'https://download.open.fda.gov/drug/label/drug-label-0005-of-0012.json.zip',
    'https://download.open.fda.gov/drug/label/drug-label-0006-of-0012.json.zip',
    'https://download.open.fda.gov/drug/label/drug-label-0007-of-0012.json.zip',
    'https://download.open.fda.gov/drug/label/drug-label-0008-of-0012.json.zip',
    'https://download.open.fda.gov/drug/label/drug-label-0009-of-0012.json.zip',
    'https://download.open.fda.gov/drug/label/drug-label-0010-of-0012.json.zip',
    'https://download.open.fda.gov/drug/label/drug-label-0011-of-0012.json.zip',
    'https://download.open.fda.gov/drug/label/drug-label-0012-of-0012.json.zip'


    # Add more URLs as needed
]

output_dir = '/content/drive/MyDrive/FDA_Data_Set'
os.makedirs(output_dir, exist_ok=True)

downloaded_json_files = download_and_extract_json(urls,output_dir=output_dir)

Downloaded and extracted: drug-label-0001-of-0012.json
Downloaded and extracted: drug-label-0002-of-0012.json
Downloaded and extracted: drug-label-0003-of-0012.json
Downloaded and extracted: drug-label-0004-of-0012.json
Downloaded and extracted: drug-label-0005-of-0012.json
Downloaded and extracted: drug-label-0006-of-0012.json
Downloaded and extracted: drug-label-0007-of-0012.json
Downloaded and extracted: drug-label-0008-of-0012.json
Downloaded and extracted: drug-label-0009-of-0012.json
Downloaded and extracted: drug-label-0010-of-0012.json
Downloaded and extracted: drug-label-0011-of-0012.json
Downloaded and extracted: drug-label-0012-of-0012.json


### Step 4: Ask LLM to Extra the Most Important JSON Properties:

#### Prompt:
```
This is a JSON object for a drug label. Say if you are a regular drug user, what would the the top 10 items you are most interested in?
Output should follow this JSON format, for example
{ description: "manufacturer", property: "openfda.manufacturer_name"}
```
####File Input (Single JSON object)

https://www.icloud.com/attachment/?u=https%3A%2F%2Fcvws.icloud-content.com%2FB%2FARW0mK0u5NMVvgWPKpruotC21r0FARStkgPOxx-tDwA6zw_MKccSeSmZ%2F%24%7Bf%7D%3Fo%3DAmXWpsdRZ_s3-1Ka3DxM39V0tZ0wfYrg4UDXRjAcLWSC%26v%3D1%26x%3D3%26a%3DCAogrUaKeKFBzAU-K1panPWpjKqei45e4Zo0jgG5ZilphxcSbxCy0vCrmDIYsuLr_6EyIgEAUgS21r0FWgQSeSmZaidJxGCo1JYtduGCwtBKepEj-YsAIU3YjuFh8OOeNLrCWscCdyLAjq1yJ52I9nsfL8D_GS21vc9Mu1z4oqxLUPGmM2CYRPJi6v9wKQSlir6-Kw%26e%3D1727113392%26fl%3D%26r%3DCDD5711F-E306-41A5-948C-7E730424ED8C-1%26k%3D%24%7Buk%7D%26ckc%3Dcom.apple.clouddocs%26ckz%3DiCloud.me.damir.dropover-mac%26p%3D140%26s%3DCBVbwaUrWC6v0w7EOhg07-C3k0g&uk=oDsC6nvo0z2mC_HJiFMXCw&f=singleJSON_Template.docx&sz=47205

#### Output
```
[
    { "description": "Drug Name", "property": "openfda.brand_name" },
    { "description": "Generic Name", "property": "openfda.generic_name" },
    { "description": "Manufacturer", "property": "openfda.manufacturer_name" },
    { "description": "Active Ingredients", "property": "active_ingredient" },
    { "description": "Purpose", "property": "purpose" },
    { "description": "Indications and Usage", "property": "indications_and_usage" },
    { "description": "Dosage and Administration", "property": "dosage_and_administration" },
    { "description": "Warnings", "property": "warnings" },
    { "description": "Adverse Reactions", "property": "adverse_reactions" },
    { "description": "Storage Information", "property": "storage_and_handling" }
]
```

### Step 5: Start Extrating Info from JSONs

In [None]:
import os
import json
from typing import List

# Define the properties to extract
curated_properties = [
    { "description": "Drug_Name", "property": "openfda.brand_name" },
    { "description": "ID", "property": "id" },
    { "description": "Manufacturer", "property": "openfda.manufacturer_name" },
    { "description": "Purpose", "property": "purpose" },
    { "description": "Indications_and_Usage", "property": "indications_and_usage" },
    { "description": "Active_Ingredients", "property": "active_ingredient" },
    { "description":"do_not_USE","property":"do_not_use"},
    {"description":"STOP_USE","property":"stop_use"},
    {"description":"DOSAGE_INSTRUCTION","property":"dosage_and_administration"},
    { "description": "Storage_Information", "property": "storage_and_handling" },
    { "description": "Ask_Doctor", "property": "ask_doctor" },
]

# Define the search and match list
contain = ["HUMAN OTC DRUG"]

def flatten_json(y):
    """Flatten a nested JSON object into a single string."""
    out = []

    def flatten(x, name=''):
        if isinstance(x, dict):
            for a in x:
                flatten(x[a], name + a + '_')
        elif isinstance(x, list):
            for i, a in enumerate(x):
                flatten(a, name + str(i) + '_')
        else:
            out.append(str(x))

    flatten(y)
    return ' '.join(out)

def matches_search(flattened_text: str, search_terms: List[str]) -> bool:
    """Check if any of the search terms are in the flattened text."""
    return any(term in flattened_text for term in search_terms)

def curate_json(json_data: List[dict]) -> List[dict]:
    curated_list = []
    for item in json_data:
        flattened_text = flatten_json(item)
        if matches_search(flattened_text, contain):
            curated_item = {}
            for prop in curated_properties:
                value = extract_nested_value(item, prop['property'])
                if value:
                    curated_item[prop['description']] = value
            if curated_item:
                curated_list.append(curated_item)
    return curated_list

def extract_nested_value(data, key_sequence):
    keys = key_sequence.split('.')
    for key in keys:
        data = data.get(key, None)
        if data is None:
            return None
    return data

# Load all JSON files in a directory
directory_path = '/content/drive/MyDrive/FDA_Data_Set'  # Replace with your directory path
all_curated_data = []

for filename in os.listdir(directory_path):
    if filename.endswith('.json'):
        with open(os.path.join(directory_path, filename), 'r') as f:
            data = json.load(f)
            if 'results' in data:
                json_array = data['results']
                curated_data = curate_json(json_array)
                all_curated_data.extend(curated_data)

# Save the final curated JSON array to a file
output_file = '/content/drive/MyDrive/test-extration/curated_data.json'
with open(output_file, 'w') as f:
    json.dump(all_curated_data, f, indent=4)

print(f"Curated data has been saved to {output_file}")

Curated data has been saved to /content/drive/MyDrive/test-extration/curated_data.json


## Step 6: Sort An Upload to Upstash Vector Store

# New Section

1. Install Upstash



In [None]:
%pip install upstash_vector

Collecting upstash_vector
  Downloading upstash_vector-0.5.0-py3-none-any.whl.metadata (11 kB)
Collecting httpx<1,>=0.23.0 (from upstash_vector)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->upstash_vector)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->upstash_vector)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading upstash_vector-0.5.0-py3-none-any.whl (14 kB)
Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/

2. Initiate Credentials

In [None]:
from upstash_vector import Index
from google.colab import userdata

index = Index(url=userdata.get("UPSTASH_VECTOR_REST_URL"), token=userdata.get("UPSTASH_VECTOR_REST_TOKEN"))

3. Upsert Data into Upstash (Test)

In [None]:
import json

# Your JSON object
json_obj = {
    "Drug_Name": ["SPF 50 Mineral Sunscreen Babyganics"],
    "ID": "f3ca0977-4121-4da8-9a14-e4fcdd2e932d",
    "Manufacturer": ["KAS Direct LLC dba BabyGanics"],
    "Purpose": ["Purpose Sunscreen"],
    "Indications_and_Usage": ["Uses helps prevent sunburn if used as directed with"],
    "Active_Ingredients": ["Active ingredients Titanium dioxide 3%, Zinc oxide 12%"],
    "do_not_USE": ["Do not use on damaged or broken skin"],
    "STOP_USE": ["Stop use and ask a doctor if rash occurs"],
    "DOSAGE_INSTRUCTION": ["Directions apply liberally 15 minutes"],
    "Storage_Information": ["Other information protect this product from excessive heat and direct sun"]
}

# Extract values
id_value = json_obj["ID"]
metadata_field = {
    "ID": json_obj["ID"],
    "Drug_Name": json_obj["Drug_Name"][0],
    "Manufacturer": json_obj["Manufacturer"][0]
}
raw_data = json.dumps({k: ', '.join(v) if isinstance(v, list) else v for k, v in json_obj.items() if k != "ID"})

# Prepare data for upsert
index.upsert(
    vectors=[
        (id_value, raw_data, metadata_field),
    ]
)

'Success'

4. Dump All Data Into Vector Store

In [None]:
import json

json_file_path = '/content/drive/MyDrive/test-extration/curated_data.json'

# Open and read the JSON file
with open(json_file_path, 'r') as file:
    json_array = json.load(file)


# Optional: start from a certain point with id
start_index = 20774

new_array = json_array[start_index:]

# Iterate over each object in the array
for current_index,obj in enumerate(new_array):
    # Extract values
    id_value = obj["ID"]
    metadata_field = {
        "ID": obj["ID"],
        "Drug_Name": obj.get("Drug_Name", [""])[0] if obj.get("Drug_Name") else "", # Use get() with a default value
        "Manufacturer": obj.get("Manufacturer", [""])[0] if obj.get("Manufacturer") else "" # Use get() with a default value
    }

    # Remove brackets by joining the list items into a single string
    raw_data = json.dumps({k: ', '.join(v) if isinstance(v, list) else v for k, v in obj.items() if k != "ID"})

    # Prepare data for upsert
    index.upsert(
        vectors=[
            (id_value, raw_data, metadata_field),
        ]
    )
    print(f"Finish Stashing {obj['ID']}, done {current_index} of {len(json_array)} ")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Finish Stashing 204387c0-3dc3-98ec-e063-6394a90ace04, done 24021 of 49795 
Finish Stashing 2047cee4-f8da-9a38-e063-6394a90ac921, done 24022 of 49795 
Finish Stashing addd1856-65cc-46da-a49f-f0b5a99ec5ca, done 24023 of 49795 
Finish Stashing c732aeb0-c867-9e33-e053-2a95a90a129e, done 24024 of 49795 
Finish Stashing 0d98477b-aa34-ebb7-e063-6294a90a597f, done 24025 of 49795 
Finish Stashing 1bc7f2f9-1698-53c7-e063-6394a90a61dc, done 24026 of 49795 
Finish Stashing 59af2008-8ecb-468c-8127-5656cda70c1f, done 24027 of 49795 
Finish Stashing e3ae7662-8e5e-430f-b07d-5c83826fde04, done 24028 of 49795 
Finish Stashing 1e85b0de-e071-00b1-e063-6294a90ace4f, done 24029 of 49795 
Finish Stashing 1f7f09d6-4768-3535-e063-6394a90a8b3b, done 24030 of 49795 
Finish Stashing 202ffb02-d83f-8658-e063-6394a90a2ccc, done 24031 of 49795 
Finish Stashing 3feaf33d-3922-44f5-a298-b7e35c545a7e, done 24032 of 49795 
Finish Stashing 472b0b58-519d-4d12-

In [None]:
# Optional
# get the array starting point

import json

json_file_path = '/content/drive/MyDrive/test-extration/curated_data.json'

# Open and read the JSON file
with open(json_file_path, 'r') as file:
    json_array = json.load(file)

target_id="5de4a406-8d7d-499f-ac4b-4ea088a9680b"

def find_index_by_id(json_array, target_id):
    for index, obj in enumerate(json_array):
        if obj.get('ID') == target_id:
            return index
    return -1  # Return -1 if id not found

index_position = find_index_by_id(json_array, target_id)
print(f"Index position of object with id {target_id}: {index_position}")




Index position of object with id 5de4a406-8d7d-499f-ac4b-4ea088a9680b: 20774


https://dailymed.nlm.nih.gov/dailymed/fda/fdaDrugXsl.cfm?setid=5453cd31-e70b-4f6c-81ce-206b960a54bc

## Section 2: Semantic Router


This is used to solve the attention issue associated with small context model, by detecting if user has leave the current "subtopic", we can decide whether or not to regenerate the context

In [2]:
%pip install -qU semantic-router

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.4/83.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.8/207.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.9/362.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from semantic_router import Route

# use this as a guard rail, we will only allow these two types of conversation
medical = Route(
    name="medical",
    utterances=[
        "Is this effective on fever?",
        "Is Tylenol safe for pregnant women?"
        "When did FDA approve this pill?",
        "What is the chemical composition of this drug?",
        "Is Triptans an OTC medicine?",
        "Are there any safety incidents related to this medication?"
        "What is the toxicity of this tablet?",
        "Do you recommend three dose a day?",
        "Is it legal to buy this drug in Ohio?"
    ],
)

# regular conversation
chitchat = Route(
    name="chitchat",
    utterances=[
        "how's the weather today?",
        "how are things going?",
        "Good Morning",
        "Done anything fun last week?",
        "I hope you have a great day",
        "Have you seen that movie recently?"
    ],
)

# medicine

# we place both of our decisions together into single list
routes = [medical, chitchat]

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


In [5]:
import os
from semantic_router.encoders import CohereEncoder, OpenAIEncoder
from google.colab import userdata



# for Cohere
# os.environ["COHERE_API_KEY"] = "<YOUR_API_KEY>"
# encoder = CohereEncoder()

# or for OpenAI
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")
encoder = OpenAIEncoder()


In [6]:
from semantic_router.layer import RouteLayer

rl = RouteLayer(encoder=encoder, routes=routes)

In [14]:
# @title Default title text
chathistory = [
    # Unrelated (Normal - Notify)
    "Marvel doesn't produce good movie these day right?",
    "Just checking on you",
    "Is 'Dosing with life' a good YouTube channel?",
    "This soup taste good!",
    # Normal
    "What would happen to me if I take 3 instead of 1 every day?"
    "Dude drug his dog. What a scum!",
    "His pen has surgical precision",
    "A doctor slice up his belly",
    "Should I go to hospital?",
    # Warn (Switch Topic)
    "How strong is this drug?",
    "Morphine has some strange effect on me",
    ""
    # Disengage (Stop)
    "Fentanyl and Heroin, which one can get me high",
    "I want to kill myself",
    "I want to stab you!",
    "You son of bitch",
    "Stupid mother fucker!"
]
for i in chathistory:
  topic = rl(i).name
  print(i+" => ")
  print(topic)
#  rl("saw some be").name
# print(topic)

chitchat
chitchat
medical
medical
None
None
None
None
medical
None
None
None
None


## Use semantic router to check if topic Switch from last

1. First prepare a set of data as sample, cover wide range of medical/medicine related discussion


*   Can ask LLM to generate
*   Output into an array



2. Pass *first user prompt* into the layer, measure the `similarity_score`
3. Keep passing, and monitor the `z-score` / `Rolling Standard Deviation`
4. Is there a more effcient way? Maybe try the built-in `conversation` in `semantic-router` as well? Need to assess both options

### Use SR to Create Tool Function Calls for Agents

1. OutSource => list Websites, use Google API to fetch
2. Meditate => Summon a Widget to
3. Med Stock => Check the Stock price of popular medical company
4. Graph => Create Relationship Diagram