In [25]:
!pip install dlt beautifulsoup4
!pip install dlt[lancedb]
!pip install sentence-transformers
!pip install lancedb
!pip install openai==1.39.0
!pip install langchain
!pip install tantivy



In [26]:
!dlt --non-interactive init rest_api lancedb

Looking up the init scripts in [1mhttps://github.com/dlt-hub/verified-sources.git[0m...
No files to update, exiting


In [27]:
import os

os.environ["DESTINATION__LANCEDB__EMBEDDING_MODEL_PROVIDER"] = "sentence-transformers"
os.environ["DESTINATION__LANCEDB__EMBEDDING_MODEL"] = "all-MiniLM-L6-v2"
os.environ["DESTINATION__LANCEDB__CREDENTIALS__URI"] = ".lancedb"

In [28]:
import dlt
import re

import time
from dlt.sources.helpers.rest_client.paginators import BasePaginator
from requests import Request, Response
from typing import List, Optional, Any
from rest_api import rest_api_source
from dlt.destinations.adapters import lancedb_adapter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from bs4 import BeautifulSoup


BASE_URL = "https://dan.org/wp-json/wp/v2/"
PER_PAGE = 100  # Number of posts or pages per request
START_DATE = "2000-01-01T00:00:00"


# Custom paginator for handling WordPress pagination
class WordPressPaginator(BasePaginator):
    def __init__(self, start_page: int = 1, per_page: int = PER_PAGE):
        self.current_page = start_page
        self.per_page = per_page

    def update_request(self, request: Request) -> None:
        """Updates the request with the current page."""
        if request.params is None:
            request.params = {}
        request.params["page"] = self.current_page
        request.params["per_page"] = self.per_page

    def update_state(
        self, response: Response, data: Optional[List[Any]] = None
    ) -> None:
        """Updates the state to stop pagination if no more data is returned or fewer posts than per_page are returned."""

        if not data or len(data) < self.per_page or response.status_code == 400:
            self._has_next_page = False
        else:
            self.current_page += 1
            self._has_next_page = True


def remove_html_tags(text):
    """Remove HTML tags, JavaScript, and extra spaces from a string."""
    soup = BeautifulSoup(text, "html.parser")

    # Remove all script and iframe tags and their content
    for script in soup(["script", "iframe"]):
        script.extract()

    cleaned_text = soup.get_text(separator=" ")

    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

    return cleaned_text


def chunk_text(text):

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)

    return text_splitter.split_text(text)

In [29]:
def wordpress_rest_api_source():
    return rest_api_source(
        {
            "client": {
                "base_url": BASE_URL,
                "paginator": WordPressPaginator(start_page=1),
            },
            "resource_defaults": {
                "primary_key": "id",
                "write_disposition": "merge",
                "endpoint": {
                    "params": {
                        "per_page": PER_PAGE,
                    },
                },
            },
            "resources": [
                {
                    "name": "dan_health_resources",
                    "endpoint": {
                        "path": "dan_health_resources",
                        "params": {
                            "modified_after": {
                                "type": "incremental",
                                "cursor_path": "modified",
                                "initial_value": START_DATE,
                            },
                        },
                    },
                },
                {
                    "name": "dan_alert_diver",
                    "endpoint": {
                        "path": "dan_alert_diver",
                        "params": {
                            "modified_after": {
                                "type": "incremental",
                                "cursor_path": "modified",
                                "initial_value": START_DATE,
                            },
                        },
                    },
                },
                {
                    "name": "dan_diving_incidents",
                    "endpoint": {
                        "path": "dan_diving_incidents",
                        "params": {
                            "modified_after": {
                                "type": "incremental",
                                "cursor_path": "modified",
                                "initial_value": START_DATE,
                            },
                        },
                    },
                },
                {
                    "name": "dan_diseases_conds",
                    "endpoint": {
                        "path": "dan_diseases_conds",
                        "params": {
                            "modified_after": {
                                "type": "incremental",
                                "cursor_path": "modified",
                                "initial_value": START_DATE,
                            },
                        },
                    },
                },
            ],
        }
    )


@dlt.transformer()
def dan_articles(article):
    clean_content = remove_html_tags(article["content"]["rendered"])
    for chunk in chunk_text(clean_content):
        yield chunk


pipeline = dlt.pipeline(
    pipeline_name="dan_articles",
    destination="lancedb",
    dataset_name="dan_articles",
)

start_time = time.time()

data = wordpress_rest_api_source() | dan_articles


load_info = pipeline.run(
    lancedb_adapter(data, embed="value"), table_name="texts", write_disposition="merge"
)

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Data loaded in {elapsed_time} seconds")

Data loaded in 34.22511267662048 seconds


In [30]:
import lancedb

db = lancedb.connect(".lancedb")
db.table_names()

['dan_articles____dlt_loads',
 'dan_articles____dlt_pipeline_state',
 'dan_articles____dlt_version',
 'dan_articles___dltSentinelTable',
 'dan_articles___texts']

In [31]:
dbtable = db.open_table("dan_articles___texts")
dbtable.create_fts_index("value", replace=True)
dbtable.to_pandas()

Unnamed: 0,id__,vector__,value,_dlt_load_id,_dlt_id
0,2a383174-8860-5260-9cdf-aff879e95d72,"[-0.023741096, 0.05163744, -0.04189035, -0.020...",1 Maintain your equipment regularly and inspec...,1726210247.5344496,+oGwZ/XREU1uGw
1,f21c1fa5-9378-5b93-8ee7-923214c7e7fe,"[-0.025761453, 0.01984438, 0.0012965652, 0.009...","As divers, we’re all familiar with the demands...",1726210247.5344496,+C1CctUcE5+H1A
2,364aaf9f-79e4-50cd-8cad-735b35854ce7,"[-0.03556345, -0.014644477, 0.020178335, 0.018...",Health Considerations Most divers are accustom...,1726210247.5344496,YBZbUQj4Vi3F5Q
3,11a1e191-fed3-5554-b3f2-44141dd798af,"[0.03319513, 0.02734095, 0.035184115, 0.015268...",Domestic Travel One of the benefits of domesti...,1726210247.5344496,1HpJKogh0CNByQ
4,bd8f3f4a-c1c3-531a-b7c1-bfb7341fa1c0,"[0.06936087, -0.007262261, 0.060164157, 0.0512...",often results from consuming improperly handle...,1726210247.5344496,eCqBkoqftcA6og
...,...,...,...,...,...
7494,e96355ec-2845-501e-8902-1301f3c14403,"[0.025591524, 0.056002416, 0.06820748, 0.06931...",nimble and proficient in the field. I can spen...,1726210247.5344496,rz3af1POCs5Nvw
7495,71380744-6574-5d71-a797-e2d421db1c34,"[-0.007828973, 0.046590574, -0.008417802, 0.07...",Samui chamber with bad news: There would be no...,1726210247.5344496,T0m+j8hSvN5Qzw
7496,5b844257-c34b-5a06-bdb5-f527b4eb29db,"[0.04416123, 0.07781163, 0.076836646, 0.054680...",time. I really thought I might die. I kept rol...,1726210247.5344496,6BjBGgyZ5Qa79A
7497,0ceec68f-7f41-578e-b989-260666e5a0cb,"[0.006051706, 0.037317082, 0.028205885, 0.0201...","for the island. As we neared shore, I could se...",1726210247.5344496,P8ClPjDXrGGMTA


In [32]:
query = "dive without air"
dbtable.search(query, query_type="hybrid").limit(100).to_pandas()

Unnamed: 0,id__,vector__,value,_dlt_load_id,_dlt_id,_relevance_score
0,b51bf15f-4e00-5d33-8a2b-ad7bcf972f19,"[-0.009525489, 0.023471512, 0.043379415, 0.022...",and you’re a dive instructor!” I was shocked a...,1726210247.5344496,9RDOOpU0QKAXfw,0.027972
1,77dac802-71bc-560a-a768-516b88186f8f,"[-0.031076763, 0.07859178, -0.038904432, 0.025...",A diver with a partially closed tank valve has...,1726210247.5344496,YXiMtVKHZYx+wA,0.026133
2,f5db4a9a-509e-5788-bc4b-555971714239,"[-0.021555182, 0.027672919, 0.012471238, 0.059...",a component of that system may not have been t...,1726210247.5344496,qRVgDn+niyXssQ,0.023528
3,9e0582f3-a3de-53f2-8c86-089a25089847,"[-0.053640787, 0.0947817, 0.012512966, 0.08071...",Background: Quarry dive in 70°F weather. Diver...,1726210247.5344496,7FBHRGrt5rSRlw,0.023128
4,38157c03-9bc7-5462-9ff5-be554368ba81,"[-0.025782524, 0.05031114, -0.0055469284, -0.0...",DAN’s Smart Guide to Air Consumption Our self-...,1726210247.5344496,J+XAJubYdh8J1w,0.022792
...,...,...,...,...,...,...
95,3ffc76c0-d3ec-521e-a3a0-aa30d55999b4,"[-0.023056429, 0.03153731, 0.014966585, -0.014...",must understand and limit predive hyperventila...,1726210247.5344496,BrnkOofJ3PJZgg,0.009009
96,1b34d82f-559f-5813-a676-63b0f956a54e,"[0.0055266814, -0.033376668, 0.0062861308, 0.0...",for an average of 5.3 years. In that period th...,1726210247.5344496,nNUBuMvSZmQKmA,0.009009
97,ddddf619-274e-517d-ada4-7fcd241266c3,"[0.0545402, 0.0031977699, 0.05037774, 0.015857...",arms and legs simultaneously. Note: Strenuous ...,1726210247.5344496,UBVaR3hl9kgxAg,0.008929
98,b2a04edc-86ce-5b44-929b-191af4c5235f,"[-0.022918839, -0.015304018, 0.06793232, 0.080...",A 49-year-old female diver called the DAN ® Em...,1726210247.5344496,Wl+T3o5N+2iCxA,0.008929


In [33]:
def retrieve_context_from_lancedb(dbtable, question, top_k=10):

    query_results = dbtable.search(query, query_type="hybrid").to_pandas()
    results = query_results.sort_values("_relevance_score", ascending=True).nlargest(
        top_k, "_relevance_score"
    )
    context = "\n".join(results["value"])

    return context

In [34]:
db = lancedb.connect(".lancedb")
dbtable = db.open_table("dan_articles___texts")

In [35]:
import openai
import lancedb  # Assuming LanceDB is already imported
from openai import OpenAI
from google.colab import userdata

client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))


db = lancedb.connect(".lancedb")
dbtable = db.open_table("dan_articles___texts")


question = "Max depth of 24 meters with sac rate 23, there are 4 instances when ascend exceeded 10m per minute, min NDL reached is 2 minutes, water temperature is 31, depth variance is more than 10"

context = retrieve_context_from_lancedb(dbtable, question)
print("Context retrieved:\n", context)


messages = [
    {
        "role": "system",
        "content": (
            "You are a scuba diving safety expert specializing in incident analysis. "
            "You have access to a database of real dive incidents and guides from DAN (Divers Alert Network)."
            "Use the following pieces of contextual information to answer the user query:"
            f"{context}"
        ),
    },
    {
        "role": "system",
        "content": (
            "When provided with dive data, your goal is to analyze the dive, check for mistakes made, and provide practical advice on how to improve. "
            "Whenever possible, reference incidents and tips from the database and explain how the diver can avoid similar mistakes in the future. "
            "Your response should be short and to the point, providing clear actionable advice. You must focus on the provided dive, but mention facts from the database using quotes when able."
        ),
    },
]


# Create a user prompt using the retrieved context and dive data
messages.append({"role": "user", "content": f"Dive data: '{question}'."})

# # Get the response from ChatGPT
# response = client.chat.completions.create(
#     model="gpt-4",
#     messages=messages
# )
# print("----------")
# print(response.choices[0].message.content)

Context retrieved:
 out in front of you and squeezed together. Slide your buttocks forward to create more space between it and your hands. Bend your elbows so they point behind you. Make sure your fingers are still pointed forward. Exhale, and press your palms to the board as you lift your hips. Lift your hips up until your back and thighs are off the board. Keep your legs together and straight, and press the bottoms of your feet into the board. Look toward the sky and hold for 30 seconds. Repeat five times. Tip: Your shoulders should be directly over your wrists. NOTE: To avoid an increased risk of decompression sickness, DAN ® recommends that divers avoid strenuous exercise for 24 hours after making a dive. During your annual physical exam or following any changes in your health status, consult your physician to ensure you have medical clearance to dive. © Alert Diver — Q2 2018
checks. Slow down! Conduct equipment checks, and follow your dive plan. Those teeth have been there for mil

In [24]:
import openai
import lancedb
import os
from google.colab import userdata

import re


def extract_score(text):
    match = re.search(r"\s*(\d+)", text)
    if match:
        return float(match.group(1))
    else:
        return None


MODEL = "gpt-3.5-turbo"

client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))


def retrieve_context_from_lancedb(dbtable, question, prompt):
    """Retrieve context and mean relevance score based on the provided prompt."""
    query_results = dbtable.search(question, query_type="hybrid").to_pandas()
    results = query_results.sort_values("_relevance_score", ascending=True).nlargest(
        10, "_relevance_score"
    )
    context = "\n".join(results["value"])
    mean_relevance_score = results["_relevance_score"].max()
    return context, mean_relevance_score


def create_text_report(report):
    """Convert dive report data into text."""
    return (
        f"Average depth {report['Average Depth']} meters, "
        f"Maximum depth {report['Maximum Depth']} meters, "
        f"Depth variability {report['Depth Variability']} meters, "
        f"SAC rate {report['SAC Rate']}, "
        f"High Speed Ascend instances {report['High Ascend Speed Count']}, "
        f"Max Ascend Speed {report['Max Ascend Speed']} meters per min, "
        f"Minimal NDL {report['Minimal NDL']} minutes."
    )


def select_best_context_prompt(dbtable, report):
    """Craft and evaluate multiple context retrieval prompts to select the best one."""
    question = create_text_report(report)

    prompts = [
        # "Retrieve relevant information for the following dive scenario: {question}",
        "Given the dive details: {question}, provide related incidents that match any of mentioned metrics.",
        "Find relevant incidents and tips related to the dive described: {question}.",
        "Search the DAN incident database for relevant data on this dive: {question}.",
    ]

    best_prompt = None
    best_max_relevance = -float("inf")

    for prompt in prompts:
        full_prompt = prompt.replace("{question}", question)
        context, mean_relevance_score = retrieve_context_from_lancedb(
            dbtable, question, full_prompt
        )

        if mean_relevance_score > best_max_relevance:
            best_max_relevance = mean_relevance_score
            best_prompt = prompt

    return best_prompt


def evaluate_prompt_with_llm_judge(context, response):
    """Evaluate a prompt using faithfulness based on the provided context."""
    # Faithfulness scoring: how factually consistent the response is with the context (ignoring the input)
    faithfulness_prompt = (
        f"How factually consistent is the following response with the provided context? "
        f"Context: {context}\nResponse: {response}. Score faithfulness based on how much of the output can be directly inferred from the context, ignoring the input. "
        f"Use the following rubric: 1 (no claims inferred), 2 (some claims inferred but mostly inconsistent), 3 (half or more inferred), "
        f"4 (most inferred with little unsupported info), 5 (all claims directly supported)."
    )

    faithfulness_score = extract_score(
        client.chat.completions.create(
            model=MODEL, messages=[{"role": "user", "content": faithfulness_prompt}]
        )
        .choices[0]
        .message.content
    )

    return faithfulness_score


def select_best_full_prompt(context, report):
    """Combine context and question into multiple full prompts and select the best one based on faithfulness and mean relevance from LanceDB."""
    question = create_text_report(report)

    prompts = [
        "Analyze the following dive data and provide advice. Context: {context}. Dive data: {question}.",
        "Using the provided context: {context}, evaluate the dive data: {question} and offer practical advice.",
        "With the context: {context}, analyze the dive data: {question} and give recommendations.",
        "Given the context from DAN: {context}, and the dive data: {question}, what should be improved?",
    ]

    best_prompt = None
    best_overall_score = -float("inf")

    for prompt in prompts:
        full_question_to_llm = prompt.replace("{context}", context).replace(
            "{question}", question
        )
        response = (
            client.chat.completions.create(
                model=MODEL,
                messages=[{"role": "user", "content": full_question_to_llm}],
            )
            .choices[0]
            .message.content
        )
        faithfulness_score = evaluate_prompt_with_llm_judge(context, response)

        if faithfulness_score > best_overall_score:
            best_overall_score = faithfulness_score
            best_prompt = prompt

    return best_prompt, best_overall_score


db = lancedb.connect(".lancedb")
dbtable = db.open_table("dan_articles___texts")

test_report = {
    "Dive Number": 1,
    "Average Depth": 15.8,
    "Maximum Depth": 20,
    "Depth Variability": 7.4,
    "Average Pressure": 76.9,
    "Maximum Pressure": 209,
    "Pressure Variability": 10,
    "Minimal NDL": 14,
    "SAC Rate": 15,
    "Max Ascend Speed": 13,
    "High Ascend Speed Count": 1,
    "Rating": 1,
}

# Step 1: Select the best context retrieval prompt
best_context_prompt = select_best_context_prompt(dbtable, test_report)
context, best_max_relevance = retrieve_context_from_lancedb(
    dbtable, create_text_report(test_report), best_context_prompt
)
print("Best max relevance", best_max_relevance)
print("Best Context Prompt: ", best_context_prompt)

# # Step 2: Select the best full prompt (context + question)
best_full_prompt, best_overall_score = select_best_full_prompt(context, test_report)
print("Best fathfullness score", best_overall_score)
print("Best Full Prompt: ", best_full_prompt)

Best max relevance 0.03125
Best Context Prompt:  Given the dive details: {question}, provide related incidents that match any of mentioned metrics.
Best fathfullness score 5.0
Best Full Prompt:  Analyze the following dive data and provide advice. Context: {context}. Dive data: {question}.
