In [1]:
!pip install dlt beautifulsoup4
!pip install dlt[lancedb]
!pip install sentence-transformers
!pip install lancedb
!pip install openai==1.39.0
!pip install langchain
!pip install tantivy

Collecting dlt
  Downloading dlt-0.5.4-py3-none-any.whl.metadata (10 kB)
Collecting gitpython>=3.1.29 (from dlt)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting giturlparse>=0.10.0 (from dlt)
  Downloading giturlparse-0.12.0-py2.py3-none-any.whl.metadata (4.5 kB)
Collecting hexbytes>=0.2.2 (from dlt)
  Downloading hexbytes-1.2.1-py3-none-any.whl.metadata (3.7 kB)
Collecting jsonpath-ng>=1.5.3 (from dlt)
  Downloading jsonpath_ng-1.6.1-py3-none-any.whl.metadata (18 kB)
Collecting makefun>=1.15.0 (from dlt)
  Downloading makefun-1.15.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting orjson!=3.10.1,!=3.9.11,!=3.9.12,!=3.9.13,!=3.9.14,<4,>=3.6.7 (from dlt)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m760.5 kB/s[0m eta [36m0:00:00[0m
Collecting pathvalidate>=2.5.2 (from dlt)
  Downloading pathvalidate-3.2.1-py3-none

In [50]:
!dlt --non-interactive init rest_api lancedb

Looking up the init scripts in [1mhttps://github.com/dlt-hub/verified-sources.git[0m...
No files to update, exiting


In [51]:
import os

os.environ["DESTINATION__LANCEDB__EMBEDDING_MODEL_PROVIDER"] = "sentence-transformers"
os.environ["DESTINATION__LANCEDB__EMBEDDING_MODEL"] = "all-MiniLM-L6-v2"
os.environ["DESTINATION__LANCEDB__CREDENTIALS__URI"] = ".lancedb"

In [52]:
import dlt
import re

import time
from dlt.sources.helpers.rest_client.paginators import BasePaginator
from requests import Request, Response
from typing import List, Optional, Any
from rest_api import rest_api_source
from dlt.destinations.adapters import lancedb_adapter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from bs4 import BeautifulSoup


BASE_URL = "https://dan.org/wp-json/wp/v2/"
PER_PAGE = 100  # Number of posts or pages per request
START_DATE = "2000-01-01T00:00:00"


# Custom paginator for handling WordPress pagination
class WordPressPaginator(BasePaginator):
    def __init__(self, start_page: int = 1, per_page: int = PER_PAGE):
        self.current_page = start_page
        self.per_page = per_page

    def update_request(self, request: Request) -> None:
        """Updates the request with the current page."""
        if request.params is None:
            request.params = {}
        request.params["page"] = self.current_page
        request.params["per_page"] = self.per_page

    def update_state(
        self, response: Response, data: Optional[List[Any]] = None
    ) -> None:
        """Updates the state to stop pagination if no more data is returned or fewer posts than per_page are returned."""

        if not data or len(data) < self.per_page or response.status_code == 400:
            self._has_next_page = False
        else:
            self.current_page += 1
            self._has_next_page = True


def remove_html_tags(text):
    """Remove HTML tags, JavaScript, and extra spaces from a string."""
    soup = BeautifulSoup(text, "html.parser")

    # Remove all script and iframe tags and their content
    for script in soup(["script", "iframe"]):
        script.extract()

    cleaned_text = soup.get_text(separator=" ")

    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

    return cleaned_text


def chunk_text(text):

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)

    return text_splitter.split_text(text)

In [53]:
def wordpress_rest_api_source():
    return rest_api_source(
        {
            "client": {
                "base_url": BASE_URL,
                "paginator": WordPressPaginator(start_page=1),
            },
            "resource_defaults": {
                "primary_key": "id",
                "write_disposition": "merge",
                "endpoint": {
                    "params": {
                        "per_page": PER_PAGE,
                    },
                },
            },
            "resources": [
                {
                    "name": "dan_health_resources",
                    "endpoint": {
                        "path": "dan_health_resources",
                        "params": {
                            "modified_after": {
                                "type": "incremental",
                                "cursor_path": "modified",
                                "initial_value": START_DATE,
                            },
                        },
                    },
                },
                {
                    "name": "dan_alert_diver",
                    "endpoint": {
                        "path": "dan_alert_diver",
                        "params": {
                            "modified_after": {
                                "type": "incremental",
                                "cursor_path": "modified",
                                "initial_value": START_DATE,
                            },
                        },
                    },
                },
                {
                    "name": "dan_diving_incidents",
                    "endpoint": {
                        "path": "dan_diving_incidents",
                        "params": {
                            "modified_after": {
                                "type": "incremental",
                                "cursor_path": "modified",
                                "initial_value": START_DATE,
                            },
                        },
                    },
                },
                {
                    "name": "dan_diseases_conds",
                    "endpoint": {
                        "path": "dan_diseases_conds",
                        "params": {
                            "modified_after": {
                                "type": "incremental",
                                "cursor_path": "modified",
                                "initial_value": START_DATE,
                            },
                        },
                    },
                },
            ],
        }
    )


@dlt.transformer()
def dan_articles(article):
    clean_content = remove_html_tags(article["content"]["rendered"])
    for chunk in chunk_text(clean_content):
        yield chunk


pipeline = dlt.pipeline(
    pipeline_name="dan_articles",
    destination="lancedb",
    dataset_name="dan_articles",
)

start_time = time.time()

data = wordpress_rest_api_source() | dan_articles


load_info = pipeline.run(
    lancedb_adapter(data, embed="value"), table_name="texts", write_disposition="merge"
)

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Data loaded in {elapsed_time} seconds")

Data loaded in 6.011815786361694 seconds


In [54]:
import lancedb

db = lancedb.connect(".lancedb")
db.table_names()

['dan_articles____dlt_loads',
 'dan_articles____dlt_pipeline_state',
 'dan_articles____dlt_version',
 'dan_articles___dltSentinelTable',
 'dan_articles___texts']

In [55]:
dbtable = db.open_table("dan_articles___texts")
dbtable.create_fts_index("value", replace=True)
dbtable.to_pandas()

Unnamed: 0,id__,vector__,value,_dlt_load_id,_dlt_id
0,e4e1df62-1b29-5fbf-9a72-a803b16f61aa,"[-0.023741106, 0.05163743, -0.041890323, -0.02...",1 Maintain your equipment regularly and inspec...,1726060651.0296018,vkt7U+xHb0O9Ng
1,5e166b32-d4a2-56d2-9479-d107a10ed030,"[-0.025761394, 0.019844366, 0.0012965753, 0.00...","As divers, we’re all familiar with the demands...",1726060651.0296018,VCqirkfoRhZ8yw
2,6004f22a-b364-5556-b1ec-8b15736f2f4f,"[-0.03556344, -0.014644464, 0.020178348, 0.018...",Health Considerations Most divers are accustom...,1726060651.0296018,ninFu1DpggovHQ
3,b9a4e63a-2231-51d2-a2f8-0bc15fb8e98f,"[0.03319515, 0.027340956, 0.035184104, 0.01526...",Domestic Travel One of the benefits of domesti...,1726060651.0296018,95Kl4wI5bA2Wsw
4,c2732b21-8671-5a2b-a940-9541f71e5f6c,"[0.06936087, -0.0072622653, 0.060164157, 0.051...",often results from consuming improperly handle...,1726060651.0296018,GKunFS3d/uPaew
...,...,...,...,...,...
7494,20119898-7ad5-50cd-aa7c-460bcb3c4e1a,"[0.025591485, 0.05600243, 0.06820746, 0.069318...",nimble and proficient in the field. I can spen...,1726060651.0296018,DXGo6SCZDdVUaw
7495,954d2cdd-1bda-5659-8851-497a438cc93b,"[-0.007828981, 0.04659055, -0.008417785, 0.076...",Samui chamber with bad news: There would be no...,1726060651.0296018,rO9N+eHfSxGYKA
7496,182f3c66-9599-515e-816a-e4f2baeeb98c,"[0.04416123, 0.077811606, 0.07683663, 0.054680...",time. I really thought I might die. I kept rol...,1726060651.0296018,mgEVVWxo3XrkLg
7497,96079b41-72e0-5c34-83f5-a3d588c77055,"[0.0060517187, 0.03731707, 0.028205901, 0.0201...","for the island. As we neared shore, I could se...",1726060651.0296018,1I7SevLbXAJfDg


In [56]:
query = "dive without air"
dbtable.search(query, query_type="hybrid").limit(100).to_pandas()

Unnamed: 0,id__,vector__,value,_dlt_load_id,_dlt_id,_relevance_score
0,0ca4bd76-dd68-590c-8bee-bf6b8861590a,"[-0.009525486, 0.023471527, 0.043379433, 0.022...",and you’re a dive instructor!” I was shocked a...,1726060651.0296018,LH/a+w8I85QiQA,0.027972
1,810d7233-adfa-5952-b55e-968302c639bd,"[-0.031076739, 0.07859176, -0.038904406, 0.025...",A diver with a partially closed tank valve has...,1726060651.0296018,Z7zfSUeLqBLE2g,0.026133
2,5ab00145-b687-5ea1-91be-f592a51ed4c1,"[-0.021555187, 0.02767289, 0.012471225, 0.0595...",a component of that system may not have been t...,1726060651.0296018,SdxV7GrlirIlSQ,0.023528
3,379e4237-e4c0-5878-89c5-02b8f9ecc91e,"[-0.05364082, 0.094781704, 0.012512973, 0.0807...",Background: Quarry dive in 70°F weather. Diver...,1726060651.0296018,SrSumU+NxNqteg,0.023128
4,7a8741d5-8f08-5c35-847a-fb6c11bb47b9,"[-0.02578254, 0.05031116, -0.005546921, -0.005...",DAN’s Smart Guide to Air Consumption Our self-...,1726060651.0296018,lJ98Ur+ZsjXSzg,0.022792
...,...,...,...,...,...,...
95,473bf7e0-e040-56d2-aca1-949eed999d0d,"[-0.02305643, 0.031537313, 0.014966587, -0.014...",must understand and limit predive hyperventila...,1726060651.0296018,Vyu0ymJStK8pwQ,0.009009
96,db2ca6d4-5558-5d0f-a7e3-bf01148dd8dd,"[0.005526683, -0.033376686, 0.0062861093, 0.01...",for an average of 5.3 years. In that period th...,1726060651.0296018,u24Vz3VEw2dh/A,0.009009
97,91b1a27b-5c7e-5bf7-93c5-ebc02f44b08e,"[0.05454018, 0.003197734, 0.050377697, 0.01585...",arms and legs simultaneously. Note: Strenuous ...,1726060651.0296018,P0O9LUdyGbr+GQ,0.008929
98,f6aebe99-4cde-500c-9235-f68d8f558bf9,"[-0.022918835, -0.015303991, 0.067932345, 0.08...",A 49-year-old female diver called the DAN ® Em...,1726060651.0296018,/5sNReCIXBfYCQ,0.008929


In [59]:
def retrieve_context_from_lancedb(dbtable, question, top_k=10):

    query_results = dbtable.search(query, query_type="hybrid").to_pandas()
    results = query_results.sort_values("_relevance_score", ascending=True).nlargest(
        top_k, "_relevance_score"
    )
    context = "\n".join(results["value"])

    return context

In [None]:
db = lancedb.connect(".lancedb")
dbtable = db.open_table("dan_articles___texts")

In [60]:
import openai
import lancedb  # Assuming LanceDB is already imported
from openai import OpenAI
from google.colab import userdata

client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))


db = lancedb.connect(".lancedb")
dbtable = db.open_table("dan_articles___texts")


question = "Max depth of 24 meters with sac rate 23, there are 4 instances when ascend exceeded 10m per minute, min NDL reached is 2 minutes, water temperature is 31, depth variance is more than 10"

context = retrieve_context_from_lancedb(dbtable, question)
print("Context retrieved:\n", context)


messages = [
    {
        "role": "system",
        "content": (
            "You are a scuba diving safety expert specializing in incident analysis. "
            "You have access to a database of real dive incidents and guides from DAN (Divers Alert Network)."
            "Use the following pieces of contextual information to answer the user query:"
            f"{context}"
        ),
    },
    {
        "role": "system",
        "content": (
            "When provided with dive data, your goal is to analyze the dive, check for mistakes made, and provide practical advice on how to improve. "
            "Whenever possible, reference incidents and tips from the database and explain how the diver can avoid similar mistakes in the future. "
            "Your response should be short and to the point, providing clear actionable advice. You must focus on the provided dive, but mention facts from the database using quotes when able."
        ),
    },
]


# Create a user prompt using the retrieved context and dive data
messages.append({"role": "user", "content": f"Dive data: '{question}'."})

# Get the response from ChatGPT
response = client.chat.completions.create(model="gpt-4", messages=messages)
print("----------")
print(response.choices[0].message.content)

Context retrieved:
 out in front of you and squeezed together. Slide your buttocks forward to create more space between it and your hands. Bend your elbows so they point behind you. Make sure your fingers are still pointed forward. Exhale, and press your palms to the board as you lift your hips. Lift your hips up until your back and thighs are off the board. Keep your legs together and straight, and press the bottoms of your feet into the board. Look toward the sky and hold for 30 seconds. Repeat five times. Tip: Your shoulders should be directly over your wrists. NOTE: To avoid an increased risk of decompression sickness, DAN ® recommends that divers avoid strenuous exercise for 24 hours after making a dive. During your annual physical exam or following any changes in your health status, consult your physician to ensure you have medical clearance to dive. © Alert Diver — Q2 2018
checks. Slow down! Conduct equipment checks, and follow your dive plan. Those teeth have been there for mil