In [2]:
import base64
import json
import os
from urllib.parse import quote

import requests

In [3]:
def elastic_request(data=None, method=None, url=None, base_url="https://192.168.1.153:9200"):
    """Send a JSON request to the Elasticsearch cluster and return the response.

    Args:
        data: Request body. Any truthy value is serialized with ``json.dumps``;
            falsy values (``None``, ``{}``) are passed through unchanged, so no
            JSON body is sent for them.
        method: Callable with the ``requests.get`` / ``requests.put`` calling
            convention. Defaults to ``requests.get``.
        url: Path relative to the cluster root, e.g. ``"my-index/_search"``.
            Leading slashes are ignored and ``None`` targets the cluster root.
        base_url: Scheme, host and port of the cluster.

    Returns:
        Whatever ``method`` returns (a ``requests.Response`` for the
        ``requests`` verbs).
    """
    if method is None:
        method = requests.get
    body = json.dumps(data) if data else data

    # Normalise the join so "/_x" and "_x" both produce exactly one slash and a
    # missing path does not become the literal string "None".
    path = (url or "").lstrip("/")

    return method(f"{base_url.rstrip('/')}/{path}",
                  headers={
                    "Content-Type": "application/json",
                    "Accept": "application/json",
                    # The API key is read from the environment on every call.
                    "Authorization": f"ApiKey {os.getenv('ELASTIC_API_KEY')}"
                  },
                  # NOTE(review): TLS certificate verification is disabled here;
                  # confirm this is intended for the target cluster.
                  verify=False,
                  data=body)

In [None]:
# Register the "open-ai-embeddings" text-embedding inference endpoint, backed by
# the OpenAI service. The clean_and_embed pipeline and the kNN query in this
# notebook refer to it by that id.
inf_endpt = elastic_request(
    # No leading slash: elastic_request already inserts the "/" between the
    # host and the path, so "/_inference/..." would yield a "//" path.
    url="_inference/text_embedding/open-ai-embeddings",
    method=requests.put,
    data={
        "service": "openai",
        "service_settings": {
            # Read from the environment so the key is never stored in the notebook.
            "api_key": os.getenv("OPENAI_API_KEY"),
            "model_id": "text-embedding-3-small"
        }
    })
inf_endpt.json()

In [None]:
# Create the "attach_and_chunk" ingest pipeline:
#   1. `attachment` extracts text from the base64 payload in `data`
#      (no character limit, binary dropped afterwards).
#   2. `script` splits the extracted text into sentences and greedily packs them
#      into `passages` entries while the combined length stays below
#      `params.passage_size`.
load_docs = elastic_request(url="_ingest/pipeline/attach_and_chunk", method=requests.put, data={
  "description" : "Extract attachment and separate into paragraphs",
  "processors" : [
    {
      "attachment": {
        "field": "data",
        "indexed_chars": -1,
        "remove_binary": True
      }
    },
    {
      "script": {
        "description": "Chunk each attachment into individual sentences. This will be a nested field",
        # Raw string: the Painless regex contains `\.`, `\!` and `\?`, which are
        # invalid escape sequences in a normal Python string literal. The text
        # sent to Elasticsearch is unchanged.
        # The regex splits on the space that follows ".", "!" or "?", except
        # after "Mr.", "Ms." and "Mrs.".
        "source": r"""
          String[] envSplit = /((?<!M(r|s|rs)\.)(?<=\.) |(?<=\!) |(?<=\?) )/.split(ctx['attachment']['content']);
          ctx['passages'] = new ArrayList();
          int i = 0;
          boolean remaining = true;
          if (envSplit.length == 0) {
            return
          } else if (envSplit.length == 1) {
            Map passage = ['text': envSplit[0]];
            ctx['passages'].add(passage)
          } else {
            while (remaining) {
              Map passage = ['text': envSplit[i++]];
              while (i < envSplit.length && passage.text.length() + envSplit[i].length() < params.passage_size) {passage.text = passage.text + ' ' + envSplit[i++]}
              if (i == envSplit.length) {remaining = false}
              ctx['passages'].add(passage)
            }
          }
          """,
        "params": {
          # Length threshold (in characters) used by the merge loop above.
          "passage_size": 600
        }
      }
    }
  ]
})
load_docs.json()

In [None]:
# Create the "clean_and_embed" ingest pipeline: strip URLs from `passage`, then
# embed the cleaned text with the "open-ai-embeddings" inference endpoint.
embed_each = elastic_request(url="_ingest/pipeline/clean_and_embed", method=requests.put, data={
    "description" : "Remove links and generate embeddings",
    # One processor per list entry, in execution order: the text must be cleaned
    # before it is embedded.
    "processors" : [
        {
            "gsub": {
                "field": "passage",
                # `https?` matches both http:// and https:// (the previous
                # `http?s` never matched plain http). The trailing whitespace is
                # optional so a URL at the very end of the passage is removed
                # too. Raw string keeps the backslashes intact for the regex.
                "pattern": r"https?://\S+\s?",
                "replacement": ""
            }
        },
        {
            "inference": {
                "model_id": "open-ai-embeddings",
                "input_output": {
                    "input_field": "passage",
                    "output_field": "passage-embedding"
                }
            }
        }
    ]
})
embed_each.json()

In [None]:
doc = "DISMISS\n\n12/21/24, 9:13 PM Adventuring - Player's Handbook (2014) - Dungeons & Dragons - Sources - D&D Beyond\n\nhttps://www.dndbeyond.com/sources/dnd/phb-2014/adventuring 1/11\n\nhttps://www.dndbeyond.com/sources/dnd/phb-2014/using-ability-scores\nhttps://www.dndbeyond.com/sources/dnd/phb-2014/\nhttps://www.dndbeyond.com/sources/dnd/phb-2014/combat\nhttps://www.dndbeyond.com/sources/phb/introduction#HowtoPlay\nhttps://marketplace.dndbeyond.com/category/DB0000156?&icid_medium=organic&icid_source=bluebanner&icid_campaign=2024_3p_&icid_content=2024_hype\nhttps://marketplace.dndbeyond.com/category/DB0000156?&icid_medium=organic&icid_source=bluebanner&icid_campaign=2024_3p_&icid_content=2024_hype\nhttps://marketplace.dndbeyond.com/category/DB0000156?&icid_medium=organic&icid_source=bluebanner&icid_campaign=2024_3p_&icid_content=2024_hype\nhttps://marketplace.dndbeyond.com/category/DB0000156?&icid_medium=organic&icid_source=bluebanner&icid_campaign=2024_3p_&icid_content=2024_hype\nhttps://marketplace.dndbeyond.com/category/DB0000156?&icid_medium=organic&icid_source=bluebanner&icid_campaign=2024_3p_&icid_content=2024_hype\nhttps://marketplace.dndbeyond.com/category/DB0000156?&icid_medium=organic&icid_source=bluebanner&icid_campaign=2024_3p_&icid_content=2024_hype\nhttps://marketplace.dndbeyond.com/category/DB0000156?&icid_medium=organic&icid_source=bluebanner&icid_campaign=2024_3p_&icid_content=2024_hype\nhttps://marketplace.dndbeyond.com/category/DB0000156?&icid_medium=organic&icid_source=bluebanner&icid_campaign=2024_3p_&icid_content=2024_hype\nhttps://marketplace.dndbeyond.com/category/DB0000156?&icid_medium=organic&icid_source=bluebanner&icid_campaign=2024_3p_&icid_content=2024_hype\nhttps://marketplace.dndbeyond.com/category/DB0000156?&icid_medium=organic&icid_source=bluebanner&icid_campaign=2024_3p_&icid_content=2024_hype\nhttps://marketplace.dndbeyond.com/category/DB0000156?&icid_medium=organic&icid_source=bluebanner&icid_campaign=2024_3p_&icid_content=202
4_hype\n\n\nTime\n\nIn situations where keeping track of the passage of time is important, the DM determines the time a task requires."
# Dry-run the clean_and_embed pipeline against the sample passage via the
# _simulate endpoint and show the result.
simulate_body = {"docs": [{"_source": {"passage": doc}}]}
test = elastic_request(
    method=requests.post,
    url="_ingest/pipeline/clean_and_embed/_simulate",
    data=simulate_body,
)
test.json()

In [45]:
# Read every regular file in the source directory and keep it as a
# (file name, base64 text) pair ready to be sent to the attachment processor.
dir_path = "/data/Player's Handbook 5e (2014)"
encoded_files = []
for entry in os.listdir(dir_path):
    candidate = os.path.join(dir_path, entry)

    # Skip sub-directories and anything else that is not a plain file.
    if not os.path.isfile(candidate):
        continue

    with open(candidate, 'rb') as handle:
        raw_bytes = handle.read()
    encoded_files.append((entry, base64.b64encode(raw_bytes).decode('utf-8')))

In [None]:
# Index each encoded file into "players-handbook" through the attach_and_chunk
# pipeline, using the file name as the document id.
failed_uploads = []
for filename, contents in encoded_files:
    # Percent-encode the file name so characters such as "#", "?", "%" or "/"
    # cannot change the meaning of the request URL.
    doc_id = quote(filename, safe="")
    upload = elastic_request(method=requests.put,
                             url=f"players-handbook/_doc/{doc_id}?pipeline=attach_and_chunk",
                             data={"data": contents})
    # Record failures instead of dropping the response, so a rejected document
    # is visible without aborting the remaining uploads.
    if not upload.ok:
        failed_uploads.append((filename, upload.status_code, upload.text))

# Displayed as the cell output; an empty list means every upload succeeded.
failed_uploads
    

In [None]:
# Create the "players-handbook-chunked" index with an explicit dense_vector
# mapping for the embedding field.
mapping = elastic_request(url="players-handbook-chunked",
                          method=requests.put,
                          data={
                            "mappings": {
                                "dynamic": True,
                                "properties": {
                                    # Must match the `output_field` written by the
                                    # clean_and_embed pipeline and the `field` used
                                    # by the kNN query. The previous mapping was on
                                    # "vector.predicted_value", which neither of
                                    # them touches.
                                    "passage-embedding": {
                                        "type": "dense_vector",
                                        "index": True,
                                        "dims": 1536,
                                        "similarity": "cosine"
                                    }
                                }
                            }
                          })
mapping.json()

In [None]:
# Re-index every passage of every "players-handbook" document as its own
# document in "players-handbook-chunked", running each through clean_and_embed.
data = {
    "query": {
        "match_all": {}
    },
    "size": 10000,
    "sort": [
        {"attachment.date": "asc"}
    ],
    # Only `passages` (plus the hit's _id) is used below, so skip the rest of
    # each stored document.
    "_source": ["passages"]
}

r = elastic_request(url="players-handbook/_search?scroll=1m",
                    method=requests.post,
                    data=data).json()
scroll_id = r["_scroll_id"]
try:
    while len(r["hits"]["hits"]) > 0:
        page_hits = r["hits"]["hits"]
        # One (action, source) pair per passage; `section` records the parent document id.
        bulk_submit = [[{"index": {}}, {"passage": passage["text"], "section": hit["_id"]}]
                       for hit in page_hits
                       for passage in (hit["_source"].get("passages") or [])]
        # Skip the bulk call for a page without passages rather than send an empty body.
        if bulk_submit:
            payload = "\n".join([json.dumps(j) for entry in bulk_submit for j in entry])
            # use bulk endpoint to submit each paragraph as a new document
            create = requests.post("https://192.168.1.153:9200/players-handbook-chunked/_bulk?pipeline=clean_and_embed",
                                   headers={
                                       "Content-Type": "application/x-ndjson",
                                       "Accept": "application/json",
                                       "Authorization": f"ApiKey {os.getenv('ELASTIC_API_KEY')}"
                                   },
                                   verify=False,
                                   # The bulk body must end with a newline.
                                   data=f"{payload}\n")
            bulk_result = create.json()
            # Print a short summary instead of the raw hits and bulk response.
            print(f"sections={len(page_hits)} passages={len(bulk_submit)} errors={bulk_result.get('errors')}")
            if bulk_result.get("errors"):
                print([item for item in bulk_result.get("items", [])
                       if "error" in item.get("index", {})])
        r = elastic_request(url="_search/scroll",
                            data={"scroll": "1m", "scroll_id": scroll_id}).json()
        # Always continue with the id from the most recent response.
        scroll_id = r.get("_scroll_id", scroll_id)
finally:
    # Release the scroll context instead of waiting for the 1m timeout.
    elastic_request(url="_search/scroll",
                    method=requests.delete,
                    data={"scroll_id": scroll_id})

In [4]:
# Hybrid search: a keyword `match` on the passage text (all terms required,
# boost 0.4) combined with a kNN search over the stored embeddings (boost 0.3).
# The query vector is built by the "open-ai-embeddings" endpoint.
hybrid_query = {
    "query": {
        "match": {
            "passage": {
                "query": "Druid capstone",
                "operator": "and",
                "boost": 0.4
            }
        }
    },
    "knn": {
        "field": "passage-embedding",
        "k": 10,
        "boost": 0.3,
        "num_candidates": 100,
        "query_vector_builder": {
            "text_embedding": {
                "model_id": "open-ai-embeddings",
                "model_text": "What are the capstone abilities for Druid?"
            }
        }
    }
}
response = elastic_request(url="players-handbook-chunked/_search", data=hybrid_query)

# Plain keyword search over the same index, for comparison.
keyword_query = {
    "query": {
        "match": {
            "passage": {
                "query": "can I cast Hunter's Mark on an entity who is Invisible?"
            }
        }
    }
}
rslt = elastic_request(url="players-handbook-chunked/_search", data=keyword_query)



In [5]:
# Unwrap the list of hits from the hybrid-search response and show how many came back.
hits_envelope = response.json()["hits"]
hits = hits_envelope["hits"]
len(hits)

10

In [6]:
# Show only the passage text of each hit.
[hit["_source"]["passage"] for hit in hits]

["DISMISS\n\n12/21/24, 9:11 PM Druid - Player's Handbook (2014) - Dungeons & Dragons - Sources - D&D Beyond\n\n2/10\n\n\n\n\xa0 — Spell Slots per Spell Level —\n\nLevel Proficiency\nBonus Features Cantrips\n\nKnown 1st 2nd 3rd 4th 5th 6th 7th 8th 9th\n\n9th +4 — 3 4 3 3 3 1 — — — —\n\n10th +4 Druid Circle Feature 4 4 3 3 3 2 — — — —\n\n11th +4 — 4 4 3 3 3 2 1 — — —\n\n12th +4 Ability Score Improvement 4 4 3 3 3 2 1 — — —\n\n13th +5 — 4 4 3 3 3 2 1 1 — —\n\n14th +5 Druid Circle Feature 4 4 3 3 3 2 1 1 — —\n\n15th +5 — 4 4 3 3 3 2 1 1 1 —\n\n16th +5 Ability Score Improvement 4 4 3 3 3 2 1 1 1 —\n\n17th +6 — 4 4 3 3 3 2 1 1 1 1\n\n18th +6 Timeless Body, Beast Spells 4 4 3 3 3 3 1 1 1 1\n\n19th +6 Ability Score Improvement 4 4 3 3 3 3 2 1 1 1\n\n20th +6 Archdruid 4 4 3 3 3 3 2 2 1 1\n\nClass Features\n\nAs a druid, you gain the following class features.\n\nHit Points\n\nHit Dice: 1d8 per druid level\n\nHit Points at 1st Level: 8 + your Constitution modifier\n\nHit Points at Higher Levels: 