In [1]:
import json
import uuid

import anthropic
import pandas as pd
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

In [2]:
# Parse the JSON file into Python objects. The indexing cells further down
# iterate over `docs` and call `doc.get(...)` on every entry, which requires
# parsed records rather than the raw file text.
with open('./data/documents.json', 'r', encoding='utf-8') as f:
    docs = json.load(f)

## Indexing Stage

In [25]:
# Identifier of the pretrained sentence-transformers checkpoint that is
# handed to `SentenceTransformer` to build the embedding model.
model_name = "multi-qa-MiniLM-L6-cos-v1"

In [27]:
# Instantiate the embedding model selected by `model_name`.
model = SentenceTransformer(model_name_or_path=model_name)



In [28]:
# Embed a single sample question to inspect what the model returns.
v = model.encode(
    sentences='Do I need to setup a local development environment?',
)

In [29]:
# Length of the sample embedding; the `dense_vector` fields in the index
# mapping are declared with the same number of `dims`.
len(v)

384

In [31]:
# Index definition: one shard, no replicas, a keyword `id`, flat text fields
# for the page content and its headers, and three cosine-similarity
# dense-vector fields of 384 dimensions each.
#
# An earlier draft of this mapping nested header_1..header_3 under a
# `metadata` object and declared only `page_content_vector`; the flat layout
# below replaced it.
#
# NOTE(review): the encoding loop reads header_1..header_5, while only
# header_1..header_3 are mapped explicitly here -- confirm whether header_4
# and header_5 should be declared as well.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            # full-text fields
            **{
                text_field: {"type": "text"}
                for text_field in ("page_content", "header_1", "header_2", "header_3")
            },
            # embedding fields: each gets its own, independent definition dict
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "page_content_vector",
                    "metadata_vector",
                    "combined_vector",
                )
            },
        },
    },
}

index_name = "contributing_h4la"

In [32]:
# Client for the Elasticsearch node reachable over plain HTTP on localhost:9200.
es_client = Elasticsearch(hosts="http://localhost:9200")

In [33]:
# Start from a clean slate so the cell can be re-run: drop the index first
# (`ignore_unavailable=True` keeps the call from failing when the index does
# not exist yet), then create it again from `index_settings`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

# Pass settings and mappings as their own keyword arguments rather than via
# the catch-all `body`, matching the keyword-style `document=` call used when
# the documents are indexed. The create response is the cell's output.
es_client.indices.create(
    index=index_name,
    settings=index_settings["settings"],
    mappings=index_settings["mappings"],
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'contributing_h4la'})

In [36]:
# Header fields folded into the metadata text. Built once, since the list is
# the same for every document.
header_keys = [f'header_{i}' for i in range(1, 6)]

for doc in tqdm(docs):
    # extract content from doc; `or ''` covers both a missing key and an
    # explicit null, so such a document no longer breaks the text joins below
    content = doc.get('page_content') or ''

    # keep only the headers that are present and non-empty, so the text is
    # not padded with runs of spaces and a null header value cannot break
    # the join
    headers = ' '.join(doc[key] for key in header_keys if doc.get(key))

    # combine headers and content for full text encoding, skipping whichever
    # part is empty
    combined_text = ' '.join(part for part in (headers, content) if part)

    # encode content and headers; the vectors are stored on the document
    # itself (the dicts in `docs` are modified in place)
    doc['page_content_vector'] = model.encode(content)
    doc['metadata_vector'] = model.encode(headers)
    doc['combined_vector'] = model.encode(combined_text)

  0%|          | 0/60 [00:00<?, ?it/s]

In [38]:
# Write each entry of `docs` to the index, one request per document.
for doc in tqdm(docs):
    es_client.index(
        document=doc,
        index=index_name,
    )

  0%|          | 0/60 [00:00<?, ?it/s]