In [1]:
# Install the notebook's dependencies into the kernel's environment.
# NOTE(review): none of these installs pin a version. The recorded output of this
# cell shows llama-index 0.10.58 being resolved — consider pinning all packages
# together (in a single resolver run) so a fresh environment reproduces that setup.
%pip install llama-index
%pip install llama-index-readers-elasticsearch
%pip install llama-index-vector-stores-opensearch

# Packages for the Ollama embedding model and for nest_asyncio, both imported by later cells.
%pip install llama-index-embeddings-ollama
%pip install ollama
%pip install nest-asyncio

Collecting llama-index
  Downloading llama_index-0.10.58-py3-none-any.whl (6.8 kB)
Collecting llama-index-agent-openai<0.3.0,>=0.1.4 (from llama-index)
  Downloading llama_index_agent_openai-0.2.9-py3-none-any.whl (13 kB)
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama-index)
  Downloading llama_index_cli-0.1.13-py3-none-any.whl (27 kB)
Collecting llama-index-core==0.10.58 (from llama-index)
  Downloading llama_index_core-0.10.58-py3-none-any.whl (15.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting llama-index-embeddings-openai<0.2.0,>=0.1.5 (from llama-index)
  Downloading llama_index_embeddings_openai-0.1.11-py3-none-any.whl (6.3 kB)
Collecting llama-index-indices-managed-llama-cloud>=0.2.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.2.7-py3-none-any.whl (9.5 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Download

In [2]:
from llama_index.core import SimpleDirectoryReader

# Read everything under the relative "pdfs" folder, descending into
# sub-folders, and load it into `documents`.
reader = SimpleDirectoryReader(
    input_dir="pdfs",
    recursive=True,
)
documents = reader.load_data()

In [3]:
from llama_index.core.node_parser import TokenTextSplitter

# Token-based chunking: chunk size 512 with an overlap of 128, using a single
# space as the separator.
splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128, separator=" ")

# Convert the loaded documents into nodes, showing a progress bar while parsing.
nodes = splitter.get_nodes_from_documents(documents, show_progress=True)

Parsing nodes:   0%|          | 0/10 [00:00<?, ?it/s]

In [4]:
# Peek at the first node only. Echoing the whole list dumps every node's full
# repr (text, metadata, relationships) into the notebook output, which bloats
# the file and buries the narrative. Slicing is also safe when `nodes` is empty.
nodes[:1]

[TextNode(id_='f5554345-4ae4-47a8-93e0-0eadd4c2d2c9', embedding=None, metadata={'page_label': '1', 'file_name': 'article_20160531140556.pdf', 'file_path': '/Users/grizzlystudio/RAG-with-Ollama/pdfs/article_20160531140556.pdf', 'file_type': 'application/pdf', 'file_size': 301354, 'creation_date': '2024-07-27', 'last_modified_date': '2024-07-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='c343a087-b5c5-448f-9564-71236ead5719', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'article_20160531140556.pdf', 'file_path': '/Users/grizzlystudio/RAG-with-Ollama/pdfs/article_20160531140556.pdf', 'file_type': 'application/pdf', 'file_size': 301354, 'creation_date': '2024-07-27', 'l

In [5]:
from llama_index.embeddings.ollama import OllamaEmbedding

# Embedding model backed by Ollama, using the "openthaigpt" model.
embed_model = OllamaEmbedding(
    model_name="openthaigpt",
)

In [6]:
import torch

# Select CUDA when torch reports it as available, otherwise fall back to the
# CPU, and print the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [7]:
# Embed a short probe string once; the length of the returned vector is the
# embedding dimensionality, which is later passed as `dim` to the OpenSearch
# vector client.
probe_text = "box"
embeddings = embed_model.get_text_embedding(probe_text)
dim = len(embeddings)

In [9]:
# Display the embedding dimensionality (length of the probe embedding vector).
dim

4096

In [11]:
import nest_asyncio
# Apply the nest_asyncio patch before any of the vector-store cells run.
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and the OpenSearch vector store issues async calls — confirm.
nest_asyncio.apply()

In [12]:
from os import getenv
from llama_index.vector_stores.opensearch import (
    OpensearchVectorClient,
    OpensearchVectorStore,
)

# Field names used inside each OpenSearch document. These are the values
# OpensearchVectorClient uses by default for the chunk text and the embedding.
text_field = "content"
embedding_field = "embedding"

# HTTP endpoint of the cluster (OpenSearch is required for vector index usage)
# and the index used to demonstrate the VectorStore implementation. Both can be
# overridden through the environment.
endpoint = getenv("OPENSEARCH_ENDPOINT", "http://localhost:9200")
idx = getenv("OPENSEARCH_INDEX", "hybrid_pdf_index")

# OpensearchVectorClient encapsulates the logic for a single vector-enabled
# OpenSearch index; here it is configured with the "hybrid-search-pipeline"
# search pipeline.
# NOTE(review): no visible cell creates that pipeline — presumably it must
# already exist on the cluster; confirm.
client = OpensearchVectorClient(
    endpoint=endpoint, index=idx, dim=dim,
    embedding_field=embedding_field, text_field=text_field,
    search_pipeline="hybrid-search-pipeline",
)

# Wrap the client in the llama-index vector store.
vector_store = OpensearchVectorStore(client)

In [13]:
from llama_index.core import VectorStoreIndex, StorageContext

# Storage context that uses the OpenSearch vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index over the parsed nodes using the Ollama embedding model.
# NOTE(review): re-running this cell presumably inserts the same nodes into the
# OpenSearch index again — confirm before re-executing against a populated index.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=embed_model,
)

In [14]:
! curl -XGET http://localhost:9200/hybrid_pdf_index/_mapping | jq .

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1205  100  1205    0     0  35141      0 --:--:-- --:--:-- --:--:-- 35441
[1;39m{
  [0m[1;34m"hybrid_pdf_index"[0m[1;39m: [0m[1;39m{
    [0m[1;34m"mappings"[0m[1;39m: [0m[1;39m{
      [0m[1;34m"properties"[0m[1;39m: [0m[1;39m{
        [0m[1;34m"content"[0m[1;39m: [0m[1;39m{
          [0m[1;34m"type"[0m[1;39m: [0m[0;32m"text"[0m[1;39m,
          [0m[1;34m"fields"[0m[1;39m: [0m[1;39m{
            [0m[1;34m"keyword"[0m[1;39m: [0m[1;39m{
              [0m[1;34m"type"[0m[1;39m: [0m[0;32m"keyword"[0m[1;39m,
              [0m[1;34m"ignore_above"[0m[1;39m: [0m[0;39m256[0m[1;39m
            [1;39m}[0m[1;39m
          [1;39m}[0m[1;39m
        [1;39m}[0m[1;39m,
        [0m[1;34m"embedding"[0m[1;39m: [0m[1;39m{
          [0m[1;34m"type"[0m[1;39m: [0m[0;32m"

In [15]:
from llama_index.core.vector_stores.types import VectorStoreQueryMode

# Retriever over the index in HYBRID query mode, limited to the top 3 matches.
retriever = index.as_retriever(
    similarity_top_k=3,
    vector_store_query_mode=VectorStoreQueryMode.HYBRID,
)

# Thai-language query text sent to the retriever.
text_retriveve = "ที่มาของปัญหาจราจร"
prompt = retriever.retrieve(text_retriveve)

# For every retrieved result, print its metadata and then the result itself.
for r in prompt:
    print(r.metadata, r, sep="\n")

{'page_label': '6', 'file_name': 'article_20160531140556.pdf', 'file_path': '/Users/grizzlystudio/RAG-with-Ollama/pdfs/article_20160531140556.pdf', 'file_type': 'application/pdf', 'file_size': 301354, 'creation_date': '2024-07-27', 'last_modified_date': '2024-07-27'}
Node ID: 6cb67498-3874-48a3-babc-da2034c48de5
Text: หรือด้วยวิธีอื่นใด ส าหรับให้ผู้ขับขี่ คนเดินเท้า หรือคนจูง ขี่
หรือ ไล่ต้อนสัตว์ ปฏิบัติตาม สัญญาณนั้น เช่น สัญญาณไฟเขียว ไฟแดง
เป็นต้น   (๒) เครื่องหมายจราจร หมายความว่า เครื่องหมายใด ๆ ที่ได้ติด
ตั้งไว้หรือท าให้ปรากฏ ในทาง เสียส าหรับให้ผู้ขับขี่ คนเดินเท้า
หรือคนที่จูง ขี่ หรือไล่ต้อนสัตว์  ปฏิบัติตามเครื่องหมายนั้น เช่น
เครื่องหมาย   ห้ามเ...
Score:  1.000

{'page_label': '9', 'file_name': 'article_20160531140556.pdf', 'file_path': '/Users/grizzlystudio/RAG-with-Ollama/pdfs/article_20160531140556.pdf', 'file_type': 'application/pdf', 'file_size': 301354, 'creation_date': '2024-07-27', 'last_modified_date': '2024-07-27'}
Node ID: 08e0899f-a783-4626-a233-7231154cc53f
