In [4]:
import os
import getpass
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.readers.google import GoogleDriveReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.cohere import CohereEmbedding
from llama_parse import LlamaParse
from llama_index.core.node_parser import MarkdownNodeParser, SentenceSplitter
from llama_index.core import VectorStoreIndex, QueryBundle, Response, Document, Settings
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.vector_stores.elasticsearch import ElasticsearchStore
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.embeddings import resolve_embed_model
from dotenv import load_dotenv

# llm = Cohere(model="command-nightly", api_key=cohere_api_key)

  from .autonotebook import tqdm as notebook_tqdm


# Parse

In [5]:
# Node parser used later to turn the parsed documents into nodes via
# node_parser.get_nodes_from_documents(...). LlamaParse is configured with
# result_type="markdown" in the cells below.
node_parser = MarkdownNodeParser()

In [20]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# LLAMA_INDEX_KEYS holds the LlamaParse API key(s). os.getenv returns a single
# string (or None), so indexing or iterating it directly would yield individual
# characters rather than keys; split it into a list of keys instead.
# NOTE(review): assumes keys are separated by commas and/or whitespace — confirm
# against the actual .env format.
api_keys = os.getenv("LLAMA_INDEX_KEYS", "").replace(",", " ").split()

# Fail here with a clear message instead of later on `api_keys[0]`.
if not api_keys:
    raise RuntimeError("LLAMA_INDEX_KEYS is not set or contains no API keys")

In [21]:
# One entry is appended to this list per processed company folder.
documents = []

# LlamaParse client built from the first element of `api_keys`.
# NOTE(review): if `api_keys` is a plain string, `api_keys[0]` is its first
# character rather than a whole key — confirm `api_keys` is a list.
parser = LlamaParse(
    num_workers=2,
    language="en",
    verbose=True,
    result_type="markdown",
    api_key=api_keys[0],
)

In [None]:
def process_company_folder(company_folder_path):
    """Parse the files of one company folder and append the result to ``documents``.

    Every key in the module-level ``api_keys`` is tried in order. The first key
    for which ``LlamaParse.load_data`` completes without raising wins, and its
    return value is appended to the module-level ``documents`` list as a single
    entry (one entry per folder).

    Args:
        company_folder_path: Path of the directory whose files are parsed.

    Raises:
        RuntimeError: If parsing raised for every key, or ``api_keys`` is
            empty. The last underlying error is chained as ``__cause__``.
    """
    pdf_paths = []
    # Sorted so the submission order is reproducible (os.listdir order is arbitrary).
    for entry in sorted(os.listdir(company_folder_path)):
        entry_path = os.path.join(company_folder_path, entry)
        # Skip macOS Finder metadata and anything that is not a regular file.
        if entry == '.DS_Store' or not os.path.isfile(entry_path):
            continue
        pdf_paths.append(entry_path)

    last_error = None
    for api_key in api_keys:
        try:
            parser = LlamaParse(
                api_key=api_key,
                result_type="markdown",
                verbose=True,
                language="en",
                num_workers=2,
            )
            loaded_docs = parser.load_data(pdf_paths)
        except Exception as exc:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts;
            # remember the failure and fall through to the next key.
            last_error = exc
            continue
        documents.append(loaded_docs)
        return

    # No key worked: report it explicitly instead of failing on an unbound local.
    raise RuntimeError(
        f"LlamaParse failed for {company_folder_path!r} with every available API key"
    ) from last_error

In [None]:
# Iterate and process company folders.
# NOTE: process_company_folder appends to `documents`, so re-running this cell
# without first re-running the cell that resets `documents` adds duplicates.
file_path = 'Data'
# Sorted for a reproducible processing order (os.listdir order is arbitrary).
for company in sorted(os.listdir(file_path)):
    company_folder_path = os.path.join(file_path, company)
    # Only directories are company folders. This skips '.DS_Store' as before and
    # also any other stray file, which would otherwise make os.listdir() raise
    # NotADirectoryError inside process_company_folder.
    if not os.path.isdir(company_folder_path):
        continue
    process_company_folder(company_folder_path)

In [None]:
# Build one flat list of nodes across all companies: each entry of `documents`
# is run through the node parser and its nodes are appended in order.
company_nodes = []
for company in documents:
    nodes = node_parser.get_nodes_from_documents(company)
    company_nodes.extend(nodes)

# Batch, Embed, Store/Rank

In [23]:
# Elasticsearch connection settings read from the environment (None when unset).
ELASTIC_CLOUD_ID = os.environ.get("ELASTIC_CLOUD_ID")
ELASTIC_API_KEY = os.environ.get("ELASTIC_API_KEY")

In [None]:
# Vector store on the "calls" index; vectors go to 'conversation_vector' and
# text to 'conversation'.
# NOTE(review): a later cell ingests the same `company_nodes` into this same
# "calls" index using a different embedding model — presumably only one of the
# two ingestion cells should be run against a given index; confirm.
es_vector_store = ElasticsearchStore(
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
    index_name="calls",
    text_field='conversation',
    vector_field='conversation_vector',
)

# Embedding step: OllamaEmbedding configured with "mistral".
ollama_embedding = OllamaEmbedding("mistral")

# Ingestion: split (chunk_size=350, chunk_overlap=50), embed, then write to the
# vector store.
pipeline = IngestionPipeline(
    vector_store=es_vector_store,
    transformations=[
        SentenceSplitter(chunk_overlap=50, chunk_size=350),
        ollama_embedding,
    ],
)

pipeline.run(documents=company_nodes)
print(".....Done running pipeline.....\n")

In [None]:
# Vector store on the "calls" index; vectors go to 'conversation_vector' and
# text to 'conversation'.
# NOTE(review): an earlier cell ingests the same `company_nodes` into this same
# "calls" index using a different embedding model — presumably only one of the
# two ingestion cells should be run against a given index; confirm.
es_vector_store = ElasticsearchStore(
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
    index_name="calls",
    text_field='conversation',
    vector_field='conversation_vector',
)

# Embedding step resolved from "local:BAAI/bge-small-en-v1.5".
# NOTE(review): the variable is still called `ollama_embedding` although it is
# not created through OllamaEmbedding here.
ollama_embedding = resolve_embed_model("local:BAAI/bge-small-en-v1.5")

# Ingestion: split (chunk_size=1000, chunk_overlap=150), embed, then write to
# the vector store.
pipeline = IngestionPipeline(
    vector_store=es_vector_store,
    transformations=[
        SentenceSplitter(chunk_overlap=150, chunk_size=1000),
        ollama_embedding,
    ],
)

pipeline.run(documents=company_nodes)
print(".....Done running pipeline.....\n")

# Test/Run Q&A

In [None]:
# Query-time embedding model; the same "local:BAAI/bge-small-en-v1.5" spec is
# used by one of the ingestion cells above.
Settings.embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")

# Local LLM to send user query to
local_llm = Ollama(
    model="llama3:instruct",
    request_timeout=60.0,
)

# Query engine over the existing vector store (`es_vector_store` must already
# be defined by an ingestion cell); retrieves with similarity_top_k=10.
index = VectorStoreIndex.from_vector_store(es_vector_store)
query_engine = index.as_query_engine(local_llm, similarity_top_k=10)

query = "How much was the quarterly cash dividend per share in the second quarter of 2022 for Google"

# Embed the question explicitly and pass the vector along with the text.
bundle = QueryBundle(
    query,
    embedding=Settings.embed_model.get_query_embedding(query),
)
result = query_engine.query(bundle)
print(result)