In [1]:
from omegaconf import OmegaConf
from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv())
import os
from pathlib import Path
import openai

# Read the OpenAI key from the environment (populated by load_dotenv above).
# The conventional variable name is OPENAI_API_KEY; the original spelling
# "OPENA_AI_KEY" is kept as a fallback so an existing .env that uses it
# keeps working.
openai.api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENA_AI_KEY")
# import weaviate

# from weaviate.auth import AuthApiKey
# from llama_index.vector_stores.milvus import MilvusVectorStore

from RAG.WO_notebooks.src.rag_utils import (
    load_data_to_sql_db,
    text_to_query_engine,
    setup_query_engines,
    get_retry_guideline_response,
    evaluate_and_transform_query,
    load_index_from_weaviate, )
# -----------------------------------------------------------------------------
# llama_index imports
# --------------------------------------------------------------------------------
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.settings import Settings


from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, VectorStoreIndex

import chromadb

In [2]:
# Identifiers of the chat model and the embedding model used in this notebook.
model_name = "gpt-3.5-turbo"
embedding_model_name = "text-embedding-3-large"
# embedding_model_name="local:BAAI/bge-small-en-v1.5"

# Instantiate both models once, then register them on the llama_index
# `Settings` object (llm first, embedding model second).
llm = OpenAI(model=model_name, temperature=0.1)
embed_model = OpenAIEmbedding(model=embedding_model_name)
Settings.llm, Settings.embed_model = llm, embed_model

In [5]:
# Columns whose values are joined into the text of each Document
# (one "<column>: <value>" line per column).
columns_to_embed = [
    "OriginalShorttext",
    "MajorSystem",
    "Part",
    "Action",
    "FM",
    "Location",
    "Comments",
    "FuncLocation",
]

# Columns copied into each Document's metadata.
# NOTE: every entry needs its own trailing comma. Without the comma after
# "BscStartDate", Python concatenates it with the next literal into the
# single, non-existent column name "BscStartDatePMType", and both real
# columns are then silently skipped.
columns_to_metadata = [
    "BscStartDate",
    "PMType",
    "Asset",
    "Cost",
    "RunningTime",
    "Variant",
    "SuspSugg",
    "Rule",
]

# Name of the SQLite table that receives the work-order data.
table_name = "work_order_table"

# Every artefact lives under one data directory; change it here only.
data_dir = "/Users/hamidadesokan/Dropbox/2_Skill_Development/DLML/genai_applications/embeddings/RAG/WO_notebooks/data"
# SQLite database file.
dbpath = f"{data_dir}/wo_data.db"
# Directory of the persistent Chroma store.
chromadbpath = f"{data_dir}/chroma_db"
# Path to the raw dataset (CSV).
wo_data_path = f"{data_dir}/excavator_2015_cleaned_forpdl.csv"


In [4]:
from llama_index.core import Document
from sqlalchemy import create_engine
import sqlite3
import pandas as pd


def _row_to_document(row):
    """Turn one DataFrame row into a Document (embedded text + metadata).

    Metadata holds the raw values of the `columns_to_metadata` columns that
    exist in the row. The text is one "<column>: <value>" line for each
    `columns_to_embed` column that exists in the row, values stringified.
    """
    metadata = {name: row[name] for name in columns_to_metadata if name in row}
    stringified = {name: str(row[name]) for name in columns_to_embed if name in row}
    text = "\n".join(
        f"{name.strip()}: {value.strip()}" for name, value in stringified.items()
    )
    return Document(text=text, metadata=metadata)


# Load the CSV and discard the 'Unnamed: 16' column when it is present.
data = pd.read_csv(wo_data_path).drop(columns=["Unnamed: 16"], errors="ignore")

# Open the SQLite file twice: a raw sqlite3 connection and a SQLAlchemy engine.
conn = sqlite3.connect(dbpath)
engine = create_engine(f"sqlite:///{dbpath}")

# (Re)create the table from the DataFrame, replacing any previous contents.
data.to_sql(table_name, conn, if_exists="replace", index=False)

# One Document per work-order row.
docs = [_row_to_document(row) for _, row in data.iterrows()]

# A single Document holding every row's text, rows separated by a blank line.
# It carries no metadata; only the per-row entries in `docs` do.
document = Document(text="\n\n".join(doc.text for doc in docs))

# Names produced by this cell for later cells: conn, engine, data, docs, document.


In [8]:
# Open (or create) the on-disk Chroma database and the "WorkOrder" collection,
# then wrap the collection so llama_index can use it as a vector store.
db = chromadb.PersistentClient(path=chromadbpath)
chroma_collection = db.get_or_create_collection("WorkOrder")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The collection persists between runs, so embedding unconditionally would
# append another copy of the corpus (and pay for the embeddings again) every
# time this cell is executed. Only populate the collection when it is empty.
# To force a rebuild after the data changes, delete the collection first.
if chroma_collection.count() == 0:
    VectorStoreIndex.from_documents(
        [document], storage_context=storage_context, embed_model=embed_model
    )

# load from chroma
work_order_index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=embed_model,
)

# Keep the name `index` defined; it now refers to the index loaded from
# the same vector store.
index = work_order_index

In [29]:
from llama_index.core import SimpleDirectoryReader

# Directory containing the additional work-order files to read.
new_wo_path = "/Users/hamidadesokan/Dropbox/2_Skill_Development/DLML/genai_applications/embeddings/RAG/WO_notebooks/data/WO/"

# Create the reader first, then load what it finds into `documents`.
wo_reader = SimpleDirectoryReader(new_wo_path)
documents = wo_reader.load_data()

In [41]:

# chroma_client = chromadb.EphemeralClient()
# chroma_collection = chroma_client.create_collection("WorkOrder")
# set up ChromaVectorStore and load in data
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# work_order_index = VectorStoreIndex.from_documents(
#     [document], storage_context=storage_context, embed_model=embed_model)



# 
# vector_store = MilvusVectorStore(dim=1536, overwrite=True, collection_name="work_order_table")
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# work_order_index = VectorStoreIndex.from_documents(
#     documents, storage_context=storage_context
# )


In [10]:
# Reload the work-order data through the project helper. This rebinds
# conn, engine, data and document to whatever the helper returns.
conn, engine, data, document = load_data_to_sql_db(
    wo_data_path,
    dbpath,
    table_name,
    columns_to_embed,
    columns_to_metadata,
)

# Only the second item returned by text_to_query_engine is kept.
_, sql_database = text_to_query_engine(
    model_name,
    embedding_model_name,
    table_name,
    engine,
)

# Build the query engine from the SQL database and the vector index.
# NOTE(review): presumably this routes each question to SQL or vector
# retrieval - confirm in rag_utils.setup_query_engines.
query_router_engine = setup_query_engines(
    sql_database,
    work_order_index,
    table_name,
)

In [11]:
# Ask for the cost of one specific work order, with the guideline
# option switched off.
query_string = """
How much did the work order 
 that result from BUCKET WON'T OPEN cost?"""
response = get_retry_guideline_response(
    query_router_engine,
    query_string,
    guideline=False,
)
print(response)

The work order that resulted from the issue of the BUCKET WON'T OPEN cost $183.05.


In [15]:
# Ask for an aggregate over a whole year, with the guideline option
# switched off.
query_string = "How much in total did we spend on work order in 2004?"
response = get_retry_guideline_response(
    query_router_engine,
    query_string,
    guideline=False,
)
print(response)

The provided context information does not contain any details or references to work orders or expenses incurred in 2004.


In [14]:
# Cross-check with pandas: sum Cost over the rows whose BscStartDate string
# starts with "2004".
data.loc[data["BscStartDate"].str.startswith("2004"), "Cost"].sum()

465522.38