In [2]:

# perpare vector database from pdf
# sample from comments-energy.csv file to test
from pyprojroot import here
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
import os
import yaml
import sys
sys.path.append(str(here()))
from src.utils.prepare_vectordb_from_pdf import PrepareVectorDB
print("Environment variables are loaded:", load_dotenv())

data_dir = Path(here("Data"))

if __name__ == "__main__":
    os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")

    with open(here("configs/app_config.yml")) as cfg:
        app_config = yaml.load(cfg, Loader=yaml.FullLoader)

    # Uncomment the following configs to run for swiss airline policy document
    chunk_size = app_config["content_table_rag_config"]["chunk_size"]
    chunk_overlap = app_config["content_table_rag_config"]["chunk_overlap"]
    embedding_model = app_config["content_table_rag_config"]["embedding_model"]
    vectordb_dir = app_config["content_table_rag_config"]["vectordb"]
    collection_name = app_config["content_table_rag_config"]["collection_name"]
    doc_dir = app_config["content_table_rag_config"]["unstructured_docs"]

    prepare_db_instance = PrepareVectorDB(
        doc_dir=doc_dir,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        embedding_model=embedding_model,
        vectordb_dir=vectordb_dir,
        collection_name=collection_name)

    prepare_db_instance.run()


Environment variables are loaded: True
Directory 'data/table_definitions_vectordb' was created.
VectorDB is created and saved.
Number of vectors in vectordb: 96 




In [7]:
collection_name

'table-definitions-rag-chroma'

In [82]:
from openai import OpenAI
import os
import chromadb
# Set the OpenAI API key
model_name = "gpt-3.5-turbo"
openai_api_key = os.environ["OPENAI_API_KEY"]

# instantiate the OpenAI client
client = OpenAI()

In [83]:
from src.utils.load_config import LoadConfig
cfg = LoadConfig()
print("tenant:", getattr(cfg.chroma_client, "tenant", None))
print("database:", getattr(cfg.chroma_client, "database", None))

collections = cfg.chroma_client.list_collections()
print("collections:", collections)
print("Available collections:")
for c in collections:
    print("-", c)

tenant: default_tenant
database: default_database
collections: ['comments_energy']
Available collections:
- comments_energy


In [84]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

with open(here("configs/app_config.yml")) as cfg:
    app_config = yaml.load(cfg, Loader=yaml.FullLoader)

chunk_size = app_config["content_table_rag_config"]["chunk_size"]
chunk_overlap = app_config["content_table_rag_config"]["chunk_overlap"]
embedding_model = app_config["content_table_rag_config"]["embedding_model"]
vectordb_dir = app_config["content_table_rag_config"]["vectordb"]
collection_name = app_config["content_table_rag_config"]["collection_name"]

vectordb = Chroma(
    collection_name=collection_name,
    persist_directory=str(here(vectordb_dir)),
    embedding_function=OpenAIEmbeddings(model=embedding_model)
)




In [85]:
results = vectordb.similarity_search("your query here", k=3)

In [86]:
# create query embedding using langchain_openai
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
query = "Which table contains information about material applications?"
query_embedding = embedding_model.embed_query(query)
print("Query embedding:", query_embedding)

Query embedding: [-0.026560219004750252, 0.06038150563836098, 0.06665986776351929, -0.023325495421886444, -0.008298362605273724, -0.012304232455790043, 0.01295254286378622, -0.01910807192325592, -0.025481978431344032, -0.01538199745118618, -0.02075955457985401, -0.07217391580343246, 0.0031579500064253807, 0.008441672660410404, 0.01779780350625515, 0.02160576917231083, 0.026846839115023613, -0.009765589609742165, -0.04711504280567169, 0.0164602380245924, 0.012700042687356472, 0.02178320102393627, -0.04847990721464157, -0.013293758034706116, -0.0012573793064802885, 0.0003245812840759754, 0.01737469621002674, -0.014576728455722332, -0.04736071825027466, 0.020964283496141434, 0.025004275143146515, -0.021714957430958748, 0.000528884120285511, 0.028907781466841698, 0.01774320937693119, -0.02740643359720707, 0.008489442989230156, 0.01771591231226921, -0.011847004294395447, -0.019490232691168785, -0.020773202180862427, 0.0694441869854927, 0.0004254531522747129, 0.02948102355003357, -0.04599585

In [87]:
# Create query embedding using OpenAI
# instantiate the OpenAI client
client = OpenAI()
query_texts = "Which table contains information about material applications?"
response = client.embeddings.create(
        input = query_texts,
        model= "text-embedding-3-small"
    )
query_embeddings = response.data[0].embedding
print("Query embedding:", query_embedding)

Query embedding: [-0.026560219004750252, 0.06038150563836098, 0.06665986776351929, -0.023325495421886444, -0.008298362605273724, -0.012304232455790043, 0.01295254286378622, -0.01910807192325592, -0.025481978431344032, -0.01538199745118618, -0.02075955457985401, -0.07217391580343246, 0.0031579500064253807, 0.008441672660410404, 0.01779780350625515, 0.02160576917231083, 0.026846839115023613, -0.009765589609742165, -0.04711504280567169, 0.0164602380245924, 0.012700042687356472, 0.02178320102393627, -0.04847990721464157, -0.013293758034706116, -0.0012573793064802885, 0.0003245812840759754, 0.01737469621002674, -0.014576728455722332, -0.04736071825027466, 0.020964283496141434, 0.025004275143146515, -0.021714957430958748, 0.000528884120285511, 0.028907781466841698, 0.01774320937693119, -0.02740643359720707, 0.008489442989230156, 0.01771591231226921, -0.011847004294395447, -0.019490232691168785, -0.020773202180862427, 0.0694441869854927, 0.0004254531522747129, 0.02948102355003357, -0.04599585

In [88]:
results = vectordb.similarity_search_by_vector(
    embedding= query_embeddings, k = 3
)

print(results[0])

page_content='7  
 
 
ICDD PDF Database Table Definitions 
This file contains ICDD database table documentation for the current PDF2021, PLU2021, 
PLW2021, MIN2021, ORG2021 and AXM2021 Powder Diffraction File databases. The field 
“ProductID” with 9 characters in length has the PDF ID numbers used for all ICDD product 
releases. Only tables documented below will be supported in the future. Developers/Vendors 
will be notified in the future if there are changes in the DB tables. The “dba.tbl_Spacings” table 
contains data for the d-spacings, intensities, and Miller indexes. The 
“dba.tbl_Strong8_Spacings” table contains data for the 10 strongest d-spacings in descending 
intensity order and in descending interplanner spacing order. The “dba.tbl_Strong8_Spacings” 
table also contains data for the 10 largest d-spacings in descending interplanner spacing order. 
Tables “dba.tbl_LPF_Struc” and “dba.tbl_LPF_Coordinates” contain atomic coordinates and 
site crystal structure information (indi

In [89]:
### Pass the results to the OpenAI LLM
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}]

In [90]:
response = client.chat.completions.create(
    model=os.getenv("gpt_deployment_name"),
    messages=messages
)

In [91]:
response.choices[0].message.content

'The table that contains information about material applications is "tbl_Cform_Search" which includes the field "EmpericalFormula" as an nvarchar.'

# Testing a few questions

In [101]:
# create query embedding using langchain_openai
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
query = "Which table contains data on d-spacings?"
query_embedding = embedding_model.embed_query(query)
print("Query embedding:", query_embedding)

Query embedding: [-0.013506446033716202, -0.0017521358095109463, 0.10774518549442291, -0.018153276294469833, 0.009810684248805046, 0.054689615964889526, 0.015587306581437588, -0.010895795188844204, -0.015485178679227829, -0.010212813504040241, -0.027140552178025246, -0.01598305255174637, -0.007697908207774162, 0.0008194186957553029, 0.031046953052282333, 0.025838417932391167, 0.002026605186983943, 0.024268198758363724, 0.005195769015699625, 0.04920022934675217, -0.026655443012714386, -0.006481945049017668, -0.005725558381527662, -0.011023455299437046, 0.00974047090858221, 0.03541292995214462, 0.011080903001129627, -0.007078756578266621, 0.037327833473682404, 0.0021702228114008904, 0.02867247350513935, -0.031659722328186035, 0.00820216629654169, 0.007672375999391079, 0.026808636263012886, 0.0564257949590683, -0.01256176084280014, 0.06903862208127975, -0.028289493173360825, -0.00998302549123764, -0.00633194437250495, 0.03306398540735245, -0.04462999477982521, -0.004851086530834436, 0.014

In [102]:
results = vectordb.similarity_search_by_vector(
    embedding= query_embeddings, k = 3
)

print(results[0])

page_content='7  
 
 
ICDD PDF Database Table Definitions 
This file contains ICDD database table documentation for the current PDF2021, PLU2021, 
PLW2021, MIN2021, ORG2021 and AXM2021 Powder Diffraction File databases. The field 
“ProductID” with 9 characters in length has the PDF ID numbers used for all ICDD product 
releases. Only tables documented below will be supported in the future. Developers/Vendors 
will be notified in the future if there are changes in the DB tables. The “dba.tbl_Spacings” table 
contains data for the d-spacings, intensities, and Miller indexes. The 
“dba.tbl_Strong8_Spacings” table contains data for the 10 strongest d-spacings in descending 
intensity order and in descending interplanner spacing order. The “dba.tbl_Strong8_Spacings” 
table also contains data for the 10 largest d-spacings in descending interplanner spacing order. 
Tables “dba.tbl_LPF_Struc” and “dba.tbl_LPF_Coordinates” contain atomic coordinates and 
site crystal structure information (indi

In [104]:
### Pass the results to the OpenAI LLM
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}]
        
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=messages
)

response.choices[0].message.content

'The table that contains information about material applications is the "tbl_Cform_Search" table.'

In [105]:
# create query embedding using langchain_openai
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
query = "What does the table tbl_CFORM_Search include?"
query_embedding = embedding_model.embed_query(query)
print("Query embedding:", query_embedding)

Query embedding: [0.004489387851208448, 0.057177040725946426, 0.05494936183094978, -0.010643345303833485, -0.007487469352781773, 0.01725211925804615, -0.009368618950247765, 0.017115984112024307, -0.013081413693726063, 0.010402013547718525, -0.005627978127449751, -0.03819228336215019, 0.016905592754483223, 0.03735071420669556, 0.01576700247824192, 0.017549144104123116, 0.04074173420667648, 0.020692642778158188, 0.020531754940748215, 0.04576638340950012, 0.05128607153892517, -0.033241890370845795, -0.04277139529585838, 0.001227542757987976, -0.0345042385160923, -0.0334894098341465, -0.00938099529594183, -0.010550525039434433, -0.013378437608480453, 0.016608569771051407, 0.01867535710334778, -0.031038964167237282, -0.026732122525572777, -0.0014526309678331017, -0.0269548911601305, -0.012796766124665737, 0.032821107655763626, 0.030345909297466278, 0.0013211361365392804, -0.01827932707965374, -0.04257338121533394, 0.04962769150733948, 0.03096470795571804, -0.020581258460879326, -0.024331182

In [109]:
results = vectordb.similarity_search_by_vector(
    embedding= query_embeddings, k = 3
)

print(results[0])

page_content='7  
 
 
ICDD PDF Database Table Definitions 
This file contains ICDD database table documentation for the current PDF2021, PLU2021, 
PLW2021, MIN2021, ORG2021 and AXM2021 Powder Diffraction File databases. The field 
“ProductID” with 9 characters in length has the PDF ID numbers used for all ICDD product 
releases. Only tables documented below will be supported in the future. Developers/Vendors 
will be notified in the future if there are changes in the DB tables. The “dba.tbl_Spacings” table 
contains data for the d-spacings, intensities, and Miller indexes. The 
“dba.tbl_Strong8_Spacings” table contains data for the 10 strongest d-spacings in descending 
intensity order and in descending interplanner spacing order. The “dba.tbl_Strong8_Spacings” 
table also contains data for the 10 largest d-spacings in descending interplanner spacing order. 
Tables “dba.tbl_LPF_Struc” and “dba.tbl_LPF_Coordinates” contain atomic coordinates and 
site crystal structure information (indi

In [112]:
### Pass the results to the OpenAI LLM
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}]
        
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=messages
)

response.choices[0].message.content

'The table that contains information about material applications is called "dba.tbl_CForm_Search". It includes data related to physical, chemical, and crystallographic properties.'

In [113]:
### the terms in the document is very different from normal english which the model is trained on
### we need to use a fine-tuned model for this task if we want to get the correct answer

In [114]:
### Pass the results to the OpenAI LLM
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}]
        
response = client.chat.completions.create(
    model="gpt-4",
    messages=messages
)

response.choices[0].message.content

'The documents do not provide specific information on which table contains information about material applications.'