## Load environment variable

In [2]:
# Load environments variables
from dotenv import load_dotenv
load_dotenv("../.streamlit/secrets.toml")

True

In [3]:
# read file
with open("../docs/database_info.txt", "r") as fp:
    text_file = fp.read()

### Create langchain documents from tables DDL 

In [4]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(        
    separator = "\n\n\n",
    chunk_size = 0,
    chunk_overlap = 0,
    length_function = len
)

# Split document
# texts_split = text_file.split("\n\n\n")
texts_split = text_splitter.split_text(text_file)
metadatas = [ {"document": "database_info.txt"} for _ in texts_split]
docs = text_splitter.create_documents(texts_split, metadatas)
docs

Created a chunk of size 922, which is longer than the specified 0
Created a chunk of size 943, which is longer than the specified 0
Created a chunk of size 953, which is longer than the specified 0
Created a chunk of size 478, which is longer than the specified 0


[Document(page_content='CREATE TABLE contact (\n\tcontact_key DECIMAL(38, 0) NOT NULL, \n\tcontact_id VARCHAR(30), \n\tcontact_name VARCHAR(100), \n\tclient_id VARCHAR(30), \n\tclient_name VARCHAR(100), \n\ttier VARCHAR(10), \n\tregion VARCHAR(50), \n\tactive_flag VARCHAR(1), \n\tCONSTRAINT "SYS_CONSTRAINT_4bb18bf2-e5e8-4cf0-b4eb-ad7e20161aef" PRIMARY KEY (contact_key)\n)\n\n/* \nrelationships:\ntable contact has relationship on column contact.contact_key with table interaction.contact_key\ntable contact has relationship on column contact.contact_key with table priority_contact.contact_key\ntable contact has relationship on column contact.contact_key with table readership.contact_key\n*/\n\n/*\n3 rows from contact table:\ncontact_key\tcontact_id\tcontact_name\tclient_id\tclient_name\ttier\tregion\tactive_flag\n1\tCT101\tJohn Doe\tCL001\tABC Corp\tGold\tNorth America\tY\n2\tCT102\tJane Smith\tCL002\tXYZ Corp\tSilver\tEurope\tY\n3\tCT103\tMike Johnson\tCL003\t123 Industries\tBronze\tAsia

### Create embeddings and save database in local using Chroma and OpenAIEmbeddings
- Generate the embeddings using openAI with documents from above cell 
- Create the database in Chroma (local)

In [5]:
# import Chroma Library that allow to store vector database in local
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# remove folder and avoid conflicts
! rmdir /s /q "../chroma_db"

# create object for embeddings using OpenAI
embeddings = OpenAIEmbeddings()

# create database and save embeddings in local
vector_store = Chroma.from_documents(docs, embeddings, persist_directory="../chroma_db")
vector_store.persist()

# get first embedding in database 
# vector_store.get(limit=1, include=['embeddings', 'documents', 'metadatas'])
vector_store.get(limit=1)

{'ids': ['424ccd61-281d-11ee-a961-ac7ed0d21e7b'],
 'embeddings': None,
 'documents': ['CREATE TABLE contact (\n\tcontact_key DECIMAL(38, 0) NOT NULL, \n\tcontact_id VARCHAR(30), \n\tcontact_name VARCHAR(100), \n\tclient_id VARCHAR(30), \n\tclient_name VARCHAR(100), \n\ttier VARCHAR(10), \n\tregion VARCHAR(50), \n\tactive_flag VARCHAR(1), \n\tCONSTRAINT "SYS_CONSTRAINT_4bb18bf2-e5e8-4cf0-b4eb-ad7e20161aef" PRIMARY KEY (contact_key)\n)\n\n/* \nrelationships:\ntable contact has relationship on column contact.contact_key with table interaction.contact_key\ntable contact has relationship on column contact.contact_key with table priority_contact.contact_key\ntable contact has relationship on column contact.contact_key with table readership.contact_key\n*/\n\n/*\n3 rows from contact table:\ncontact_key\tcontact_id\tcontact_name\tclient_id\tclient_name\ttier\tregion\tactive_flag\n1\tCT101\tJohn Doe\tCL001\tABC Corp\tGold\tNorth America\tY\n2\tCT102\tJane Smith\tCL002\tXYZ Corp\tSilver\tEurope\

In [6]:
vector_store._collection.count()

5

In [7]:
# load from disk 
vector_store = Chroma(persist_directory="../chroma_db", embedding_function=embeddings)
docs = vector_store.similarity_search("which table contains the names of countries")
docs

[Document(page_content='CREATE TABLE employee (\n\temployee_key DECIMAL(38, 0) NOT NULL, \n\temp_ms_code VARCHAR(10), \n\tuser_code VARCHAR(20), \n\temployee_name VARCHAR(100), \n\tregion VARCHAR(50), \n\tactive_flag VARCHAR(1), \n\tbusiness_unit VARCHAR(50), \n\tproducer_flag VARCHAR(1), \n\tCONSTRAINT "SYS_CONSTRAINT_73534a84-ffab-479a-9b33-b415af0645d0" PRIMARY KEY (employee_key)\n)\n\n/* \nrelationships:\ntable employee has relationship on column employee.employee_key with table interaction.employee_key\ntable employee has relationship on column employee.employee_key with table priority_contact.employee_key\ntable employee has relationship on column employee.employee_key with table readership.employee_key\n*/\n\n/*\n3 rows from employee table:\nemployee_key\temp_ms_code\tuser_code\temployee_name\tregion\tactive_flag\tbusiness_unit\tproducer_flag\n1\tMS001\tUC001\tJohn Doe\tNorth America\tY\tSales\tY\n2\tMS002\tUC002\tJane Smith\tEurope\tY\tMarketing\tN\n3\tMS003\tUC003\tMike Johnso

In [8]:
# search document which better match from question
vector_store.similarity_search("readership per day")

[Document(page_content='CREATE TABLE readership (\n\treadership_key DECIMAL(38, 0) NOT NULL, \n\tdocument_key DECIMAL(38, 0) NOT NULL, \n\tcontact_key DECIMAL(38, 0) NOT NULL, \n\tdoc_read_datetime TIMESTAMP_NTZ, \n\tdocument_id VARCHAR(255), \n\theadline VARCHAR(255), \n\tpublished_date TIMESTAMP_NTZ, \n\tapplication_name VARCHAR(255), \n\treport_type VARCHAR(100), \n\temployee_key DECIMAL(38, 0), \n\tticker VARCHAR(20), \n\tdocument_region VARCHAR(200), \n\tCONSTRAINT "SYS_CONSTRAINT_d22921d3-f6c8-47ce-8e17-fa284b914c90" PRIMARY KEY (readership_key),\n\tFOREIGN KEY (contact_key) REFERENCES contact(contact_key),\n\tFOREIGN KEY (employee_key) REFERENCES employee(employee_key)\n)\n\n/*\n3 rows from readership table:\nreadership_key\tdocument_key\tcontact_key\tdoc_read_datetime\tdocument_id\theadline\tpublished_date\tapplication_name\treport_type\temployee_key\tticker\tdocument_region\n0\t6\t5\t2023-06-02 00:00:00\tDoc-890\tCloud Computing and Security\t2023-07-08 09:30:20\tNone\tRESEARC

# Create chat

In [9]:
from langchain.prompts.prompt import PromptTemplate

template_questions = """Considering the provided chat history and a subsequent question, rewrite the follow-up question to be an independent query. Alternatively, conclude the conversation if it appears to be complete.
Chat History:\"""
{chat_history}
\"""
Follow Up Input: \"""
{question}
\"""
Standalone question:"""


template_qa = """ 
You're an AI assistant specializing in data analysis with Snowflake SQL. When providing responses, strive to exhibit friendliness and adopt a conversational tone, similar to how a friend or tutor would communicate.
When asked about your capabilities, provide a general overview of your ability to assist with data analysis tasks using Snowflake SQL, instead of performing specific SQL queries. 
Based on the question provided, if it pertains to data analysis or SQL tasks, generate SQL code that is compatible with the Snowflake environment. Additionally, offer a brief explanation about how you arrived at the SQL code. If the required column isn't explicitly stated in the context, suggest an alternative using available columns, but do not assume the existence of any columns that are not mentioned. Also, do not modify the database in any way (no insert, update, or delete operations). You are only allowed to query the database. Refrain from using the information schema.
If the question or context does not clearly involve SQL or data analysis tasks, respond appropriately without generating SQL queries. 
When the user expresses gratitude or says "Thanks", interpret it as a signal to conclude the conversation. Respond with an appropriate closing statement without generating further SQL queries.
If you don't know the answer, simply state, "I'm sorry, I don't know the answer to your question."
Write your response in markdown format.

Question: ```{question}```
{context}

Answer:
"""

condense_question_prompt = PromptTemplate.from_template(template_questions)
prompt_qa = PromptTemplate(template=template_qa, input_variables=["question", "context"])

In [10]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chains.question_answering import load_qa_chain

q_llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-16k",
    temperature=0.1,
    max_tokens=500
)

llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.5,
    max_tokens=500,
    # streaming=True,
)

question_generator = LLMChain(llm=q_llm, prompt=condense_question_prompt)
doc_chain = load_qa_chain(llm=llm, chain_type="stuff", prompt=prompt_qa)

conv_chain = ConversationalRetrievalChain(
    retriever=vector_store.as_retriever(),
    combine_docs_chain=doc_chain,
    question_generator=question_generator
)

In [11]:
chat_history = []
question = """Now to get started, please briefly introduce yourself, describe the database at a high level. Then provide 3 example questions using bullet points. this reponse without query. Write your response in markdown format."""
result = conv_chain(
            {"question": question, "chat_history": chat_history}
        )
answer = result["answer"]

# store the response in chat history
chat_history = [(question, answer)]

# show answer
print(answer)

I am an AI assistant specializing in data analysis with Snowflake SQL. I can help you analyze data stored in the database by writing SQL queries in the Snowflake environment. Here is a brief overview of the database:

- The database consists of four tables: `contact`, `employee`, `interaction`, and `readership`.
- The `contact` table stores information about contacts such as their ID, name, client ID, client name, tier, region, and active flag.
- The `employee` table contains information about employees including their ID, MS code, user code, name, region, active flag, business unit, and producer flag.
- The `interaction` table records interactions between contacts and employees. It includes information such as interaction ID, start date, duration, region, live interaction flag, employee key, contact key, and interaction type.
- The `readership` table tracks document readership by contacts and employees. It stores data such as readership key, document key, contact key, read datetime, d

In [12]:
result = conv_chain(
            {"question": "top 5 contacts name who read most", "chat_history": []}
        )
answer = result["answer"]
print(answer)

To find the top 5 contacts who read the most, we can use the readership table to count the number of times each contact appears and then sort them in descending order. Here's the SQL code to achieve that:

```sql
SELECT c.contact_name, COUNT(*) AS read_count
FROM readership r
JOIN contact c ON r.contact_key = c.contact_key
GROUP BY c.contact_name
ORDER BY read_count DESC
LIMIT 5;
```

In this code, we join the readership table with the contact table using the contact_key column as the common column. We then group the result by the contact_name column and use the COUNT(*) function to count the number of occurrences for each contact. Finally, we sort the result in descending order by the read_count and limit the output to the top 5 contacts.

Please note that the readership table is joined with the contact table using the contact_key column, which is the foreign key in the readership table referencing the primary key in the contact table. The contact_name column is selected from the cont