## Load environment variables

In [2]:
# Read the connection settings from the Streamlit secrets file into the process environment
from dotenv import load_dotenv
load_dotenv(dotenv_path="../.streamlit/secrets.toml")

True

## Get database information using langchain.SQLDatabase
The Snowflake database has 43 tables; only the three tables listed below are loaded.

In [5]:
import os
from snowflake.sqlalchemy import URL
from langchain import OpenAI, SQLDatabase, SQLDatabaseChain

# Build the Snowflake connection URI; each field is read from the
# environment variable of the same name.
uri_snow = URL(
    **{
        field: os.getenv(field)
        for field in (
            "account",
            "user",
            "password",
            "database",
            "schema",
            "warehouse",
            "role",
        )
    }
)

# Restrict the database wrapper to the three tables of interest
tables = [
    "ecdc_global",
    "goog_global_mobility_report",
    "databank_demographics",
]
db = SQLDatabase.from_uri(uri_snow, include_tables=tables)
# To expose every table instead, drop the include_tables argument:
# db = SQLDatabase.from_uri(uri_snow)

In [7]:
# Fetch the table description string: for each selected table, the CREATE TABLE
# statement followed by a commented block with 3 sample rows
db_info = db.table_info
# Bare expression so the notebook displays the raw string
db_info

'\nCREATE TABLE databank_demographics (\n\tiso3166_1 VARCHAR(16777216), \n\tiso3166_2 VARCHAR(16777216), \n\tfips VARCHAR(16777216), \n\tlatitude FLOAT, \n\tlongitude FLOAT, \n\tstate VARCHAR(16777216), \n\tcounty VARCHAR(16777216), \n\ttotal_population DECIMAL(38, 0), \n\ttotal_male_population DECIMAL(38, 0), \n\ttotal_female_population DECIMAL(38, 0), \n\tcountry_region VARCHAR(250)\n)\n\n/*\n3 rows from databank_demographics table:\niso3166_1\tiso3166_2\tfips\tlatitude\tlongitude\tstate\tcounty\ttotal_population\ttotal_male_population\ttotal_female_population\tcountry_region\nAF\tNone\tNone\t33.0\t65.0\tNone\tNone\t37172386\t19093281\t18079105\tAfghanistan\nAL\tNone\tNone\t41.0\t20.0\tNone\tNone\t2866376\t1460043\t1406333\tAlbania\nDZ\tNone\tNone\t28.0\t3.0\tNone\tNone\t42228429\t21332000\t20896429\tAlgeria\n*/\n\n\nCREATE TABLE ecdc_global (\n\tcountry_region VARCHAR(16777216), \n\tcontinentexp VARCHAR(16777216), \n\tiso3166_1 VARCHAR(2), \n\tcases FLOAT, \n\tdeaths FLOAT, \n\tcase

In [8]:
# save database info into
with open("../docs/database_info.txt", "w") as fp:
    fp.write(db_info)

In [9]:
# read file
with open("../docs/database_info.txt", "r") as fp:
    text_file = fp.read()

### Create LangChain documents from the tables' DDL

In [10]:
from langchain.text_splitter import CharacterTextSplitter

# Split the description into one chunk per table. Tables are separated by two
# blank lines ("\n\n\n") in the saved text. chunk_size=0 keeps the splitter from
# merging neighbouring tables back into a single chunk; the "Created a chunk of
# size ... longer than the specified 0" warnings it logs are therefore expected.
text_splitter = CharacterTextSplitter(
    separator="\n\n\n",
    chunk_size=0,
    chunk_overlap=0,
    length_function=len,
)

# create_documents() performs the split itself and copies the metadata onto every
# chunk, so the text is split once instead of being pre-split and split again.
docs = text_splitter.create_documents(
    [text_file],
    metadatas=[{"document": "database_info.txt"}],
)
docs

Created a chunk of size 752, which is longer than the specified 0
Created a chunk of size 769, which is longer than the specified 0


[Document(page_content='CREATE TABLE databank_demographics (\n\tiso3166_1 VARCHAR(16777216), \n\tiso3166_2 VARCHAR(16777216), \n\tfips VARCHAR(16777216), \n\tlatitude FLOAT, \n\tlongitude FLOAT, \n\tstate VARCHAR(16777216), \n\tcounty VARCHAR(16777216), \n\ttotal_population DECIMAL(38, 0), \n\ttotal_male_population DECIMAL(38, 0), \n\ttotal_female_population DECIMAL(38, 0), \n\tcountry_region VARCHAR(250)\n)\n\n/*\n3 rows from databank_demographics table:\niso3166_1\tiso3166_2\tfips\tlatitude\tlongitude\tstate\tcounty\ttotal_population\ttotal_male_population\ttotal_female_population\tcountry_region\nAF\tNone\tNone\t33.0\t65.0\tNone\tNone\t37172386\t19093281\t18079105\tAfghanistan\nAL\tNone\tNone\t41.0\t20.0\tNone\tNone\t2866376\t1460043\t1406333\tAlbania\nDZ\tNone\tNone\t28.0\t3.0\tNone\tNone\t42228429\t21332000\t20896429\tAlgeria\n*/', metadata={'document': 'database_info.txt'}),
 Document(page_content='CREATE TABLE ecdc_global (\n\tcountry_region VARCHAR(16777216), \n\tcontinentexp V

### Create embeddings and store the vector database locally using Chroma and OpenAIEmbeddings
- Generate the embeddings with OpenAI from the documents created in the previous cell
- Create the Chroma database on local disk

In [13]:
# Chroma stores the vector database on local disk
import shutil

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Remove any previous database folder to avoid conflicts with stale data.
# shutil.rmtree works on every OS (the earlier `rmdir /s /q` shell command was
# Windows-only); ignore_errors=True keeps the first run, when the folder does
# not exist yet, from failing.
shutil.rmtree("../chroma_db", ignore_errors=True)

# Embedding model used to vectorise the documents
embeddings = OpenAIEmbeddings()

# Embed the documents and persist the vector database to disk
vector_store = Chroma.from_documents(docs, embeddings, persist_directory="../chroma_db")
vector_store.persist()

# Show the first stored record. By default the embeddings are not returned;
# to include them, use:
# vector_store.get(limit=1, include=['embeddings', 'documents', 'metadatas'])
vector_store.get(limit=1)

{'ids': ['890c1745-2682-11ee-a667-ac7ed0d21e7b'],
 'embeddings': None,
 'documents': ['CREATE TABLE databank_demographics (\n\tiso3166_1 VARCHAR(16777216), \n\tiso3166_2 VARCHAR(16777216), \n\tfips VARCHAR(16777216), \n\tlatitude FLOAT, \n\tlongitude FLOAT, \n\tstate VARCHAR(16777216), \n\tcounty VARCHAR(16777216), \n\ttotal_population DECIMAL(38, 0), \n\ttotal_male_population DECIMAL(38, 0), \n\ttotal_female_population DECIMAL(38, 0), \n\tcountry_region VARCHAR(250)\n)\n\n/*\n3 rows from databank_demographics table:\niso3166_1\tiso3166_2\tfips\tlatitude\tlongitude\tstate\tcounty\ttotal_population\ttotal_male_population\ttotal_female_population\tcountry_region\nAF\tNone\tNone\t33.0\t65.0\tNone\tNone\t37172386\t19093281\t18079105\tAfghanistan\nAL\tNone\tNone\t41.0\t20.0\tNone\tNone\t2866376\t1460043\t1406333\tAlbania\nDZ\tNone\tNone\t28.0\t3.0\tNone\tNone\t42228429\t21332000\t20896429\tAlgeria\n*/'],
 'metadatas': [{'document': 'database_info.txt'}]}

In [14]:
# Number of records stored in the collection.
# NOTE(review): `_collection` is an underscore-prefixed (private) attribute of the
# Chroma wrapper and may change between library versions.
vector_store._collection.count()

3

In [17]:
# Re-open the persisted database from disk and run a similarity search.
# The hits get their own name so the source `docs` list built earlier is not
# overwritten, which keeps the embedding cell safe to re-run.
vector_store = Chroma(persist_directory="../chroma_db", embedding_function=embeddings)
country_docs = vector_store.similarity_search("which table contains the names of countries")
country_docs

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


[Document(page_content='CREATE TABLE databank_demographics (\n\tiso3166_1 VARCHAR(16777216), \n\tiso3166_2 VARCHAR(16777216), \n\tfips VARCHAR(16777216), \n\tlatitude FLOAT, \n\tlongitude FLOAT, \n\tstate VARCHAR(16777216), \n\tcounty VARCHAR(16777216), \n\ttotal_population DECIMAL(38, 0), \n\ttotal_male_population DECIMAL(38, 0), \n\ttotal_female_population DECIMAL(38, 0), \n\tcountry_region VARCHAR(250)\n)\n\n/*\n3 rows from databank_demographics table:\niso3166_1\tiso3166_2\tfips\tlatitude\tlongitude\tstate\tcounty\ttotal_population\ttotal_male_population\ttotal_female_population\tcountry_region\nAF\tNone\tNone\t33.0\t65.0\tNone\tNone\t37172386\t19093281\t18079105\tAfghanistan\nAL\tNone\tNone\t41.0\t20.0\tNone\tNone\t2866376\t1460043\t1406333\tAlbania\nDZ\tNone\tNone\t28.0\t3.0\tNone\tNone\t42228429\t21332000\t20896429\tAlgeria\n*/', metadata={'document': 'database_info.txt'}),
 Document(page_content='CREATE TABLE ecdc_global (\n\tcountry_region VARCHAR(16777216), \n\tcontinentexp V

In [18]:
# Retrieve the stored table descriptions, ranked by similarity to the question
vector_store.similarity_search(query="which table country has more deaths")

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


[Document(page_content='CREATE TABLE ecdc_global (\n\tcountry_region VARCHAR(16777216), \n\tcontinentexp VARCHAR(16777216), \n\tiso3166_1 VARCHAR(2), \n\tcases FLOAT, \n\tdeaths FLOAT, \n\tcases_since_prev_day FLOAT, \n\tdeaths_since_prev_day FLOAT, \n\tpopulation FLOAT, \n\tdate DATE, \n\tlast_update_date TIMESTAMP_NTZ, \n\tlast_reported_flag BOOLEAN\n)\n\n/*\n3 rows from ecdc_global table:\ncountry_region\tcontinentexp\tiso3166_1\tcases\tdeaths\tcases_since_prev_day\tdeaths_since_prev_day\tpopulation\tdate\tlast_update_date\tlast_reported_flag\nAlbania\tEurope\tAL\t788.0\t14.0\t0.0\t0.0\t2862427.0\t2020-12-14\t2023-07-19 00:02:41.691935\tTrue\nAlbania\tEurope\tAL\t879.0\t12.0\t91.0\t-2.0\t2862427.0\t2020-12-13\t2023-07-19 00:02:41.691935\tFalse\nAlbania\tEurope\tAL\t802.0\t12.0\t-77.0\t0.0\t2862427.0\t2020-12-12\t2023-07-19 00:02:41.691935\tFalse\n*/', metadata={'document': 'database_info.txt'}),
 Document(page_content='CREATE TABLE databank_demographics (\n\tiso3166_1 VARCHAR(167772

# Create chat

In [19]:
from langchain.prompts.prompt import PromptTemplate

# Prompt that rewrites a follow-up question, given the chat history, into a
# standalone question (input variables are inferred from the template).
template_questions = """Considering the provided chat history and a subsequent question, rewrite the follow-up question to be an independent query. Alternatively, conclude the conversation if it appears to be complete.
Chat History:\"""
{chat_history}
\"""
Follow Up Input: \"""
{question}
\"""
Standalone question:"""
condense_question_prompt = PromptTemplate.from_template(template_questions)


# Prompt that answers the standalone question using the retrieved context.
template_qa = """ 
You're an AI assistant specializing in data analysis with Snowflake SQL. When providing responses, strive to exhibit friendliness and adopt a conversational tone, similar to how a friend or tutor would communicate.
When asked about your capabilities, provide a general overview of your ability to assist with data analysis tasks using Snowflake SQL, instead of performing specific SQL queries. 
Based on the question provided, if it pertains to data analysis or SQL tasks, generate SQL code that is compatible with the Snowflake environment. Additionally, offer a brief explanation about how you arrived at the SQL code. If the required column isn't explicitly stated in the context, suggest an alternative using available columns, but do not assume the existence of any columns that are not mentioned. Also, do not modify the database in any way (no insert, update, or delete operations). You are only allowed to query the database. Refrain from using the information schema.
If the question or context does not clearly involve SQL or data analysis tasks, respond appropriately without generating SQL queries. 
When the user expresses gratitude or says "Thanks", interpret it as a signal to conclude the conversation. Respond with an appropriate closing statement without generating further SQL queries.
If you don't know the answer, simply state, "I'm sorry, I don't know the answer to your question."
Write your response in markdown format.

Question: ```{question}```
{context}

Answer:
"""
prompt_qa = PromptTemplate(
    input_variables=["question", "context"],
    template=template_qa,
)

In [20]:
# Display the condense-question prompt to inspect its template and input variables
condense_question_prompt

PromptTemplate(input_variables=['chat_history', 'question'], output_parser=None, partial_variables={}, template='Considering the provided chat history and a subsequent question, rewrite the follow-up question to be an independent query. Alternatively, conclude the conversation if it appears to be complete.\nChat History:"""\n{chat_history}\n"""\nFollow Up Input: """\n{question}\n"""\nStandalone question:', template_format='f-string', validate_template=True)

In [21]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chains.question_answering import load_qa_chain

# Model and chain that write the final answer from the retrieved documents
# (pass streaming=True to ChatOpenAI to enable streamed output).
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.5, max_tokens=500)
doc_chain = load_qa_chain(llm=llm, chain_type="stuff", prompt=prompt_qa)

# Model and chain that condense the chat history and follow-up into one question
q_llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.1, max_tokens=500)
question_generator = LLMChain(llm=q_llm, prompt=condense_question_prompt)

# Conversational chain: question condensing + retrieval from the vector store + answering
conv_chain = ConversationalRetrievalChain(
    question_generator=question_generator,
    retriever=vector_store.as_retriever(),
    combine_docs_chain=doc_chain,
)

In [22]:
# First turn: start from an empty history and ask the assistant to introduce itself
chat_history = []
question = """Now to get started, please briefly introduce yourself, describe the database at a high level. Then provide 3 example questions using bullet points. this reponse without query. Write your response in markdown format."""
chain_inputs = {"question": question, "chat_history": chat_history}
result = conv_chain(chain_inputs)
answer = result["answer"]

# Keep this exchange as the chat history
chat_history = [(question, answer)]

# Print the assistant's reply
print(answer)

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Hello! I'm an AI assistant specializing in data analysis with Snowflake SQL. I can help you with various data analysis tasks using Snowflake SQL. Here's a brief overview of my capabilities:

- I can retrieve data from tables in the database and perform various data manipulation operations such as filtering, sorting, and aggregating.
- I can join tables together based on common columns to combine data from multiple sources.
- I can perform calculations and transformations on the data using SQL functions.
- I can generate reports and visualizations based on the data.

Now, let's move on to the example questions:

1. What is the total population of each country in the "databank_demographics" table?
2. How many cases and deaths were reported in each country on a specific date in the "ecdc_global" table?
3. How did the mobility change for different categories (grocery and pharmacy, parks, residential, etc.) in a specific region on different dates in the "goog_global_mobility_report" table?


In [23]:
# Independent question: an empty history is passed, so the previous turn is not used
result = conv_chain(
    dict(
        question="What is the total number of COVID-19 cases and deaths for each country in the `ecdc_global` table?",
        chat_history=[],
    )
)
answer = result["answer"]
print(answer)


Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


To obtain the total number of COVID-19 cases and deaths for each country in the `ecdc_global` table, you can use the following SQL code:

```sql
SELECT country_region, SUM(cases) AS total_cases, SUM(deaths) AS total_deaths
FROM ecdc_global
GROUP BY country_region;
```

This code uses the `GROUP BY` clause to group the data by the `country_region` column. Then, it calculates the sum of the `cases` and `deaths` columns for each group using the `SUM` function. The result will include the `country_region`, `total_cases`, and `total_deaths` columns.

Please note that this code assumes that the `ecdc_global` table contains the relevant data and columns mentioned in the question.
