##                                                    CCC - NLP to SQL generator

### Importing all the libraries

In [1]:
from dotenv import load_dotenv
import openai
import os
import langchain.llms as llms
from langchain.chains import create_sql_query_chain
import pandas as pd
import sqlite3
from langchain_community.utilities import SQLDatabase
from langchain.prompts import PromptTemplate
import openpyxl
from langchain.sql_database import SQLDatabase
from langchain_community.agent_toolkits import create_sql_agent
from langchain_community.tools.sql_database.tool import QuerySQLDataBaseTool
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

In [2]:
working_dir = "C:/Users/nithi/OneDrive - University of Illinois Chicago/Documents/Capstone project/LLM project"
os.chdir(working_dir)

print(f"The current working directory: {os.getcwd()}")

The current working directory: C:\Users\nithi\OneDrive - University of Illinois Chicago\Documents\Capstone project\LLM project


### Importing and loading the csv file to SQLite Database

In [3]:
# Load CSV file
csv_file = 'dataset_flattened.csv'
df = pd.read_csv(csv_file)  

# Save to SQLite DB
db_file = 'dataset.db'  
conn = sqlite3.connect(db_file)
table_name = 'models'
df.to_sql(table_name, conn, if_exists='replace', index=False)
conn.close()

print(f"Data from '{csv_file}' has been successfully saved to '{db_file}' in the table '{table_name}'.")


Data from 'dataset_flattened.csv' has been successfully saved to 'dataset.db' in the table 'models'.


In [4]:
# SQLite database connection
db = SQLDatabase.from_uri("sqlite:///dataset.db")

# List all the available table names in the database
print("Table Names:", db.get_usable_table_names())

# Execute the query
db.run("SELECT count(*) from models")


Table Names: ['models']


'[(4797,)]'

### Developing a chain with sample example

In [5]:
# GPT connection
llm = ChatOpenAI(model="gpt-4", temperature=0.2)

#First Chain to take the user input and convert it to a SQL query
write_query = create_sql_query_chain(llm, db)
response = write_query.invoke({"question": "What is the volume of Model 1"})
response 

'SELECT "Daily_Volume" FROM models WHERE "Model_Name" = \'Model 1\' LIMIT 5'

In [6]:
#Second chain to excute the SQL queries
execute_query = QuerySQLDataBaseTool(db=db)

chain = write_query | execute_query
chain.invoke({"question": "What is the volume of Model 1"})


'[(1218,), (1873,), (1245,), (2014,), (4559,)]'

###  Fine tuning the model with basic prompt engineering for our use case

In [7]:
# Prompt to generate SQL queries
SQl_prompt = PromptTemplate.from_template(
    """You are a SQLite expert. Given an input question, create a syntactically correct SQLite query to run, to answer the input question.
Remember, your query should aim to answer the question with a single SQL statement and limit the results appropriately. You can order the results to return the most informative data in the database.
You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again.
Never query for all columns from a table. Only query the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers.
Be careful not to query for columns that do not exist. Also, pay attention to which column is in which table.
Pay attention to use the date('now') function to get the current date if the question involves "today".

Only use the following tables:

{table_info}.

Question: {input}
Provide up to {top_k} SQL variations."""
)

# Prompt to answer the questions
answer_prompt = PromptTemplate.from_template(
    """Given the following user question, corresponding SQL query, and SQL result, answer the user question.
    
Question: {question}
SQL Query: {query}
SQL Result: {result}
Answer: """
)

In [8]:
# Initialize Chain 1: Generate SQL Query
generate_sql_chain = create_sql_query_chain(prompt=SQl_prompt, llm=llm, db=db)

# Initialize Chain 2: Execute SQL Query and generate structured answer
execute_sql_chain = answer_prompt | llm | StrOutputParser()


def execute_combined_chain(question):
    # Step 1: Generate SQL Query
    sql_query = generate_sql_chain.invoke({"question": question, "top_k": 1})
    
    # Step 2: Execute the SQL Query to get the result
    sql_result = execute_query(sql_query)  # Ensure this returns the result of executing the SQL query
    
    # Step 3: Pass the necessary inputs to the final chain and format the output to include both SQL query and result
    final_response = execute_sql_chain.invoke({"question": question, "query": sql_query, "result": sql_result})
    
    # Format the final answer to include both the SQL query and its result
    final_answer = f"SQL Query: {sql_query}\n Answer: {final_response}"
    return final_answer


In [9]:
# Example invocation
question = "what is the volume of model 1?"
final_answer = execute_combined_chain(question)
print(final_answer)

SQL Query: SELECT "Model_Name", SUM("Daily_Volume") as Total_Volume
FROM models
WHERE "Model_Name" = 'Model 1'
 Answer: The volume of model 1 is 477641.


### Further Fine tunning the model

In [None]:
db = SQLDatabase.from_uri("sqlite:///dataset.db")
print(db.dialect)
print(db.get_usable_table_names())

# Define the SQL query for the view
view_query = """
CREATE VIEW model_metrics_view AS
SELECT
    MODEL_ID,
    CASE 
        WHEN "Model_Type" IN ('Multi-Class', 'Classification') 
            THEN json_extract("Performance_Metrics", '$[0]')
    END AS recall,
    CASE 
        WHEN "Model_Type" IN ('Multi-Class', 'Classification') 
            THEN json_extract("Performance_Metrics", '$[1]')
    END AS precision,
    CASE 
        WHEN "Model_Type" IN ('Multi-Class', 'Classification') 
            THEN json_extract("Performance_Metrics", '$[2]')
    END AS accuracy,
    CASE
        WHEN "Model_type" IN ("Regression")
             THEN Performance_Metrics
    END AS MAE
FROM
    models;
"""

# Execute the query to create the view
db.run(view_query)


In [11]:
db.run('SELECT * FROM model_metrics_view')

"[(1, 0.71, 0.76, 0.81, None), (2, 0.76, 0.81, 0.81, None), (3, 0.78, 0.88, 0.93, None), (4, 0.83, 0.91, 0.74, None), (5, 0.86, 0.75, 0.89, None), (6, 0.9, 0.81, 0.75, None), (7, 0.81, 0.76, 0.8, None), (8, 0.92, 0.8, 0.73, None), (9, 0.8, 0.77, 0.86, None), (10, 0.7, 0.91, 0.71, None), (11, 0.92, 0.77, 0.81, None), (12, 0.83, 0.83, 0.7, None), (13, 0.84, 0.87, 0.78, None), (14, 0.82, 0.9, 0.7, None), (15, 0.94, 0.78, 0.87, None), (16, 0.88, 0.93, 0.76, None), (17, 0.76, 0.93, 0.81, None), (18, 0.89, 0.81, 0.83, None), (19, 0.76, 0.78, 0.88, None), (20, 0.86, 0.87, 0.71, None), (21, 0.85, 0.86, 0.73, None), (22, 0.73, 0.76, 0.89, None), (23, 0.76, 0.94, 0.86, None), (24, 0.76, 0.91, 0.72, None), (25, 0.92, 0.71, 0.92, None), (26, 0.92, 0.92, 0.92, None), (27, 0.76, 0.72, 0.82, None), (28, 0.75, 0.88, 0.83, None), (29, 0.89, 0.77, 0.77, None), (30, 0.91, 0.81, 0.79, None), (31, 0.83, 0.84, 0.79, None), (32, 0.92, 0.8, 0.89, None), (33, 0.81, 0.83, 0.75, None), (34, 0.93, 0.84, 0.7, None

In [51]:
SQl_prompt_updated = PromptTemplate.from_template(
    """You are a SQLite expert and Machine Learning Engineer. Given an input question, create a syntactically correct SQLite query to run, to answer the input question.

    **Important:** 
    * For questions about detailed performance metrics (accuracy, MAE, recall, precision), **YOU MUST** JOIN the 'models' table with the 'model_metrics_view' (which contains structured performance data) on the MODEL_ID column and **USE ONLY** columns in 'model_metrics_view' to get accuracy, recall, MAE and precsion and also add a WHERE condition based on the model_type .
    * **YOU MUST check if the model type first based on that as a ML engineer analyse which metric is suitable**
       * Regression models: Use MAE from model_metrics_view
       * Classification models: Use accuracy, precision, recall from model_metrics_view.
    * If the question might yield multiple results, ensure your query returns ALL relevant entries.  Only limit output if the user explicitly asks for a 'best' result.
    * Remember, your query should aim to answer the question with a single SQL statement and limit the results appropriately. You can order the results to return the most informative data in the database.
    * **Check Data Availability:** Before providing a final response, ensure that the data referred to in the query (Model_Name, date, company) likely exists in the database. If potential issues are detected, indicate the problem and suggest alternatives for the user instead of executing the query.
    **Before Execution:** Double-check your query for errors. Is the JOIN with 'model_metrics_view' used when appropriate?  Does the query structure seem likely to return the correct number of results? If you identify any potential issues, rewrite the query and rerun the corrected query. 

**Important:** Pay close attention to relationships between tables and views.  JOIN tables using appropriate conditions.  

    **Schema Summary:**
    * models table: MODEL_ID, Model_Name, Model_Type, ... 
    * model_metrics_view: MODEL_ID, recall, precision, accuracy, MAE

    **Example:**  
    Question: Which models have recall greater than 0.9?
    SELECT Model_Name 
         FROM models 
         JOIN model_metrics_view ON models.MODEL_ID = model_metrics_view.MODEL_ID
         WHERE Model_type IN ('Multi-Class', 'Classification') and accuracy > 0.9;
    
    Question: What is the accuracy of Model 10?
     SELECT accuracy 
         FROM models 
         JOIN model_metrics_view ON models.MODEL_ID = model_metrics_view.MODEL_ID
         WHERE lower(Model_Name) = 'model 10' and Model_type IN ('Multi-Class', 'Classification');
    Answer: Model 10 is a Regression model type so it accuracy is not the right metric to calculate performace of the model

    Question: What is the volume of Model A in CCC company ?
    SELECT Daily_Volume 
         FROM models 
         WHERE lower(Model_Name) = 'model a' and lower(Company_Name) == 'ccc';
    Answer: It seems there is no model named 'Model A' and no company called CCC in the database.  Can you please try with a different model and company name?
    
    Only use the following tables: {table_info}.
    Question: {input}.
    Generate up to {top_k} SQL queries to answer the question.
    """
)

# Prompt to answer the questions
answer_prompt = PromptTemplate.from_template(
    """Given the following user question, corresponding SQL query, and SQL result, answer the user question in conversational tone.
      If the SQL result is empty, provide a helpful message indicating that no matching data was found.  

Question: {question}
SQL Query: {query}
SQL Result: {result}
Answer: """
)

In [52]:
# Initialize Chain 1: Generate SQL Query
generate_sql_chain = create_sql_query_chain(prompt=SQl_prompt_updated, llm=llm, db=db)

# Initialize Chain 2: Execute SQL Query and generate structured answer
execute_sql_chain = answer_prompt | llm | StrOutputParser()


def execute_combined_chain(question):
    # Step 1: Generate SQL Query
    sql_query = generate_sql_chain.invoke({"question": question, "top_k": 1})
    
    # Step 2: Execute the SQL Query to get the result
    sql_result = execute_query(sql_query)  # Ensure this returns the result of executing the SQL query
    
    # Step 3: Pass the necessary inputs to the final chain and format the output to include both SQL query and result
    final_response = execute_sql_chain.invoke({"question": question, "query": sql_query, "result": sql_result})
    
    # Format the final answer to include both the SQL query and its result
    final_answer = f"SQL Query: {sql_query}\n Answer: {final_response}"
    return final_answer

In [54]:
# Example invocation
question = "which model version is better in each model" 
final_answer = execute_combined_chain(question)
print(final_answer)

RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for gpt-4 in organization org-K61Z2rr8Hrc4KFIqH9VyWora on tokens per min (TPM): Limit 10000, Requested 47110. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [None]:
"Which model in production has the best performance for Nationwide?"
"Accuracy of model 10"
"Which model would you sugest for nationwide"
"which model version is better in each model" 
"give company wise best performing model"