In [1]:
# Import required libraries for database connection and data handling 
from sqlalchemy import create_engine
import pandas as pd

import os  # Import the os module to interact with the operating system
from dotenv import load_dotenv  # Import the load_dotenv function to load environment variables from a .env file

import openai  # Import the OpenAI library to interact with the OpenAI API
from openai import Client  # Import the Client class from the openai module


In [2]:


# SQLAlchemy URL for SQL Server (Windows authentication), assembled from its parts
connection_string = "".join(
    (
        "mssql+pyodbc://",  # dialect + DBAPI driver
        "DESKTOP-SNEPGI0/",  # server name
        "AdventureWorks2019?",  # database name
        "driver=ODBC+Driver+17+for+SQL+Server&",  # ODBC driver
        "trusted_connection=yes",  # use Windows authentication
    )
)

# Engine object through which every query in this notebook reaches the database
engine = create_engine(connection_string)

# Define a function to execute SQL queries
def execute_query(query, engine):
    """
    Run ``query`` against ``engine`` and return the rows as a pandas DataFrame.

    Parameters:
    query (str): SQL text to execute.
    engine: Connection/engine object accepted by ``pd.read_sql``.

    Returns:
    pandas.DataFrame on success; ``None`` if reading raised any exception
    (the error is printed rather than propagated).
    """
    try:
        return pd.read_sql(query, engine)
    except Exception as err:
        # Best-effort helper: report the problem and let the caller test for None
        print(f"Error executing query: {err}")
    return None

# Smoke-test execute_query with a simple SELECT over the sales order headers
query = (
    "\n"
    "SELECT SalesOrderID, SubTotal, TaxAmt, Freight, TotalDue\n"
    "FROM Sales.SalesOrderHeader\n"
)

result = execute_query(query, engine)

# execute_query hands back None when the read failed, so only preview real results
if result is not None:
    print("Query Result:\n", result.head())


Query Result:
    SalesOrderID    SubTotal     TaxAmt   Freight    TotalDue
0         43659  20565.6206  1971.5149  616.0984  23153.2339
1         43660   1294.2529   124.2483   38.8276   1457.3288
2         43661  32726.4786  3153.7696  985.5530  36865.8012
3         43662  28832.5289  2775.1646  867.2389  32474.9324
4         43663    419.4589    40.2681   12.5838    472.3108


In [3]:
# Check for null values in the DataFrame and print the count of nulls for each column
# print(df.isnull().sum())

In [4]:
# Identify rows with invalid values (negative SubTotal, TaxAmt, or Freight)
# invalid_values = df[(df['SubTotal'] < 0) | (df['TaxAmt'] < 0) | (df['Freight'] < 0)]
# print(invalid_values)

In [5]:
# Read the variables defined in the local .env file
load_dotenv('.env')

# Fetch the OpenAI API key (raises KeyError if it is not set) and register it
# with the openai module in one step
openai.api_key = openai_api_key = os.environ["OPENAI_API_KEY"]

In [21]:
import openai
def _strip_code_fence(text):
    """
    Return ``text`` without a surrounding Markdown code fence.

    Chat models frequently wrap SQL in ```sql ... ``` even when asked for the
    bare query, and the fence markers are not valid SQL. Text that is not
    fenced is returned with only leading/trailing whitespace removed.
    """
    import re  # local import so the helper does not depend on another cell

    cleaned = text.strip()
    # Opening fence with an optional language tag, the payload, then the closing fence
    fenced = re.match(r"^```[\w+-]*[ \t]*\r?\n(.*?)\r?\n?```$", cleaned, re.DOTALL)
    if fenced:
        cleaned = fenced.group(1).strip()
    return cleaned


def natural_to_sql(question, database_name="AdventureWorks2019"):
    """
    Convert a natural language question about the AdventureWorks database to SQL using GPT-4.

    The request is sent through the module-level ``openai`` client, so the API
    key must already be configured (e.g. via ``openai.api_key``).

    Parameters:
    question (str): The natural language question to convert to SQL.
    database_name (str): The name of the database to query. Default is "AdventureWorks2019".

    Returns:
    str: The SQL query generated by GPT-4, with surrounding whitespace and any
    Markdown code fence removed. An empty string is returned if the model
    replied with no text content.
    """
    # Create a chat completion request to the OpenAI API using the GPT-4 model.
    # No per-call Client() is built: it was never used and only added overhead.
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[
            # System message to set the context
            {"role": "system", "content": f"You are a SQL assistant for {database_name} database. Your task is to generate SQL queries."},
            # User message containing the natural language question
            {"role": "user", "content": f"Generate only a single SQL query without any explanation or comments to answer the following question: {question}"},
        ],
    )
    # message.content may be None; normalise to "" so the return type stays str
    content = response.choices[0].message.content
    return _strip_code_fence(content or "")


In [22]:
# Example: turn a plain-English question into SQL and show the generated query
question = "What is the best-selling product?"
sql_query = natural_to_sql(question)
print(sql_query)

SELECT TOP 1 
    p.Name AS ProductName, 
    SUM(od.OrderQty) AS TotalQuantitySold

FROM 
    Production.Product p
JOIN 
    Sales.SalesOrderDetail od ON p.ProductID = od.ProductID

GROUP BY 
    p.Name

ORDER BY 
    TotalQuantitySold DESC;


In [23]:
# NOTE: this repeats the execute_query definition from the earlier connection cell.
def execute_query(query, engine):
    """
    Execute an SQL query and return its result set as a DataFrame.

    Failures are not raised: the exception is printed and ``None`` is
    returned, so callers should check the result before using it.
    """
    frame = None
    try:
        frame = pd.read_sql(query, engine)
    except Exception as exc:
        print(f"Error executing query: {exc}")
    return frame


In [24]:
# Run the generated SQL; execute_query gives None when the query could not be read
result = execute_query(sql_query, engine)
if result is not None:
    print("Query Result:\n", result)

Query Result:
     ProductName  TotalQuantitySold
0  AWC Logo Cap               8311
