### GOAL - Query big data using AI Agents without writing big data analytics code. <br>
The AI Agent will use built-in Strands tools and an MCP server to get the job done.

## Environment Setup
Make sure the required dependencies for the notebook (Strands SDK, AWS SDK, and MCP client libraries) are installed. The cell below configures debug logging to the file `strands_debug.log`.

In [None]:
import logging

# Configure the root logger: records at DEBUG level and above are written to
# the file strands_debug.log, each line carrying a short month/day timestamp,
# the logger name and the severity.
logging.basicConfig(
    filename='strands_debug.log',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%m/%d %H:%M:%S',
    level=logging.DEBUG,
)


### Pre-Requisites
You have already run Notebook 1.

### Import Dependencies and AWS Configuration
Import required libraries and configure AWS settings for the data processing workflow.

In [None]:
# Import required libraries
import os, time, boto3, json
from strands import Agent, tool
from strands.models import BedrockModel
from strands_tools import use_aws, file_write, file_read, file_write, sleep, python_repl
from datetime import datetime
from pprint import pprint
from pydantic import BaseModel, Field
from typing import Optional, List

# Runtime switches for unattended execution of the notebook:
#   BYPASS_TOOL_CONSENT     - bypass tool consent so the run is fully automated
#   PYTHON_REPL_INTERACTIVE - if the python_repl tool is used, it should not
#                             wait for user interaction
os.environ.update(
    {
        "BYPASS_TOOL_CONSENT": "true",
        "PYTHON_REPL_INTERACTIVE": "false",
    }
)

## Get Metadata Info from the JSON file that we created in the previous notebook


In [None]:
# Read metadata.json into a dictionary with plain file I/O (without using the
# file_read tool).
# The encoding is given explicitly so decoding does not depend on the
# platform's default text encoding.
with open('metadata.json', 'r', encoding='utf-8') as f:
    db_metadata = json.load(f)
# Bare expression: shows the loaded dictionary as the notebook cell output.
db_metadata

## MCP Client Setup
We will get the tools exposed by an MCP server to discover partition columns and keys from the data in S3.<br><br>
Initialize the AWS Data Processing MCP server client to provide AI agents with AWS Glue, EMR, and Athena capabilities.

In [None]:
# Set up a session
import boto3

##### CHOOSE THE CREDENTIAL OPTION THAT BEST FITS YOUR AWS ENVIRONMENT

# Option 1: Default session
session = boto3.session.Session()

# Option 2: Using an AWS profile defined in ~/.aws/config
#session = boto3.session.Session(profile_name="<YOUR-AWS-PROFILE-NAME>")

# Option 3: Explicit credentials
#session = boto3.session.Session(
#    aws_access_key_id='YOUR_ACCESS_KEY_ID',
#    aws_secret_access_key='YOUR_SECRET_ACCESS_KEY',
#    aws_session_token='YOUR_SESSION_TOKEN',
#)

# Extract session data: the configured region and the account ID that STS
# reports for the caller.
region = session.region_name
sts_client = session.client("sts")
response = sts_client.get_caller_identity()
account_id = response.get("Account")
print("Sesseion = ", region, account_id)

# os.environ only accepts strings, so a missing region would otherwise surface
# as an opaque TypeError on the assignment below.
if region is None:
    raise ValueError(
        "No AWS region is configured for this session; set one in your AWS "
        "profile/config or pass region_name when creating the session."
    )

# Export the session's credentials and region as environment variables.
credentials = session.get_credentials()
os.environ["AWS_ACCESS_KEY_ID"] = credentials.access_key
os.environ["AWS_SECRET_ACCESS_KEY"] = credentials.secret_key
if credentials.token:
    os.environ["AWS_SESSION_TOKEN"] = credentials.token
else:
    # Long-term credentials carry no session token. Assigning None would raise,
    # and a token left over from an earlier run must not be paired with the
    # new key pair.
    os.environ.pop("AWS_SESSION_TOKEN", None)
os.environ["AWS_REGION"] = region

In [None]:
# Import MCP client libraries
from mcp import stdio_client, StdioServerParameters
from strands.tools.mcp import MCPClient

# Reuse the session created in the session-setup cell rather than building a
# second default session, so the MCP server runs with the same credentials and
# region as the rest of the notebook.
credentials = session.get_credentials()


def _mcp_server_env():
    """Return the environment variables handed to the MCP server process.

    Values are read from ``credentials`` and ``session`` each time the client
    starts. Entries whose value is None (for example the session token of
    long-term credentials) are left out, because the mapping must contain
    strings only.
    """
    candidate_env = {
        "AWS_ACCESS_KEY_ID": credentials.access_key,
        "AWS_SECRET_ACCESS_KEY": credentials.secret_key,
        "AWS_SESSION_TOKEN": credentials.token,
        "FASTMCP_LOG_LEVEL": "ERROR",  # Minimize logging noise
        "AWS_REGION": session.region_name,  # Set AWS region
    }
    return {name: value for name, value in candidate_env.items() if value is not None}


# Create MCP client for AWS data processing server
# This provides tools for Glue, EMR, and Athena operations
data_mcp_client = MCPClient(lambda: stdio_client(
    StdioServerParameters(
        command="uvx",  # Use uvx to run the MCP server
        args=[
            "awslabs.aws-dataprocessing-mcp-server@latest",
            "--allow-write",  # Enable write operations
        ],
        env=_mcp_server_env(),
    )
))

### Let's Ask Natural Language Questions to AI Agent

In [None]:
from utils_big_data import print_tokens_costs, load_system_prompt_from_file
from IPython.display import display, Markdown

# Load the system prompt used for running queries on the data in the S3 data
# lake from its file, handing it the metadata dictionary loaded earlier.
query_system_prompt = load_system_prompt_from_file(
    "text_to_sql_prompt.txt",
    db_metadata=db_metadata,
)

# Show the resulting prompt as rendered Markdown in the cell output.
display(Markdown(query_system_prompt))

In [None]:
# Model identifiers kept at hand for experimentation
# (presumably Amazon Bedrock model IDs - TODO confirm).
model_list = [
    'deepseek.v3-v1:0',
    'qwen.qwen3-coder-30b-a3b-v1:0',
    'us.anthropic.claude-3-7-sonnet-20250219-v1:0',
    'us.anthropic.claude-sonnet-4-20250514-v1:0',
    'openai.gpt-oss-20b-1:0',
    'openai.gpt-oss-120b-1:0',
    'us.anthropic.claude-haiku-4-5-20251001-v1:0',
]

# Model selected for the Strands Agent.
# NOTE(review): get_query_response() declares its own default model_id and the
# calls in this notebook do not pass one, so this value only takes effect when
# it is passed explicitly as model_id=model_id - confirm which is intended.
model_id = "us.anthropic.claude-haiku-4-5-20251001-v1:0"

# Reusable helper: process a query and return the response in a structured dictionary format
def get_query_response(query: str, model_id: str = "us.anthropic.claude-sonnet-4-20250514-v1:0") -> dict:
    """Send a natural-language question to a Strands agent and return structured results.

    The agent is built with the notebook's ``query_system_prompt``, a
    ``BedrockModel`` for ``model_id`` and a small tool set: ``python_repl``,
    ``file_read``, ``file_write`` plus the curated MCP tools.

    Args:
        query: The question passed to the agent.
        model_id: Model identifier handed to ``BedrockModel``.

    Returns:
        The ``QueryResponse`` model dumped to a dictionary, with the keys
        ``"sql_queries"`` (a list of dictionaries holding ``"sql_statement"``
        and ``"reasoning"``) and ``"final_response"``.
    """
    import logging

    # The response is returned in a structured format: the SQL statements,
    # the reasoning behind each of them, and the final response.
    class SQLQuery(BaseModel):
        sql_statement: str = Field(description="The SQL query that was generated")
        reasoning: str = Field(description="Step by step explanation of how the natural language question was translated to this SQL statement")

    class QueryResponse(BaseModel):
        sql_queries: List[SQLQuery] = Field(description="List of SQL queries with their reasoning")
        final_response: str = Field(description="The final response generated")

    # Create the Bedrock model using model_id
    model = BedrockModel(model_id=model_id, boto_session=session)

    # Use the MCP client created earlier
    with data_mcp_client:
        # Get the data processing tools from the MCP server
        data_tools = data_mcp_client.list_tools_sync()

        # Pass the agent only the tools it needs instead of everything the
        # server exposes.
        curated_data_tools = {'manage_aws_athena_query_executions'}
        filtered_tools = [
            mcp_tool for mcp_tool in data_tools
            if mcp_tool.tool_name in curated_data_tools
        ]

        found_tool_names = {mcp_tool.tool_name for mcp_tool in filtered_tools}
        missing_tool_names = curated_data_tools - found_tool_names
        if missing_tool_names:
            # Without these tools the agent still runs, so make the gap visible.
            logging.getLogger(__name__).warning(
                "MCP server did not expose the expected tool(s): %s",
                ", ".join(sorted(missing_tool_names)),
            )

        # Add these tools so the agent can generate charts or read / write files if needed.
        final_tools = [python_repl, file_read, file_write] + filtered_tools

        # Pass the system prompt, the Bedrock model and all the tools to the agent
        data_lake_agent = Agent(system_prompt=query_system_prompt, model=model, tools=final_tools)

        # Invoke the agent. The return value is not needed here; the structured
        # output call below is made on the same agent.
        data_lake_agent(query)

        # Convert the agent's response into a structured output
        response = data_lake_agent.structured_output(QueryResponse, "Extract the structured output of sql queries, reasoning, and the final response")

        # Convert the object into a dictionary
        return response.model_dump()

In [None]:
# Ask a monthly-aggregation question and pretty-print the structured result.
response = get_query_response("How many rides went to Airport each month in 2025?")
pprint(response)

In [None]:
# Ask a question that also requests a chart, then pretty-print the structured
# result the same way as the other query cells.
response = get_query_response("How many taxi vendors are there? Plot a bar chart with ride count and fare amount.")
pprint(response)

In [None]:
# Two-part question: a tip-to-fare ratio for the highest fares, followed by a
# recommendation request. The adjacent literals join into one string.
tips_question = (
    "For the top 25 percentile of ride fares per yellow / green class, what is ratio of tips to total fare? "
    "If I am a taxi driver which routes and times should I drive to get the most tips?"
)
response = get_query_response(tips_question)
pprint(response)