In [None]:
%pip -U install boto3 openai ipython



Usage:   
  /opt/homebrew/anaconda3/bin/python -m pip <command> [options]

no such option: -U
Note: you may need to restart the kernel to use updated packages.


In [11]:
import os
import boto3
import json
from openai import OpenAI
from datetime import datetime
from IPython.display import clear_output, display, display_markdown, Markdown


In [None]:
# Bedrock model identifiers for the two Qwen3 variants exercised in this notebook
QWEN_MOE_MODEL_ID = "qwen.qwen3-235b-a22b-2507-v1:0"  # MoE variant (used by the thinking-mode examples)
QWEN_DENSE_MODEL_ID = "qwen.qwen3-32b-v1:0"  # Dense variant

# Echo both selections so the active model IDs are visible in the cell output
for _label, _model_id in (("MoE", QWEN_MOE_MODEL_ID), ("Dense", QWEN_DENSE_MODEL_ID)):
    print(f"✅ Using {_label} model: {_model_id}")


✅ Using model: deepseek.v3-v1:0


In [None]:
# Point the OpenAI SDK at Bedrock through environment variables.
# The region is part of the base URL; edit it to match your preferred region.
# Supported regions:
#   US: us-east-1, us-west-2
#   EU: eu-west-1, eu-west-2, eu-north-1, eu-south-1
#   AP: ap-northeast-1, ap-south-1
#   SA: sa-east-1
os.environ.update({
    # Both key variables carry the same Bedrock API key placeholder; replace it before running
    "AWS_BEARER_TOKEN_BEDROCK": "<insert your bedrock API key>",
    "OPENAI_API_KEY": "<insert your bedrock API key>",
    "OPENAI_BASE_URL": "https://bedrock-runtime.us-west-2.amazonaws.com/openai/v1",
})

print(
    "✅ Environment configured for Bedrock!",
    "📍 Using us-west-2 region - change the URL above to use a different region",
    sep="\n",
)


✅ Environment configured for Bedrock!


In [None]:
# Build the two clients used by the rest of the notebook.
# To target another region, edit region_name on the boto3 client.
# Supported regions:
#   US: us-east-1, us-west-2
#   EU: eu-west-1, eu-west-2, eu-north-1, eu-south-1
#   AP: ap-northeast-1, ap-south-1
#   SA: sa-east-1
client = OpenAI()  # chat completions API; constructed without explicit arguments
bedrock_client = boto3.client(service_name='bedrock-runtime', region_name='us-west-2')

# Report what was created, including the region the boto3 client actually resolved
print("✅ OpenAI client initialized (pointing to Bedrock)")
active_region = bedrock_client.meta.region_name
print(f"✅ Bedrock client initialized in region: {active_region}")
print("📍 Change region_name above to use a different supported region")


✅ OpenAI client initialized (pointing to Bedrock)
✅ Bedrock client initialized in region: us-west-2


In [None]:
# Example 1: single non-streaming request to the Qwen3-32B Dense model (quick response)
dense_request = dict(
    model=QWEN_DENSE_MODEL_ID,
    messages=[
        {"role": "system", "content": "You are a concise, highly logical assistant."},
        {"role": "user",   "content": "What is the largest city in the southern hemisphere?"}
    ],
    temperature=0,
    max_completion_tokens=1000,
)
response = client.chat.completions.create(**dense_request)

# Show the text of the first (only) choice
print("🤖 Qwen3-32B Dense model response:")
reply = response.choices[0].message
print(reply.content)


TypeError: Completions.create() got an unexpected keyword argument 'thinking_mode'

In [None]:
# Example 2: Qwen-3-235B-A22B MoE model with thinking mode (step-by-step reasoning)
response = client.chat.completions.create(
    model=QWEN_MOE_MODEL_ID,
    messages=[
        {"role": "system", "content": "You are a helpful assistant that thinks through problems step by step."},
        {"role": "user",   "content": "If a train leaves station A at 60 mph and another leaves station B at 40 mph, and they are 200 miles apart, when will they meet?"}
    ],
    temperature=0,
    max_completion_tokens=2000,
    # `thinking_mode` is not a named parameter of the OpenAI SDK's create(); passing it
    # as a keyword raises "TypeError: ... unexpected keyword argument 'thinking_mode'".
    # `extra_body` merges additional fields into the JSON request body instead.
    # NOTE(review): confirm the endpoint honors `thinking_mode` for this model.
    extra_body={"thinking_mode": True}  # Thinking mode for complex reasoning (MoE model only)
)

# Extract and print the response text
print("🧠 Qwen-3-235B-A22B MoE model with thinking mode response:")
print(response.choices[0].message.content)


In [None]:
# Example 3: Qwen-3-235B-A22B MoE model without thinking mode (quick response)
response = client.chat.completions.create(
    model=QWEN_MOE_MODEL_ID,
    messages=[
        {"role": "system", "content": "You are a concise, highly logical assistant."},
        {"role": "user",   "content": "What is the capital of France?"}
    ],
    temperature=0,
    max_completion_tokens=1000,
    # create() rejects unknown keywords with TypeError, so the model-specific flag
    # travels in `extra_body`, which is merged into the JSON request body.
    # NOTE(review): confirm the endpoint honors `thinking_mode` for this model.
    extra_body={"thinking_mode": False}  # Non-thinking mode for quick responses
)

# Extract and print the response text
print("⚡ Qwen-3-235B-A22B MoE model without thinking mode response:")
print(response.choices[0].message.content)


In [None]:
# Streaming with Qwen-3-235B-A22B MoE model thinking mode
streaming_response = client.chat.completions.create(
    model=QWEN_MOE_MODEL_ID,
    messages=[
        {"role": "system", "content": "You are a helpful assistant that thinks through problems step by step."},
        {"role": "user",   "content": "Explain how photosynthesis works in simple terms."}
    ],
    temperature=0,
    max_completion_tokens=1500,
    # create() rejects unknown keywords with TypeError, so the model-specific flag
    # travels in `extra_body`, which is merged into the JSON request body.
    # NOTE(review): confirm the endpoint honors `thinking_mode` for this model.
    extra_body={"thinking_mode": True},  # Enable thinking mode
    stream=True
)

# Extract and print the response text in real-time.
print("🧠 Streaming Qwen-3-235B-A22B MoE model with thinking mode response:")
for chunk in streaming_response:
    # A chunk may arrive with an empty `choices` list; skip it rather than index into it
    if not chunk.choices:
        continue
    piece = chunk.choices[0].delta.content
    if piece is not None:
        print(piece, end="")


In [None]:
# Streaming with Qwen3-32B Dense model
streaming_response = client.chat.completions.create(
    model=QWEN_DENSE_MODEL_ID,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user",   "content": "What are the benefits of renewable energy?"}
    ],
    temperature=0,
    max_completion_tokens=1500,
    stream=True
)

# Extract and print the response text in real-time.
print("🤖 Streaming Qwen3-32B Dense model response:")
for chunk in streaming_response:
    # A chunk may arrive with an empty `choices` list; skip it rather than index into it
    if not chunk.choices:
        continue
    piece = chunk.choices[0].delta.content
    if piece is not None:
        print(piece, end="")


In [None]:
def get_weather(location):
    """
    Look up mock weather data for a location.

    No external service is contacted: the readings come from a small
    hard-coded table, and any location missing from that table receives a
    generic placeholder reading.

    Args:
        location (str): City and country, e.g. "Paris, France"

    Returns:
        dict: Weather information with the keys "temperature", "condition"
        and "humidity"
    """
    # Sample readings as (location, temperature, condition, humidity);
    # a real application would call a weather API instead
    sample_rows = (
        ("Paris, France", "22°C", "Partly cloudy", "65%"),
        ("New York, USA", "18°C", "Sunny", "45%"),
        ("Tokyo, Japan", "25°C", "Rainy", "80%"),
        ("London, UK", "15°C", "Overcast", "70%"),
        ("Sydney, Australia", "28°C", "Clear", "55%"),
    )
    readings = {
        place: {"temperature": temperature, "condition": condition, "humidity": humidity}
        for place, temperature, condition, humidity in sample_rows
    }

    if location in readings:
        return readings[location]

    # Placeholder reading for locations the table does not know about
    return {
        "temperature": "20°C",
        "condition": "Data not available",
        "humidity": "50%"
    }

# Function schema for the OpenAI SDK, assembled from the innermost piece outward
_location_property = {
    "type": "string",
    "description": "City and country, e.g. 'Paris, France'"
}

_get_weather_parameters = {
    "type": "object",
    "properties": {"location": _location_property},
    "required": ["location"],
    "additionalProperties": False
}

# Single-entry tool list advertising get_weather to the model
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current temperature and weather conditions for a given location.",
            "parameters": _get_weather_parameters
        }
    }
]

print("✅ Weather function and tools configuration ready!")


In [None]:
def chat_with_functions(client, model, messages, tools, max_iterations=3, thinking_mode=False):
    """
    Chat with function calling support using OpenAI SDK format.

    Repeatedly sends the conversation to the model. Whenever the reply
    contains tool calls, each call is executed locally and its JSON-encoded
    result is appended as a "tool" message before the next request. The loop
    ends as soon as a reply carries no tool calls.

    Note: `messages` is modified in place - every assistant reply and every
    tool result is appended to the list the caller passed in.

    Args:
        client: OpenAI client instance
        model: Model ID to use
        messages: List of conversation messages (mutated in place)
        tools: List of available tools/functions
        max_iterations: Maximum number of function call iterations
        thinking_mode: Whether to use thinking mode for enhanced reasoning (MoE model only)

    Returns:
        The last assistant message received, or None when max_iterations <= 0
        and therefore no request was made.
    """
    # Pre-bind so the final return is defined even if the loop body never runs
    assistant_message = None

    for iteration in range(max_iterations):
        print(f"🔄 Iteration {iteration + 1}")

        # Make request with tools
        request_params = {
            "model": model,
            "messages": messages,
            "tools": tools,
            "tool_choice": "auto"
        }

        # Add thinking_mode only for MoE model. create() rejects unknown keyword
        # arguments with TypeError, so the flag is sent through `extra_body`, which the
        # SDK merges into the JSON request body.
        # NOTE(review): confirm the endpoint honors `thinking_mode` for this model.
        if thinking_mode and "qwen3-235b-a22b" in model:
            request_params["extra_body"] = {"thinking_mode": thinking_mode}

        response = client.chat.completions.create(**request_params)

        assistant_message = response.choices[0].message
        messages.append(assistant_message)

        # No tool calls means the model produced its final answer
        if not assistant_message.tool_calls:
            print("✅ No function calls requested, conversation complete")
            return assistant_message

        print(f"🔧 Model requested {len(assistant_message.tool_calls)} function call(s)")

        # Process each function call
        for tool_call in assistant_message.tool_calls:
            function_name = tool_call.function.name
            print(f"🔧 Calling function: {function_name}")

            try:
                function_args = json.loads(tool_call.function.arguments)
            except (TypeError, ValueError) as exc:
                # Malformed arguments: report the problem back to the model instead of
                # aborting the whole conversation
                function_result = {"error": f"Could not parse arguments for {function_name}: {exc}"}
                print(f"🔧 Function result: {function_result}")
            else:
                print(f"🔧 Arguments: {function_args}")

                if function_name != "get_weather":
                    function_result = {"error": f"Unknown function: {function_name}"}
                elif not isinstance(function_args, dict) or "location" not in function_args:
                    # The schema marks "location" as required, but the model's output
                    # is not guaranteed to comply
                    function_result = {"error": "Missing required argument: location"}
                    print(f"🔧 Function result: {function_result}")
                else:
                    # Call the actual function
                    function_result = get_weather(function_args["location"])
                    print(f"🔧 Function result: {function_result}")

            # Add function result to conversation, linked to the call that requested it
            messages.append({
                "tool_call_id": tool_call.id,
                "role": "tool",
                "content": json.dumps(function_result)
            })

    print("⚠️ Maximum iterations reached")
    return assistant_message

print("✅ Enhanced function calling handler ready!")


In [None]:
# Exercise the function-calling handler against both models
weather_questions = [
    "What's the weather like in Paris today?",
    "Can you tell me the temperature in Tokyo?",
    "How's the weather in Sydney, Australia?",
    "What are the conditions like in New York?"
]

print("🌤️ Testing Enhanced Function Calling with Qwen Models")
print("=" * 60)

# One row per model run:
# (section banner, rule width, questions to ask, use MoE model + thinking mode?, reply label)
test_plan = (
    # Dense model handles the first 2 questions; it doesn't support thinking mode
    ("\n🤖 Testing with Qwen3-32B Dense Model", 40, weather_questions[:2], False, "🤖 Final response:"),
    # MoE model handles the last 2 questions with thinking mode enabled
    ("\n🧠 Testing with Qwen-3-235B-A22B MoE Model (with thinking mode)", 50, weather_questions[2:], True, "🧠 Final response:"),
)

for banner, rule_width, questions, use_moe, reply_label in test_plan:
    print(banner)
    print("-" * rule_width)

    for i, question in enumerate(questions, 1):
        print(f"\n📝 Test {i}: {question}")
        print("-" * 20)

        try:
            # Fresh conversation for every question
            messages = [
                {"role": "system", "content": "You are a helpful weather assistant. Use the get_weather function to provide accurate weather information."},
                {"role": "user", "content": question}
            ]

            # Run the function calling handler with the model selected for this section
            final_response = chat_with_functions(
                client=client,
                model=QWEN_MOE_MODEL_ID if use_moe else QWEN_DENSE_MODEL_ID,
                messages=messages,
                tools=tools,
                thinking_mode=use_moe
            )

            # Print the final response
            print(reply_label)
            print(final_response.content)

        except Exception as e:
            # Report the failure and move on to the next question
            print(f"❌ Error: {str(e)}")

        print()


In [None]:
# Configure region for Bedrock client
# Supported regions:
# US: us-east-1, us-west-2
# EU: eu-west-1, eu-west-2, eu-north-1, eu-south-1
# AP: ap-northeast-1, ap-south-1
# SA: sa-east-1
region = None  # set to e.g. "us-west-2" to pin a region explicitly

# Resolution order: the explicit `region` above, then AWS_REGION, then AWS_DEFAULT_REGION.
# An explicit value always wins. If none is set, None is passed through and boto3 applies
# its own default region resolution.
target_region = region or os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION")

# Build the client from the resolved region so the client and the message below agree
bedrock_runtime = boto3.client('bedrock-runtime', region_name=target_region)

# Report the region the client actually ended up with, not just the requested one
print(f"📍 Using region: {bedrock_runtime.meta.region_name} - change the region variable above to use a different supported region")


In [None]:
def invoke_model(body, model_id, accept, content_type):
    """
    Run a single inference through the module-level `bedrock_runtime` client.

    The request body is serialized to JSON and forwarded to the client's
    `invoke_model` call together with the model ID and the two media-type
    arguments.

    Args:
        body (dict): The invocation body to send to Bedrock
        model_id (str): The model to query
        accept (str): Value forwarded as the `accept` argument
        content_type (str): Value forwarded as the `contentType` argument
    Returns:
        The response object returned by `bedrock_runtime.invoke_model`.
    Raises:
        Exception: Any error raised while serializing or invoking is re-raised
        after a short notice naming the model has been printed.
    """
    try:
        call = bedrock_runtime.invoke_model
        return call(
            body=json.dumps(body),
            modelId=model_id,
            accept=accept,
            contentType=content_type,
        )
    except Exception as err:
        # Name the failing model before handing the error back to the caller
        print(f"Couldn't invoke {model_id}")
        raise err


In [None]:
# InvokeModel example: Qwen-3-235B-A22B MoE model with thinking mode
accept = "application/json"
contentType = "application/json"

messages = [
    {"role": "system", "content": "You are a concise, highly logical assistant."},
    {"role": "user",   "content": "What is the largest city in the southern hemisphere?"}
]

# Request payload: the conversation plus generation settings
body = dict(
    messages=messages,
    temperature=0,
    max_completion_tokens=1000,
    thinking_mode=True,
)

response = invoke_model(body, QWEN_MOE_MODEL_ID, accept, contentType)

# The response body is a stream-like object; read it fully, then decode the JSON
raw_payload = response.get("body").read()
response_body = json.loads(raw_payload)

print("🧠 Thinking mode response:")
first_choice = response_body['choices'][0]
print(first_choice['message']['content'])


In [None]:
# Example with Qwen3-32B Dense model
messages = [
    {"role": "system", "content": "You are a concise, highly logical assistant."},
    {"role": "user",   "content": "What is the largest city in the southern hemisphere?"}
]

# No "thinking_mode" field here: this notebook treats thinking mode as an MoE-only
# option, so the dense request carries only the conversation and generation settings.
body = {
    "messages": messages,
    "temperature": 0,
    "max_completion_tokens": 1000
}

accept = "application/json"
contentType = "application/json"

response = invoke_model(body, QWEN_DENSE_MODEL_ID, accept, contentType)
# The response body is a stream-like object; read it fully, then decode the JSON
response_body = json.loads(response.get("body").read())

print("📝 Response:")
print(response_body['choices'][0]['message']['content'])

In [None]:
# Streaming with Qwen-3-235B-A22B MoE model and thinking mode
messages = [
    {"role": "system", "content": "You are a concise, highly logical assistant."},
    {"role": "user",   "content": "What is the largest city in the southern hemisphere?"}
]

body = {
    "messages": messages,
    "temperature": 0,
    "max_completion_tokens": 1000,
    "thinking_mode": True  # Enable thinking mode for streaming
}

accept = "application/json"
contentType = "application/json"

# Reference point for the time-to-first-token measurement below
start_time = datetime.now()

response = bedrock_runtime.invoke_model_with_response_stream(
    body=json.dumps(body), modelId=QWEN_MOE_MODEL_ID, accept=accept, contentType=contentType
)
chunk_count = 0
time_to_first_token = None

# Process the response stream
stream = response.get("body")
if stream:
    print("🧠 Streaming thinking mode response:")
    for event in stream:
        chunk = event.get("chunk")
        if not chunk:
            continue

        chunk_json = json.loads(chunk.get("bytes").decode())

        # Skip chunks without a usable choice (missing or empty "choices") instead of
        # failing on `None[0]` / an empty list
        choices = chunk_json.get("choices") or []
        if not choices:
            continue

        content_block_delta = (choices[0].get("delta") or {}).get("content")
        if not content_block_delta:
            continue

        # The first non-empty piece of text marks the time to first token
        if time_to_first_token is None:
            time_to_first_token = datetime.now() - start_time
            print(f"Time to first token: {time_to_first_token}")

        chunk_count += 1
        print(content_block_delta, end="")
    print(f"\nTotal chunks: {chunk_count}")
else:
    print("No response stream received.")


In [None]:
# Converse API with Qwen3-32B Dense model (no thinking mode)
dense_converse_request = dict(
    modelId=QWEN_DENSE_MODEL_ID,
    messages=[
        {
            "role": "user",
            "content": [{"text": "What is the capital of Australia?"}]
        }
    ],
    system=[{"text": "You are a concise, highly logical assistant."}],
    inferenceConfig={
        "temperature": 0,
        "maxTokens": 1000
    },
    # Note: No additionalModelRequestFields needed for dense model
)
response = bedrock_client.converse(**dense_converse_request)

# Final response: the text of the first content block
# (the dense model doesn't have a reasoning trace)
print("🤖 Dense model response:")
reply_blocks = response['output']['message']['content']
print(reply_blocks[0]['text'])


In [None]:
# Converse API with Qwen-3-235B-A22B MoE model and thinking mode
response = bedrock_client.converse(
    modelId=QWEN_MOE_MODEL_ID,
    messages=[
        {
            "role": "user",
            "content": [{"text": "How far from earth is the moon?"}]
        }
    ],
    system=[{"text": "You are a concise, highly logical assistant."}],
    inferenceConfig={
        "temperature": 0,
        "maxTokens": 1000
    },
    additionalModelRequestFields={
        "thinking_mode": True  # Enable thinking mode for MoE model
    }
)

content_blocks = response['output']['message']['content']

# Message dict
print("📝 Message dict:")
print(content_blocks)

# Reasoning trace (if available). Blocks are selected by the key they carry rather
# than by position, so a reply without a reasoning block is handled too.
reasoning_parts = [
    block['reasoningContent'].get('reasoningText', {}).get('text', '')
    for block in content_blocks
    if 'reasoningContent' in block
]
if any(reasoning_parts):
    print("📝 Reasoning trace:")
    print("\n".join(part for part in reasoning_parts if part))

# Final response: every text block, in order. Indexing content[1] directly would
# raise IndexError whenever no reasoning block precedes the text.
print("📝 Final response:")
print("".join(block['text'] for block in content_blocks if 'text' in block))

In [None]:
# Streaming through Converse API with Qwen3-32B Dense model 
def bedrock_model_converse_stream_dense(client, system_prompt, user_prompt, max_tokens=1000, temperature=0):
    """
    Stream a single-turn reply from the dense model and print it as it arrives.

    Args:
        client: Object exposing `converse_stream` (the notebook passes its boto3 client)
        system_prompt (str): Text sent as the system prompt
        user_prompt (str): Text sent as the only user message
        max_tokens (int): Forwarded as inferenceConfig "maxTokens"
        temperature: Forwarded as inferenceConfig "temperature"

    Returns:
        None. The generated text is written to stdout without a trailing newline.
    """
    user_turn = {"role": "user", "content": [{"text": user_prompt}]}

    stream_response = client.converse_stream(
        modelId=QWEN_DENSE_MODEL_ID,
        messages=[user_turn],
        system=[{"text": system_prompt}],
        inferenceConfig={"temperature": temperature, "maxTokens": max_tokens},
        # Note: No additionalModelRequestFields needed for dense model
    )

    # Print text deltas in real time; every other event type is ignored
    for event in stream_response['stream']:
        if 'contentBlockDelta' not in event:
            continue
        text_piece = event['contentBlockDelta']['delta'].get('text')
        if text_piece:
            print(text_piece, end="")


In [None]:
# Streaming through Converse API with Qwen-3-235B-A22B MoE model and thinking mode

def bedrock_model_converse_stream_moe(client, system_prompt, user_prompt, max_tokens=1000, temperature=0, thinking_mode=True):
    """
    Stream a single-turn reply from the MoE model and print it as it arrives.

    Both reasoning text and answer text are written to stdout in the order
    the deltas are received.

    Args:
        client: Object exposing `converse_stream` (the notebook passes its boto3 client)
        system_prompt (str): Text sent as the system prompt
        user_prompt (str): Text sent as the only user message
        max_tokens (int): Forwarded as inferenceConfig "maxTokens"
        temperature: Forwarded as inferenceConfig "temperature"
        thinking_mode (bool): Forwarded as additionalModelRequestFields "thinking_mode"

    Returns:
        None. The generated text is written to stdout without a trailing newline.
    """
    response = client.converse_stream(
        modelId=QWEN_MOE_MODEL_ID,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "text": user_prompt
                    }
                ]
            },
        ],
        system=[{"text": system_prompt}],
        inferenceConfig={
            "temperature": temperature,
            "maxTokens": max_tokens
        },
        additionalModelRequestFields={
            "thinking_mode": thinking_mode  # Enable thinking mode for Qwen-3-235B-A22B MoE model
        }
    )
    # Extract and print the response text in real-time.
    for event in response['stream']:
        if 'contentBlockDelta' not in event:
            continue
        delta = event['contentBlockDelta']['delta']

        # A reasoning delta is not guaranteed to carry a "text" field, so read it
        # defensively instead of indexing and risking KeyError
        reasoning_text = (delta.get('reasoningContent') or {}).get('text')
        if reasoning_text:
            print(reasoning_text, end="")

        answer_text = delta.get('text')
        if answer_text:
            print(answer_text, end="")
    return


In [None]:
# Example usage of streaming functions

print("\n\n🤖 Streaming with Qwen3-32B Dense model:")
bedrock_model_converse_stream_dense(
    client=bedrock_client,
    system_prompt="You are a helpful assistant.",
    user_prompt="What are the benefits of renewable energy?"
)

# The stream helpers print with end="" and leave the cursor on the last line of output,
# so this header needs the same leading blank lines as the first one to start on a line
# of its own.
print("\n\n🧠 Streaming with Qwen-3-235B-A22B MoE model (thinking mode):")
bedrock_model_converse_stream_moe(
    client=bedrock_client,
    system_prompt="You are a helpful assistant that thinks through problems step by step.",
    user_prompt="Explain how a computer works in simple terms.",
    thinking_mode=True
)
