In [None]:
def main():
    """Export Phi-3 Mini to ONNX with a hand-written Optimum export configuration.

    Declares a normalized-config class and an ``OnnxConfig`` subclass locally,
    loads ``microsoft/phi-3-mini-4k-instruct`` (config, weights and tokenizer)
    with ``trust_remote_code=True``, writes ``./phi3_mini_onnx/model.onnx`` via
    ``optimum.exporters.onnx.export`` and saves the tokenizer and config beside it.
    """
    from pathlib import Path

    from optimum.exporters.onnx import OnnxConfig, export
    from optimum.exporters.onnx.model_configs import NormalizedConfig
    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

    def token_axes():
        # A fresh mapping per call so no two tensor specs share one dict.
        return {0: "batch_size", 1: "sequence_length"}

    class Phi3NormalizedConfig(NormalizedConfig):
        """Normalized view of a Phi3 causal-LM config exposing a handful of fields."""

        def __init__(self, config):
            super().__init__(config)
            # (private attribute, field read from `config`, fallback when absent)
            mirrored_fields = (
                ("_num_layers", "num_hidden_layers", None),
                ("_hidden_size", "hidden_size", None),
                ("_num_attention_heads", "num_attention_heads", None),
                ("_model_type", "model_type", "phi3"),
            )
            for private_name, config_field, fallback in mirrored_fields:
                setattr(self, private_name, getattr(config, config_field, fallback))

        @property
        def num_layers(self) -> int:
            return self._num_layers

        @property
        def hidden_size(self) -> int:
            return self._hidden_size

        @property
        def num_attention_heads(self) -> int:
            return self._num_attention_heads

        @property
        def model_type(self) -> str:
            return self._model_type

    class Phi3CustomOnnxConfig(OnnxConfig):
        """Export config declaring the input/output tensors of a Phi3 causal LM."""

        NORMALIZED_CONFIG_CLASS = Phi3NormalizedConfig

        @property
        def inputs(self):
            # Both inputs get symbolic names for their first two axes.
            return {
                "input_ids": token_axes(),
                "attention_mask": token_axes(),
            }

        @property
        def outputs(self):
            # Only the first two axes of `logits` are given symbolic names.
            return {"logits": token_axes()}

        @property
        def default_onnx_opset_version(self) -> int:
            return 15

        def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
            """Return fixed-shape (1, 4) int64 tensors used to trace the model.

            ``framework`` and ``kwargs`` are accepted but not read here.
            """
            import torch

            shape = (1, 4)  # (batch, sequence)
            return {
                "input_ids": torch.randint(low=0, high=1000, size=shape, dtype=torch.long),
                "attention_mask": torch.ones(shape, dtype=torch.long),
            }

    model_id = "microsoft/phi-3-mini-4k-instruct"
    target_dir = Path("./phi3_mini_onnx")
    target_dir.mkdir(parents=True, exist_ok=True)

    # Load config, weights and tokenizer; the repo's custom code is trusted.
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # Export with the custom config; the opset matches default_onnx_opset_version.
    export(
        model=model,
        config=Phi3CustomOnnxConfig(config),
        output=target_dir / "model.onnx",
        opset=15,
    )

    # Keep tokenizer and config next to the exported graph.
    tokenizer.save_pretrained(target_dir)
    config.save_pretrained(target_dir)
    print("Done exporting Phi-3 Mini to ONNX.")


if __name__ == "__main__":
    main()


In [None]:
def main():
    """Dynamically quantize the ONNX model in ./phi3_mini_onnx.

    Uses Optimum's ``ORTQuantizer`` with the ``arm64`` preset (dynamic,
    per-channel) and writes the result to ``./phi3_mini_quantized`` using the
    external data format.
    """
    from pathlib import Path

    from optimum.onnxruntime import ORTQuantizer
    from optimum.onnxruntime.configuration import AutoQuantizationConfig

    source_dir = Path("./phi3_mini_onnx")
    target_dir = Path("./phi3_mini_quantized")
    target_dir.mkdir(parents=True, exist_ok=True)

    quantizer = ORTQuantizer.from_pretrained(source_dir)

    # is_static=False selects dynamic quantization; per_channel=True is also set.
    settings = AutoQuantizationConfig.arm64(is_static=False, per_channel=True)

    # The key setting here is use_external_data_format=True.
    quantizer.quantize(
        quantization_config=settings,
        save_dir=target_dir,
        use_external_data_format=True,
    )

    print("Quantized model saved to:", target_dir)


if __name__ == "__main__":
    main()

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

# Load the original Phi-3 Mini checkpoint and its tokenizer by model id
model_id = "microsoft/phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # Use half precision for efficiency
    device_map="auto"  # Automatically choose best device (CPU/GPU)
)
# Load the quantized ONNX model from ./phi3_mini_quantized for comparison;
# both use_io_binding and use_cache are disabled for this session.
model_quantized = ORTModelForCausalLM.from_pretrained("./phi3_mini_quantized", use_io_binding=False, use_cache=False)

# Test basic inference
def generate_response(model, prompt, max_length=512):
    """Sample a completion for ``prompt`` from ``model`` and return only the new text.

    Relies on the module-level ``tokenizer``.

    Args:
        model: Object exposing ``.device`` and ``.generate`` (both the PyTorch
            model and the ONNX Runtime model are passed in by this notebook).
        prompt: Text to complete.
        max_length: Upper bound passed to ``generate``; it counts prompt tokens
            as well as generated ones.

    Returns:
        The decoded completion, stripped of surrounding whitespace.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[-1]

    # Pass the attention mask explicitly instead of letting generate() guess it.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

    # The returned sequence starts with the prompt tokens; slice them off by
    # position rather than string-replacing the prompt, which breaks when the
    # decoded text does not reproduce the prompt exactly and also deletes
    # legitimate repetitions of the prompt inside the answer.
    completion_ids = outputs[0][prompt_token_count:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Test with different prompts
# Prompts used to smoke-test both models
test_prompts = [
    "Hello, who are you?",
    "Can you help me find a good restaurant?",
    "What's the capital of France?",
]

# Run every prompt through the original model first, then the quantized one.
for candidate in (model, model_quantized):
    for prompt in test_prompts:
        response = generate_response(candidate, prompt)
        print(f"Prompt: {prompt}\nResponse: {response}\n")

    

import time
import psutil
import matplotlib.pyplot as plt

def benchmark_inference(model, prompt, runs=5):
    """Time ``generate_response(model, prompt)`` and track resident-memory change.

    Args:
        model: Model forwarded to ``generate_response``.
        prompt: Prompt forwarded to ``generate_response``.
        runs: Number of measured generations; must be positive.

    Returns:
        Dict with ``avg_inference_time`` (seconds) and ``avg_memory_usage``
        (MB; the mean of RSS-after minus RSS-before per run, not a peak).

    Raises:
        ValueError: If ``runs`` is not positive (previously a ZeroDivisionError).
    """
    if runs <= 0:
        raise ValueError("runs must be a positive integer")

    process = psutil.Process()
    bytes_per_mb = 1024 * 1024
    inference_times = []
    memory_usage = []

    for _ in range(runs):
        mem_before = process.memory_info().rss / bytes_per_mb

        # perf_counter is monotonic, unlike time.time(), which can jump when
        # the system clock is adjusted mid-measurement.
        started = time.perf_counter()
        generate_response(model, prompt)
        elapsed = time.perf_counter() - started

        mem_after = process.memory_info().rss / bytes_per_mb

        inference_times.append(elapsed)
        memory_usage.append(mem_after - mem_before)

    return {
        "avg_inference_time": sum(inference_times) / runs,
        "avg_memory_usage": sum(memory_usage) / runs,
    }

# Benchmark the original model, then the quantized one, with the same prompt
for benchmarked_model in (model, model_quantized):
    results = benchmark_inference(benchmarked_model, "Tell me about yourself and what you can do.")
    print(f"Average inference time: {results['avg_inference_time']:.2f} seconds")
    print(f"Average memory usage: {results['avg_memory_usage']:.2f} MB")

----------------------------------------------------

# Developing an On-Device AI Chatbot Powered by Phi-3 Mini: A Complete Guide

Building an on-device AI chatbot with Phi-3 mini requires careful planning from model optimization to deployment. I'll walk you through the entire process of creating a privacy-focused, personalized AI companion that can run efficiently on mobile devices.


## Phase 1: Model Testing and Exploration


### Step 1: Set Up Your Development Environment


In [None]:
%%bash
# Create a virtual environment in ./phi3_env and activate it for this shell
# NOTE(review): presumably each %%bash cell runs in its own shell, so the
# activation would not carry over to later cells — confirm.
python -m venv phi3_env
source phi3_env/bin/activate  # On Windows: phi3_env\Scripts\activate

# Install the model, data, notebook and plotting packages, then the
# TensorBoard / Optimum / ONNX Runtime tooling
pip install torch transformers datasets evaluate accelerate jupyter matplotlib numpy pandas
pip install tensorboard optimum onnxruntime

In [None]:
%%bash
# Diagnose the GPU toolchain, then (re)install a CUDA 11.8 build of PyTorch.
# These commands can also be run directly in a terminal.

# Verify the NVIDIA driver, the CUDA compiler, glibc and the GLIBCXX versions
# exported by the two libstdc++ copies
nvidia-smi
nvcc --version
ldd --version
strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep GLIBCXX
strings /usr/local/libstdcxx/lib64/libstdc++.so.6 | grep GLIBCXX

# Report the installed torch build. The Python statements are fed to the
# interpreter through a here-document; as bare lines in a bash cell they would
# be executed as shell commands instead of Python.
python - <<'PY'
import torch
print(torch.__version__, torch.version.cuda, torch.cuda.is_available())
PY

# Install the CUDA 11.8 wheels.
# NOTE(review): both installs run as written, and the pinned one below
# replaces whatever the first one installed — keep only the one you need.
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install torch==2.0.1+cu118 \
            torchvision==0.15.2+cu118 \
            torchaudio==2.0.1+cu118 \
            --extra-index-url https://download.pytorch.org/whl/cu118


### Step 2: Initial Model Testing in Jupyter Notebook


Create a new notebook to explore Phi-3 mini capabilities:


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the Phi-3 Mini tokenizer and model by model id
model_id = "microsoft/phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # Use half precision for efficiency
    device_map="auto"  # Automatically choose best device (CPU/GPU)
)

# Test basic inference
def generate_response(prompt, max_length=512):
    """Sample a completion for ``prompt`` and return only the newly generated text.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: Text to complete.
        max_length: Upper bound passed to ``generate``; it counts prompt tokens
            as well as generated ones.

    Returns:
        The decoded completion, stripped of surrounding whitespace.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[-1]

    # Pass the attention mask explicitly instead of letting generate() guess it.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

    # The returned sequence starts with the prompt tokens; slice them off by
    # position rather than string-replacing the prompt, which breaks when the
    # decoded text does not reproduce the prompt exactly and also deletes
    # legitimate repetitions of the prompt inside the answer.
    completion_ids = outputs[0][prompt_token_count:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Test with different prompts
# Prompts used to smoke-test the model
test_prompts = [
    "Hello, who are you?",
    "Can you help me find a good restaurant?",
    "What's the capital of France?"
]

# Generate and print one sampled response per prompt
for prompt in test_prompts:
    response = generate_response(prompt)
    print(f"Prompt: {prompt}\nResponse: {response}\n")

### Step 3: Benchmark Model Performance


In [None]:
import time
import psutil
import matplotlib.pyplot as plt

def benchmark_inference(prompt, runs=5):
    """Time ``generate_response(prompt)`` and track resident-memory change.

    Args:
        prompt: Prompt forwarded to ``generate_response``.
        runs: Number of measured generations; must be positive.

    Returns:
        Dict with ``avg_inference_time`` (seconds) and ``avg_memory_usage``
        (MB; the mean of RSS-after minus RSS-before per run, not a peak).

    Raises:
        ValueError: If ``runs`` is not positive (previously a ZeroDivisionError).
    """
    if runs <= 0:
        raise ValueError("runs must be a positive integer")

    process = psutil.Process()
    bytes_per_mb = 1024 * 1024
    inference_times = []
    memory_usage = []

    for _ in range(runs):
        mem_before = process.memory_info().rss / bytes_per_mb

        # perf_counter is monotonic, unlike time.time(), which can jump when
        # the system clock is adjusted mid-measurement.
        started = time.perf_counter()
        generate_response(prompt)
        elapsed = time.perf_counter() - started

        mem_after = process.memory_info().rss / bytes_per_mb

        inference_times.append(elapsed)
        memory_usage.append(mem_after - mem_before)

    return {
        "avg_inference_time": sum(inference_times) / runs,
        "avg_memory_usage": sum(memory_usage) / runs,
    }

# Benchmark the model with a single prompt and report the averages
results = benchmark_inference("Tell me about yourself and what you can do.")
print(f"Average inference time: {results['avg_inference_time']:.2f} seconds")
print(f"Average memory usage: {results['avg_memory_usage']:.2f} MB")

## Phase 2: Model Optimization for Mobile Deployment


### Step 4: Quantize the Model


In [None]:
from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Step 1: export the checkpoint to ONNX.
# `model_id` is not defined in this cell; it must already exist in the session.
onnx_model_path = "./phi3_mini_onnx"
ort_model = ORTModelForCausalLM.from_pretrained(
    model_id,
    export=True,
    provider="CPUExecutionProvider",
)
ort_model.save_pretrained(onnx_model_path)

# Step 2: quantize.
# Dynamic quantization (is_static=False) is used because no calibration data
# is prepared anywhere in this cell, which static quantization would require.
# NOTE(review): if the export produced more than one .onnx file, the file to
# quantize may need to be named explicitly via `file_name` — confirm.
quantizer = ORTQuantizer.from_pretrained(onnx_model_path)
qconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=False)
quantizer.quantize(
    quantization_config=qconfig,
    save_dir="./phi3_mini_quantized",
    # Store the weights outside the .onnx protobuf; the standalone quantization
    # script in this file identifies this flag as the key fix.
    use_external_data_format=True,
)

# Step 3: reload the quantized model and the tokenizer for benchmarking
model_quantized = ORTModelForCausalLM.from_pretrained("./phi3_mini_quantized")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Test inference speed and compare to original model

### Step 5: Optimize Model Architecture


In [None]:
# Pruning example - removing less important weights
from transformers import AutoModelForCausalLM
# prune_linear_layer is provided by transformers.pytorch_utils; the
# `transformers.pruning_utils` path used previously is not an importable module.
from transformers.pytorch_utils import prune_linear_layer
import torch

def prune_model(model, pruning_threshold=0.1):
    """Zero every ``torch.nn.Linear`` weight whose magnitude is below the threshold.

    The model is modified in place and also returned. Biases and non-Linear
    modules are left untouched. Tensor shapes do not change: small weights are
    overwritten with zeros, not removed.

    Args:
        model: Module whose submodules are scanned.
        pruning_threshold: Weights with ``abs(w) < pruning_threshold`` are zeroed.

    Returns:
        The same ``model`` object.
    """
    linear_layers = (
        layer for layer in model.modules() if isinstance(layer, torch.nn.Linear)
    )
    for layer in linear_layers:
        below_threshold = layer.weight.abs() < pruning_threshold
        # Write through .data so autograd does not record the overwrite.
        layer.weight.data.masked_fill_(below_threshold, 0)

    return model

# Apply pruning with a 0.05 threshold; prune_model edits `model` in place and
# returns it, so `pruned_model` and `model` refer to the same object
pruned_model = prune_model(model, pruning_threshold=0.05)

# Save the pruned model to ./phi3_mini_pruned
pruned_model.save_pretrained("./phi3_mini_pruned")

### Step 6: Export for Mobile


In [None]:
# For iOS (Core ML)
import coremltools as ct

# Convert to Core ML format and save it as PhiMiniModel.mlpackage
# NOTE(review): the first argument is the quantized-model directory name, not a
# path to a model file, and source="onnx" may not be accepted by the installed
# coremltools version — verify against the coremltools documentation.
mlmodel = ct.convert(
    "phi3_mini_quantized",
    source="onnx",
    minimum_deployment_target=ct.target.iOS15
)
mlmodel.save("PhiMiniModel.mlpackage")

## Phase 3: Backend Development


### Step 7: Create Project Structure


In [None]:
%%bash
# Create the project root with its api, models, utils, config and tests
# sub-directories (brace expansion), then enter the project root
mkdir -p phi3_chatbot/{api,models,utils,config,tests}
cd phi3_chatbot

### Step 8: Implement Basic API with FastAPI


Create a `main.py` file in the api directory:


In [None]:
import datetime
import uuid
from typing import List, Optional

from fastapi import Depends, FastAPI, HTTPException
from pydantic import BaseModel, Field

app = FastAPI(title="Phi-3 Mini Chatbot API")

# --- Data Models ---
class Message(BaseModel):
    """One chat message with its author role, text and creation time."""

    role: str  # 'user' or 'assistant'
    content: str
    # default_factory runs for every new instance. A plain
    # `= datetime.datetime.now()` default is evaluated once, when the class is
    # defined, so every message would carry the server start-up time.
    timestamp: datetime.datetime = Field(default_factory=datetime.datetime.now)

class Conversation(BaseModel):
    """A conversation: unique id, ordered messages and created/updated times."""

    # Every default below is produced per instance. With plain defaults the
    # uuid and the timestamps were computed once at class definition, so every
    # `Conversation()` shared the same id and overwrote the previous one in the
    # store.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    messages: List[Message] = Field(default_factory=list)
    created_at: datetime.datetime = Field(default_factory=datetime.datetime.now)
    updated_at: datetime.datetime = Field(default_factory=datetime.datetime.now)

class ChatRequest(BaseModel):
    """Request body of the /chat endpoint."""
    # Id of the conversation to continue; omitted or None starts a new one
    conversation_id: Optional[str] = None
    # The user's message text
    message: str
    # Optional replacement for the endpoint's default system prompt
    system_prompt: Optional[str] = None

class ChatResponse(BaseModel):
    """Response body of the /chat endpoint."""
    # Id of the conversation the reply belongs to
    conversation_id: str
    # The assistant's generated reply text
    response: str
    
# In-memory store for conversations (replace with proper DB in production)
conversations = {}

# --- Model Initialization ---
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "../phi3_mini_quantized"  # Path to your quantized model
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
# The quantized directory holds an ONNX model, so it is loaded with the ONNX
# Runtime wrapper (the same class the quantization step uses to reload it)
# instead of AutoModelForCausalLM, which looks for PyTorch weights.
# NOTE(review): if the model was exported without past-key-value inputs,
# `use_cache=False, use_io_binding=False` may be required — confirm.
model = ORTModelForCausalLM.from_pretrained(model_path)

# --- API Endpoints ---
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """Append the user's message to a conversation and return a generated reply.

    An unknown or missing ``conversation_id`` starts a new conversation. The
    prompt is the system prompt followed by the last five messages formatted as
    ``Role: content`` lines and a trailing ``Assistant: `` cue.

    Returns:
        ChatResponse carrying the conversation id and the reply text.
    """
    # Get or create conversation
    conversation = (
        conversations.get(request.conversation_id) if request.conversation_id else None
    )
    if conversation is None:
        conversation = Conversation()
        conversations[conversation.id] = conversation

    # Add user message
    conversation.messages.append(Message(role="user", content=request.message))

    # Prepare prompt with conversation history (only the last 5 messages)
    system_prompt = request.system_prompt or "You are a helpful AI assistant running on-device."
    history_lines = [
        f"{msg.role.capitalize()}: {msg.content}\n" for msg in conversation.messages[-5:]
    ]
    prompt = f"{system_prompt}\n\n" + "".join(history_lines) + "Assistant: "

    # Generate response.
    # NOTE: generate() is a synchronous call made directly inside this
    # coroutine, so the event loop is busy for the whole generation.
    inputs = tokenizer(prompt, return_tensors="pt")
    prompt_token_count = inputs["input_ids"].shape[-1]
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Budget the reply only: `max_length` also counts prompt tokens, so a
        # long history could leave no room for any generated text.
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    # Drop the echoed prompt by token position; string-replacing the prompt
    # fails whenever the decoded text does not reproduce it exactly.
    response_text = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    # Add assistant message to conversation
    conversation.messages.append(Message(role="assistant", content=response_text))
    conversation.updated_at = datetime.datetime.now()

    return ChatResponse(
        conversation_id=conversation.id,
        response=response_text,
    )

# When executed as a script, serve the app with uvicorn on all interfaces
# (0.0.0.0) at port 8000
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

### Step 9: Implement Conversation Memory


Create a `memory.py` file in the utils directory:


In [None]:
from typing import List, Dict, Any
import datetime

class ConversationMemory:
    """In-memory store of chat conversations keyed by conversation id.

    Each conversation is a dict with ``messages`` (a list of dicts holding
    ``role``, ``content`` and ``timestamp``), ``created_at`` and a free-form
    ``metadata`` dict. Only the newest ``max_history`` messages are retained.
    """

    def __init__(self, max_history=10):
        self.max_history = max_history
        self.conversations = {}

    def add_message(self, conversation_id, role, content):
        """Append a message, creating the conversation on first use."""
        conversation = self.conversations.get(conversation_id)
        if conversation is None:
            conversation = {
                "messages": [],
                "created_at": datetime.datetime.now(),
                "metadata": {},
            }
            self.conversations[conversation_id] = conversation

        messages = conversation["messages"]
        messages.append({
            "role": role,
            "content": content,
            "timestamp": datetime.datetime.now(),
        })

        # Drop the oldest entries beyond the cap. Computing the overflow
        # explicitly also handles max_history == 0, where the slice
        # `messages[-0:]` would have kept everything.
        overflow = len(messages) - self.max_history
        if overflow > 0:
            del messages[:overflow]

    def get_conversation(self, conversation_id):
        """Return the stored conversation dict, or None when the id is unknown."""
        return self.conversations.get(conversation_id)

    def get_messages(self, conversation_id, limit=None):
        """Return a new list with the conversation's messages, oldest first.

        Args:
            conversation_id: Conversation to read.
            limit: ``None`` returns all messages; a positive number returns the
                newest ``limit`` messages; zero or a negative number returns
                an empty list.

        Returns:
            A list the caller may modify freely; empty for an unknown id.
        """
        conversation = self.conversations.get(conversation_id)
        if conversation is None:
            return []

        messages = conversation["messages"]
        if limit is None:
            # Copy so callers cannot mutate the stored history by accident.
            return list(messages)
        if limit <= 0:
            return []
        return messages[-limit:]

    def store_metadata(self, conversation_id, key, value):
        """Set ``metadata[key] = value``; return False when the id is unknown."""
        conversation = self.conversations.get(conversation_id)
        if conversation is None:
            return False

        conversation["metadata"][key] = value
        return True

    def get_metadata(self, conversation_id, key=None):
        """Return one metadata value, or the whole metadata dict when key is None.

        Returns None for an unknown conversation or a missing key.
        """
        conversation = self.conversations.get(conversation_id)
        if conversation is None:
            return None

        metadata = conversation["metadata"]
        # `is None`, not truthiness, so falsy keys such as "" or 0 are looked up.
        if key is None:
            return metadata
        return metadata.get(key)

## Phase 4: Cloud Infrastructure Setup


### Step 10: Set Up Cloud Resources on GCP


In [None]:
%%bash
# Install Google Cloud SDK
# https://cloud.google.com/sdk/docs/install

# Initialize GCP
gcloud init

# Create a new project with the id phi3-chatbot-app
gcloud projects create phi3-chatbot-app

# Set the project as active
gcloud config set project phi3-chatbot-app

# Enable the Artifact Registry, Cloud Run and Firestore APIs
gcloud services enable artifactregistry.googleapis.com
gcloud services enable run.googleapis.com
gcloud services enable firestore.googleapis.com

# Create a Docker repository in Artifact Registry (us-central1)
gcloud artifacts repositories create phi3-chatbot-repo \
    --repository-format=docker \
    --location=us-central1 \
    --description="Docker repository for Phi-3 Chatbot"

# Create a Firestore database
# NOTE(review): current gcloud releases appear to expect --location rather than
# --region for this command — confirm against the installed gcloud version.
gcloud firestore databases create --region=us-central1

### Step 11: Configure Cloud Storage for Models


In [None]:
%%bash
# Create a Cloud Storage bucket for the model files in us-central1 with
# uniform bucket-level access
gcloud storage buckets create gs://phi3-chatbot-models \
    --location=us-central1 \
    --uniform-bucket-level-access

# Upload everything matched by ./phi3_mini_quantized/* to the
# phi3_mini_quantized/ prefix of the bucket
gcloud storage cp ./phi3_mini_quantized/* gs://phi3-chatbot-models/phi3_mini_quantized/

### Step 12: Set Up Containerization


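Create a `Dockerfile` for the API service (its contents are not included in this guide):

Create a `requirements.txt` file listing the API's Python dependencies (its contents are not included in this guide):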


## Phase 5: Mobile App Development


### Step 13: Create a Simple React Native Frontend


In [None]:
%%bash
# Install React Native CLI globally
# NOTE(review): a globally installed react-native-cli may conflict with the
# `npx react-native init` call below — confirm against the React Native docs.
npm install -g react-native-cli

# Create a new React Native project named Phi3ChatbotApp
npx react-native init Phi3ChatbotApp

# Navigate to the project directory
cd Phi3ChatbotApp

# Install navigation, gesture/safe-area, HTTP client, chat UI and icon packages
npm install @react-navigation/native @react-navigation/stack
npm install react-native-gesture-handler react-native-safe-area-context
npm install axios react-native-gifted-chat react-native-vector-icons

### Step 14: Implement Chat Interface


Create a `ChatScreen.js` file:


In [None]:
import React, { useState, useCallback, useEffect } from 'react';
import { GiftedChat } from 'react-native-gifted-chat';
import { ActivityIndicator, View, Text, StyleSheet } from 'react-native';
import axios from 'axios';

const API_URL = 'https://your-api-url.com'; // Replace with your actual API URL

const ChatScreen = () => {
  const [messages, setMessages] = useState([]);
  const [conversationId, setConversationId] = useState(null);
  const [loading, setLoading] = useState(false);

  useEffect(() => {
    // Initialize with a welcome message
    setMessages([
      {
        _id: 1,
        text: 'Hello! I\'m your Phi-3 powered AI assistant. How can I help you today?',
        createdAt: new Date(),
        user: {
          _id: 2,
          name: 'AI Assistant',
          avatar: 'https://placeimg.com/140/140/tech',
        },
      },
    ]);
  }, []);

  const onSend = useCallback((newMessages = []) => {
    setMessages(previousMessages => 
      GiftedChat.append(previousMessages, newMessages)
    );
    
    // Send message to API
    const userMessage = newMessages[0].text;
    setLoading(true);
    
    axios.post(`${API_URL}/chat`, {
      conversation_id: conversationId,
      message: userMessage,
    })
    .then(response => {
      // Save conversation ID for future messages
      if (!conversationId) {
        setConversationId(response.data.conversation_id);
      }
      
      // Add AI response to chat
      const aiMessage = {
        _id: Math.round(Math.random() * 1000000),
        text: response.data.response,
        createdAt: new Date(),
        user: {
          _id: 2,
          name: 'AI Assistant',
          avatar: 'https://placeimg.com/140/140/tech',
        },
      };
      
      setMessages(previousMessages => 
        GiftedChat.append(previousMessages, [aiMessage])
      );
    })
    .catch(error => {
      console.error('Error sending message:', error);
      
      // Show error message
      const errorMessage = {
        _id: Math.round(Math.random() * 1000000),
        text: 'Sorry, I encountered an error. Please try again.',
        createdAt: new Date(),
        user: {
          _id: 2,
          name: 'AI Assistant',
          avatar: 'https://placeimg.com/140/140/tech',
        },
      };
      
      setMessages(previousMessages => 
        GiftedChat.append(previousMessages, [errorMessage])
      );
    })
    .finally(() => {
      setLoading(false);
    });
  }, [conversationId]);

  return (
    <View style={styles.container}>
      {loading && (
        <View style={styles.loadingContainer}>
          <ActivityIndicator size="large" color="#0000ff" />
          <Text style={styles.loadingText}>Thinking...</Text>
        </View>
      )}
      <GiftedChat
        messages={messages}
        onSend={messages => onSend(messages)}
        user={{
          _id: 1,
        }}
        renderAvatar={null}
        alwaysShowSend
        scrollToBottom
      />
    </View>
  );
};

// Styles for ChatScreen.
const styles = StyleSheet.create({
  // Root view: fills the screen with a light grey background
  container: {
    flex: 1,
    backgroundColor: '#f5f5f5',
  },
  // "Thinking..." banner: absolutely positioned near the top, above the chat
  // (zIndex 1), with its spinner and label laid out in a centered row
  loadingContainer: {
    position: 'absolute',
    top: 10,
    left: 0,
    right: 0,
    zIndex: 1,
    alignItems: 'center',
    justifyContent: 'center',
    flexDirection: 'row',
    backgroundColor: 'rgba(255, 255, 255, 0.8)',
    padding: 10,
    borderRadius: 20,
    marginHorizontal: 20,
  },
  // Label next to the spinner
  loadingText: {
    marginLeft: 10,
    fontSize: 16,
  },
});

export default ChatScreen;

### Step 15: Implement On-Device Model Integration


For iOS, create a Swift bridge to use the Core ML model:


In [None]:
// PhiModelManager.swift

import Foundation
import CoreML

/// Singleton that loads the bundled `PhiMiniModel` Core ML model once and runs
/// predictions on a background queue.
class PhiModelManager {
    static let shared = PhiModelManager()

    /// The loaded model; stays `nil` when loading failed.
    private var model: MLModel?

    private init() {
        loadModel()
    }

    /// Builds an `NSError` in this class's error domain.
    private static func makeError(code: Int, message: String) -> NSError {
        return NSError(domain: "PhiModelError", code: code, userInfo: [NSLocalizedDescriptionKey: message])
    }

    /// Loads `PhiMiniModel.mlmodelc` from the main bundle, logging any failure.
    private func loadModel() {
        // A missing resource is reported like any other load failure. The
        // previous force-unwrap crashed the app here, because a failed unwrap
        // is not an error the do/catch below can handle.
        guard let modelURL = Bundle.main.url(forResource: "PhiMiniModel", withExtension: "mlmodelc") else {
            print("Error loading model: PhiMiniModel.mlmodelc not found in the app bundle")
            return
        }

        do {
            model = try MLModel(contentsOf: modelURL)
            print("Successfully loaded Phi-3 model")
        } catch {
            print("Error loading model: \(error)")
        }
    }

    /// Runs the model on `prompt` and reports the result through `completion`.
    ///
    /// - Parameters:
    ///   - prompt: Text passed to the model under the input feature name "prompt".
    ///   - completion: Called with the string value of the "text" output feature,
    ///     or with an error. Prediction results are delivered on the main queue;
    ///     the two early failures (model not loaded, input creation failed) are
    ///     delivered synchronously on the caller's queue.
    func generateResponse(for prompt: String, completion: @escaping (String?, Error?) -> Void) {
        guard let model = model else {
            completion(nil, PhiModelManager.makeError(code: 1, message: "Model not loaded"))
            return
        }

        // Prepare input
        guard let input = try? MLDictionaryFeatureProvider(dictionary: ["prompt": prompt as NSString]) else {
            completion(nil, PhiModelManager.makeError(code: 2, message: "Failed to create input"))
            return
        }

        // Perform prediction off the main thread, then hop back to deliver it.
        DispatchQueue.global(qos: .userInitiated).async {
            let text: String?
            let failure: Error?

            do {
                let prediction = try model.prediction(from: input)
                if let textOutput = prediction.featureValue(for: "text")?.stringValue {
                    text = textOutput
                    failure = nil
                } else {
                    text = nil
                    failure = PhiModelManager.makeError(code: 3, message: "No output generated")
                }
            } catch {
                text = nil
                failure = error
            }

            DispatchQueue.main.async {
                completion(text, failure)
            }
        }
    }
}

## Phase 6: Deployment


### Step 16: Deploy Backend to Cloud Run


In [None]:
%%bash
# Build the image with Cloud Build and push it to Artifact Registry as phi3-api:v1
gcloud builds submit --tag us-central1-docker.pkg.dev/phi3-chatbot-app/phi3-chatbot-repo/phi3-api:v1

# Deploy that image to Cloud Run (managed, us-central1) with 2 GiB of memory,
# 2 CPUs and unauthenticated access allowed
# NOTE(review): 2Gi may be too little to hold the model in memory — confirm
# against the size of the model that the image loads.
gcloud run deploy phi3-api \
    --image us-central1-docker.pkg.dev/phi3-chatbot-app/phi3-chatbot-repo/phi3-api:v1 \
    --platform managed \
    --region us-central1 \
    --memory 2Gi \
    --cpu 2 \
    --allow-unauthenticated

### Step 17: Set Up CI/CD with GitHub Actions


Create a `.github/workflows/deploy.yml` file:


### Step 18: Prepare Mobile App for Release


For iOS:


In [None]:
%%bash
# Enter the iOS project directory and install the CocoaPods dependencies
cd ios
pod install

# Archive a Release build of the Phi3ChatbotApp scheme for a generic iOS device
xcodebuild -workspace Phi3ChatbotApp.xcworkspace -scheme Phi3ChatbotApp -configuration Release -destination 'generic/platform=iOS' -archivePath Phi3ChatbotApp.xcarchive archive

# Export the archive into ./build using the settings in ExportOptions.plist
# (that plist must already exist; it is not created in this cell)
xcodebuild -exportArchive -archivePath Phi3ChatbotApp.xcarchive -exportOptionsPlist ExportOptions.plist -exportPath ./build

## Phase 7: Testing and Quality Assurance


### Step 19: Implement Automated Testing


Create a `test_api.py` file in the tests directory:


In [None]:
import pytest
from fastapi.testclient import TestClient
from api.main import app

client = TestClient(app)

def test_chat_endpoint_new_conversation():
    """A message sent without a conversation id yields an id and a non-empty reply."""
    response = client.post("/chat", json={"message": "Hello, how are you?"})

    assert response.status_code == 200
    body = response.json()
    assert "conversation_id" in body
    assert "response" in body
    assert len(body["response"]) > 0

def test_chat_endpoint_existing_conversation():
    """A follow-up message that carries a conversation id stays in that conversation."""
    # Create a conversation first
    response1 = client.post(
        "/chat",
        json={"message": "What is your name?"}
    )
    # Check the first call before indexing into its body, so a failing setup
    # request shows up as a status mismatch instead of a KeyError.
    assert response1.status_code == 200
    conversation_id = response1.json()["conversation_id"]

    # Continue the conversation
    response2 = client.post(
        "/chat",
        json={
            "conversation_id": conversation_id,
            "message": "What can you do?"
        }
    )
    assert response2.status_code == 200
    body = response2.json()
    assert body["conversation_id"] == conversation_id
    assert "response" in body

### Step 20: Manual Testing Checklist


Create a `testing_checklist.md` file:


## Phase 8: Monitoring and Maintenance


### Step 21: Set Up Logging and Monitoring


Add logging to `main.py`:


In [None]:
import logging
from google.cloud import logging as cloud_logging

# Create a Cloud Logging client and call its setup_logging()
client = cloud_logging.Client()
client.setup_logging()

# Logger used by the chat endpoint
logger = logging.getLogger("phi3-chatbot")

# Excerpt: only the two added log calls are shown; the "existing code" markers
# stand for the unchanged endpoint body, which is where `response_text` is set.
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    # Log the incoming request's conversation id (None for a new conversation)
    logger.info(f"Received chat request with conversation_id: {request.conversation_id}")
    
    # ... existing code ...
    
    # Log the length of the generated reply, not its content
    logger.info(f"Generated response of length {len(response_text)}")
    
    # ... existing code ...

### Step 22: Create a Monitoring Dashboard


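Use Google Cloud Monitoring to create a dashboard. The command below only sets an environment variable on the Cloud Run service; the dashboard itself still has to be created in the Cloud Console UI or with Terraform/gcloud: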


In [None]:
%%bash
# Set the ENABLE_CLOUD_MONITORING=true environment variable on the phi3-api
# Cloud Run service (this command does not install an agent)
gcloud beta run services update phi3-api \
    --update-env-vars="ENABLE_CLOUD_MONITORING=true"

# Create a custom dashboard (this would typically be done via UI)
# But you can use Terraform or gcloud to automate this


## Final Steps and Considerations

1. **Documentation**: Create comprehensive documentation for your API, model, and mobile app.

2. **User Feedback System**: Implement a feedback mechanism to collect user input for model improvements.

3. **Update Strategy**: Plan a strategy for model updates and new releases.

4. **Privacy Features**: Add encryption for on-device storage and privacy controls.

5. **Personalization Systems**: Implement a system to adapt the chatbot's responses based on user interactions.

This step-by-step guide covers the entire process of developing an on-device AI chatbot powered by Phi-3 mini, from initial model testing to deployment and maintenance. The implementation aligns with the project requirements of creating a privacy-focused, personalized companion that runs on-device.

Would you like me to expand on any particular phase or provide more details about specific technical aspects?