# 🚀 Mistral Fairness Model - Cloud Inference Server

Run your trained Mistral model inference server in Google Colab with GPU acceleration!

**Features:**
- FastAPI server with GPU acceleration
- Public URL via ngrok for external access
- Tuned to produce fair and polite responses
- Direct integration with your Next.js app

**Setup Steps:**
1. Upload your trained model (the LoRA adapter folder) to Google Drive and note its path
2. Run all cells below
3. Copy the ngrok URL and update your Next.js app
4. Start chatting with your trained model!

In [None]:
# 🔧 Setup Environment
# IPython shell escapes (`!`): install the API server stack (fastapi, uvicorn)
# and the model-loading stack (transformers, torch, peft, accelerate,
# bitsandbytes) into the current notebook runtime.
!pip install fastapi uvicorn[standard] transformers torch peft accelerate bitsandbytes
# pyngrok provides the `ngrok` client used in later cells to open a public tunnel.
!pip install pyngrok

# Mount Google Drive to access your trained model
# The mount point is /content/drive; MODEL_PATH in a later cell points into
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# 🌐 Setup ngrok for public access
from pyngrok import ngrok
import getpass

# Get your ngrok authtoken (free account at https://ngrok.com)
print("Get your free ngrok authtoken at: https://dashboard.ngrok.com/get-started/your-authtoken")
# getpass reads the token without echoing it, so it does not end up in the
# cell output.
ngrok_token = getpass.getpass("Enter your ngrok authtoken: ")
# Register the token with pyngrok so the tunnel opened in a later cell is
# authenticated.
ngrok.set_auth_token(ngrok_token)


In [None]:
# 🤖 FastAPI Inference Server (GPU Optimized)
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import uvicorn
from typing import Optional
import logging
import time
import threading

# Configure logging
# INFO level so model-loading progress and request errors appear in the cell output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The FastAPI application object; the endpoint functions below register
# themselves on it via decorators.
app = FastAPI(
    title="Fairness ChatBot API - Cloud",
    description="AI model trained for fairness and politeness - Running on GPU!",
    version="1.0.0"
)

# Enable CORS for all origins (the ngrok hostname is not known in advance).
# Credentials stay disabled: this API uses no cookies or auth headers, and
# combining a wildcard origin with allow_credentials=True would let any
# website make credentialed cross-origin requests to the public tunnel.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins for ngrok
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request/Response models
# JSON body accepted by POST /chat. Only `message` is required; the other
# fields fall back to the defaults below when omitted.
class ChatRequest(BaseModel):
    # User text the model should respond to.
    message: str
    # Optional persona description; the bot scans it for a "You are ..." or
    # "Act as ..." line to pick the role used in the prompt.
    persona: Optional[str] = ""
    # Sampling temperature forwarded to text generation.
    temperature: Optional[float] = 0.7
    # Token budget for the reply, counted on top of the prompt length.
    max_length: Optional[int] = 50

# JSON body returned by POST /chat.
class ChatResponse(BaseModel):
    # Generated reply text.
    response: str
    # Model identifier; the /chat handler fills in a fixed string.
    model: str
    # Set to True by the /chat handler.
    fairness_enabled: bool
    # Time spent generating the reply, in whole milliseconds.
    processing_time_ms: int
    # Whether the server had a CUDA GPU when handling the request.
    gpu_used: bool

# JSON body returned by GET /health.
class HealthResponse(BaseModel):
    # "healthy" once the global bot exists, "loading" before that.
    status: str
    # True when the global bot instance has been created.
    model_loaded: bool
    # Result of torch.cuda.is_available() at request time.
    gpu_available: bool
    # "cuda" or "cpu", derived from CUDA availability.
    device: str

class CloudFairnessChatBot:
    """Mistral-7B-Instruct base model plus fairness LoRA adapters.

    The constructor loads the base model (4-bit quantized on GPU, float32 on
    CPU), attaches the LoRA adapters found at ``model_path`` and prepares the
    tokenizer. ``generate_response`` wraps a user message in a Mistral
    ``[INST]`` prompt and returns only the newly generated text.
    """

    BASE_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

    # Role used when a persona is supplied but contains no recognisable marker.
    _DEFAULT_ROLE = "Assistant"
    # Phrases that introduce the role inside a persona description.
    _PERSONA_MARKERS = ("You are", "Act as")
    # Leading articles that must not be duplicated when building "As <role>".
    _ARTICLES = ("a ", "an ", "the ")
    _TRAILING_PUNCTUATION = ".!?,;:"

    def __init__(self, model_path: str):
        """Load the fairness-trained Mistral model with GPU acceleration.

        Args:
            model_path: Directory containing the trained LoRA adapters.

        Raises:
            Exception: Any error raised while loading the base model, the
                adapters or the tokenizer is logged and re-raised unchanged.
        """
        logger.info(f"Loading fairness Mistral model from {model_path}...")

        # Check GPU availability
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")
        on_gpu = self.device == "cuda"

        load_kwargs = {
            "torch_dtype": torch.float16 if on_gpu else torch.float32,
            "low_cpu_mem_usage": True,
            "device_map": "auto" if on_gpu else "cpu",
        }
        if on_gpu:
            # Local import keeps the file's top-level import block untouched.
            from transformers import BitsAndBytesConfig

            # Explicit quantization config replaces the deprecated bare
            # `load_in_4bit=True` keyword; float16 compute matches torch_dtype.
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
            )

        try:
            logger.info(f"Loading base model: {self.BASE_MODEL_NAME}")
            # trust_remote_code is intentionally not set: Mistral is supported
            # natively, so no repository-supplied code needs to be executed.
            base_model = AutoModelForCausalLM.from_pretrained(
                self.BASE_MODEL_NAME,
                **load_kwargs,
            )

            logger.info(f"Loading LoRA adapters from: {model_path}")
            self.model = PeftModel.from_pretrained(base_model, model_path)
            self.tokenizer = AutoTokenizer.from_pretrained(self.BASE_MODEL_NAME)

            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            self.model.eval()
            # No explicit .cuda(): device_map="auto" has already placed the
            # weights, and moving a 4-bit quantized model is not supported.

            logger.info(f"✅ Fairness Mistral model loaded successfully on {self.device}!")

        except Exception:
            logger.exception("Failed to load model")
            raise

    @classmethod
    def _extract_persona_role(cls, persona_context: str) -> str:
        """Return the role named by the first "You are"/"Act as" line, or the default role."""
        for line in persona_context.splitlines():
            if not any(marker in line for marker in cls._PERSONA_MARKERS):
                continue
            role = line
            for marker in cls._PERSONA_MARKERS:
                role = role.replace(marker, "")
            # Drop sentence punctuation so the role can be embedded mid-sentence.
            role = role.strip().rstrip(cls._TRAILING_PUNCTUATION).strip()
            return role or cls._DEFAULT_ROLE
        return cls._DEFAULT_ROLE

    @classmethod
    def _build_prompt(cls, prompt: str, persona_context: Optional[str] = "") -> str:
        """Wrap ``prompt`` in a Mistral ``[INST]`` block, optionally with a persona role."""
        if not persona_context or not persona_context.strip():
            return f"[INST]Respond politely and fairly to: {prompt}[/INST]"

        role = cls._extract_persona_role(persona_context)
        # Personas usually already carry an article ("You are a ..."); only add
        # one when it is missing so the prompt never reads "As a a ...".
        if not role.lower().startswith(cls._ARTICLES):
            article = "an" if role[0].lower() in "aeiou" else "a"
            role = f"{article} {role}"
        return f"[INST]As {role}, respond politely and fairly to: {prompt}[/INST]"

    def generate_response(self, prompt: str, persona_context: str = "", max_length: int = 50, temperature: float = 0.7) -> str:
        """Generate a fair and polite response with GPU acceleration.

        Args:
            prompt: User message to answer.
            persona_context: Optional persona description; ``None`` or blank
                means no persona.
            max_length: Maximum number of new tokens to generate (at least 1
                is always requested); ``None`` means 50.
            temperature: Sampling temperature; ``None`` means 0.7, and values
                <= 0 switch to greedy decoding instead of raising.

        Returns:
            The generated reply without the prompt, trimmed of whitespace.
        """
        if max_length is None:
            max_length = 50
        if temperature is None:
            temperature = 0.7

        formatted_prompt = self._build_prompt(prompt, persona_context)

        # Calling the tokenizer (rather than .encode) also yields the attention mask.
        encoded = self.tokenizer(formatted_prompt, return_tensors="pt")
        input_ids = encoded["input_ids"].to(self.device)
        attention_mask = encoded["attention_mask"].to(self.device)
        prompt_token_count = input_ids.shape[1]

        generation_kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "max_new_tokens": max(1, int(max_length)),
            "pad_token_id": self.tokenizer.eos_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
            "repetition_penalty": 1.1,
            "no_repeat_ngram_size": 3,
        }
        if temperature > 0:
            generation_kwargs.update(
                do_sample=True,
                temperature=temperature,
                top_p=0.9,
                top_k=50,
            )
        else:
            # Sampling with a non-positive temperature is invalid; decode greedily.
            generation_kwargs["do_sample"] = False

        with torch.no_grad():
            outputs = self.model.generate(**generation_kwargs)

        # Decode only the tokens produced after the prompt; stripping the
        # prompt by string replacement breaks when decoding does not reproduce
        # the prompt text exactly.
        new_tokens = outputs[0][prompt_token_count:]
        response = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # Clean up response: cut off any follow-on dialogue turns.
        response = response.split("\nHuman:")[0].strip()
        response = response.split("\nAssistant:")[0].strip()

        return response

# Global bot instance
# Stays None until the model-loading cell assigns a CloudFairnessChatBot;
# the /chat and /health handlers check for None to detect that state.
bot: Optional["CloudFairnessChatBot"] = None

@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """Generate a fair and polite reply to the submitted message.

    Returns 503 while the model is not loaded and 500 (with the error text as
    detail) when generation fails.
    """
    if bot is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # Optional fields may arrive as explicit JSON null; fall back to the
    # schema defaults instead of forwarding None into generation.
    persona = request.persona or ""
    max_length = 50 if request.max_length is None else request.max_length
    temperature = 0.7 if request.temperature is None else request.temperature

    # perf_counter is monotonic, so the measured duration cannot go negative
    # if the wall clock is adjusted mid-request.
    start_time = time.perf_counter()

    try:
        response = bot.generate_response(
            request.message,
            persona,
            max_length,
            temperature
        )
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception(f"Error generating response: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    processing_time = int((time.perf_counter() - start_time) * 1000)

    return ChatResponse(
        response=response,
        model="mistral-7b-fairness",
        fairness_enabled=True,
        processing_time_ms=processing_time,
        # Report the device the model was actually loaded on.
        gpu_used=bot.device == "cuda"
    )

@app.get("/health", response_model=HealthResponse)
async def health():
    # Readiness probe: reports whether the global bot exists and whether CUDA
    # is currently available.
    model_ready = bot is not None
    has_gpu = torch.cuda.is_available()

    if model_ready:
        current_status = "healthy"
    else:
        current_status = "loading"

    if has_gpu:
        device_name = "cuda"
    else:
        device_name = "cpu"

    return HealthResponse(
        status=current_status,
        model_loaded=model_ready,
        gpu_available=has_gpu,
        device=device_name,
    )

@app.get("/")
async def root():
    # Landing payload: a fixed banner plus the current CUDA availability.
    payload = {
        "message": "Fairness ChatBot API - Cloud Edition",
        "gpu_available": torch.cuda.is_available(),
    }
    return payload

# Confirms that this cell's definitions ran; the global `bot` is still None
# at this point.
print("✅ FastAPI server code loaded!")


In [None]:
# 📁 Load Your Trained Model
# Point MODEL_PATH at the trained model folder inside the mounted Google Drive.
MODEL_PATH = "/content/drive/MyDrive/chatbot_models/mistral_fairness_model"  # Update this path!

print(f"Loading model from: {MODEL_PATH}")
try:
    # Rebinds the module-level `bot` that the API handlers read.
    bot = CloudFairnessChatBot(MODEL_PATH)
    print("✅ Model loaded successfully!")
except Exception as load_error:
    # Loading failures are reported with a checklist instead of aborting the cell.
    troubleshooting_steps = (
        "\n📝 Make sure to:",
        "1. Upload your trained model to Google Drive",
        "2. Update the MODEL_PATH variable above",
        "3. Re-run this cell",
    )
    print(f"❌ Error loading model: {load_error}")
    for step in troubleshooting_steps:
        print(step)


In [None]:
# 🧪 Test Your Model
# Smoke test: run one hostile message through the loaded model with a persona.
if bot is None:
    print("❌ Model not loaded. Please run the previous cell first.")
else:
    test_message = "You people are always so slow and incompetent!"
    test_persona = "You are a professional hotel receptionist."

    print(f"Test Input: {test_message}")
    print(f"Persona: {test_persona}")
    print("\n" + "=" * 50)

    response = bot.generate_response(test_message, test_persona)
    print(f"Model Response: {response}")
    print("\n✅ Model is working correctly!")


In [None]:
# 🚀 Start the FastAPI Server with ngrok
import nest_asyncio
import socket
import threading
import time

# Allow nested event loops inside the notebook's already-running loop.
nest_asyncio.apply()

SERVER_PORT = 8000
SERVER_STARTUP_TIMEOUT_S = 30


def run_server():
    """Run uvicorn (blocking) for the FastAPI app; meant to be a thread target."""
    uvicorn.run(app, host="0.0.0.0", port=SERVER_PORT, log_level="info")


# Start server in background thread (daemon so it never blocks kernel shutdown)
server_thread = threading.Thread(target=run_server, daemon=True)
server_thread.start()

# Wait until the server actually accepts connections instead of sleeping for a
# fixed time, so the tunnel is never opened onto a port nobody listens on.
startup_deadline = time.monotonic() + SERVER_STARTUP_TIMEOUT_S
while time.monotonic() < startup_deadline:
    try:
        with socket.create_connection(("127.0.0.1", SERVER_PORT), timeout=1):
            break
    except OSError:
        time.sleep(0.5)
else:
    raise RuntimeError(
        f"FastAPI server did not start listening on port {SERVER_PORT} "
        f"within {SERVER_STARTUP_TIMEOUT_S} seconds"
    )

# Create ngrok tunnel
# ngrok.connect() returns an NgrokTunnel object whose string form is
# 'NgrokTunnel: "<url>" -> "<addr>"'; the usable URL is its .public_url.
# getattr keeps this working with old pyngrok releases that returned a str.
tunnel = ngrok.connect(SERVER_PORT)
public_url = getattr(tunnel, "public_url", tunnel)

print("\n🌐 Your FastAPI server is now running at:")
print(f"Public URL: {public_url}")
print("\n📋 API Endpoints:")
print(f"• Health Check: {public_url}/health")
print(f"• Chat: {public_url}/chat (POST)")
print(f"• API Docs: {public_url}/docs")
print(f"\n🔗 Update your Next.js app to use: {public_url}")
print(f"\n⚡ GPU Acceleration: {'✅ Enabled' if torch.cuda.is_available() else '❌ CPU Only'}")
print("\n🎯 Your trained fairness model is now accessible from anywhere!")


In [None]:
# 🧪 Test the API via HTTP
import requests
import json

# Without timeouts a stalled tunnel would hang the cell forever; generation
# can be slow, so the chat call gets a much larger budget than the health check.
HEALTH_TIMEOUT_S = 10
CHAT_TIMEOUT_S = 120

# Get the current ngrok URL
tunnels = ngrok.get_tunnels()
if tunnels:
    api_url = str(tunnels[0].public_url)

    try:
        # Test health endpoint
        health_response = requests.get(f"{api_url}/health", timeout=HEALTH_TIMEOUT_S)
        health_response.raise_for_status()
        print("Health Check:")
        print(json.dumps(health_response.json(), indent=2))

        # Test chat endpoint
        chat_data = {
            "message": "This service is terrible and you people don't know what you're doing!",
            "persona": "You are a professional customer service representative.",
            "temperature": 0.7,
            "max_length": 50
        }

        chat_response = requests.post(f"{api_url}/chat", json=chat_data, timeout=CHAT_TIMEOUT_S)
        chat_response.raise_for_status()
        print("\nChat Response:")
        print(json.dumps(chat_response.json(), indent=2))
    except (requests.RequestException, ValueError) as api_error:
        # Covers connection errors, timeouts, non-2xx statuses and non-JSON
        # bodies, so success is only reported when both calls really worked.
        print(f"\n❌ API test failed: {api_error}")
    else:
        print(f"\n✅ API is working! Use this URL in your Next.js app: {api_url}")
else:
    print("❌ No ngrok tunnels found. Please run the previous cell first.")


In [None]:
## 🔗 Integration with Your Next.js App

Update your Next.js API route to use the ngrok URL:

```typescript
// app/api/trained-model/route.ts
const CLOUD_MODEL_URL = "YOUR_NGROK_URL_HERE"; // Copy from above

/**
 * Proxies a chat request from the Next.js app to the Colab-hosted model server.
 *
 * Expects a JSON body with `message` and optional `persona`, `temperature`
 * and `max_length`. Responds with the model server's JSON on success, 502
 * when the model server answers with an error status, and 500 when the
 * request cannot be completed at all.
 */
export async function POST(request: Request) {
  try {
    const { message, persona, temperature = 0.7, max_length = 50 } = await request.json();

    const response = await fetch(`${CLOUD_MODEL_URL}/chat`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        message,
        persona,
        temperature,
        max_length
      })
    });

    // Do not relay an upstream failure as a 200: surface it as a gateway error.
    if (!response.ok) {
      return Response.json(
        { error: `Trained model server responded with status ${response.status}` },
        { status: 502 }
      );
    }

    const data = await response.json();
    return Response.json(data);
  } catch (error) {
    // Keep the cause in the server log; the client only gets a generic message.
    console.error('Failed to get response from trained model', error);
    return Response.json(
      { error: 'Failed to get response from trained model' },
      { status: 500 }
    );
  }
}
```

## 📝 Notes

- **Free GPU**: Google Colab provides free GPU access (T4)
- **Session Limits**: Free-tier sessions last at most ~12 hours of continuous use and may disconnect sooner when idle
- **Non-persistent URLs**: The ngrok URL changes each time you restart the tunnel, so update the URL in your Next.js app after every restart
- **Performance**: GPU inference is ~10-20x faster than CPU
- **Scaling**: For production, consider upgrading to Colab Pro or using cloud platforms

## 🔄 Keeping It Running

The server will keep running as long as this Colab session is active. To keep it running longer:
1. Keep the browser tab open
2. Interact with the notebook occasionally
3. Consider Colab Pro for longer sessions
4. For production, deploy to a proper cloud service
