In [None]:
# Cell 1: Install Dependencies
# Installs every package the later cells import; -q keeps pip's output short.
# NOTE: no versions are pinned, so each fresh runtime gets whatever is current.
# Model stack: transformers + torch load and run the model, bitsandbytes supplies
# the 4-bit quantization, accelerate is what transformers uses for device_map="auto".
!pip install -q transformers accelerate torch bitsandbytes
# Serving stack: the Flask app, its CORS extension, and the ngrok tunnel client.
!pip install -q flask flask-cors pyngrok
# Hugging Face Hub client, used by the optional login in Cell 2.
!pip install -q huggingface-hub
print("‚úÖ Dependencies installed")

In [None]:
# Cell 2: Setup Ngrok & Hugging Face
# Get ngrok token from https://dashboard.ngrok.com/get-started/your-authtoken
# Get HF token from https://huggingface.co/settings/tokens
#
# Secrets are never stored in the notebook: each token is read from an environment
# variable (NGROK_TOKEN / HF_TOKEN). The ngrok token is required, so when the
# variable is missing it is asked for with a hidden prompt. The Hugging Face token
# is optional and only needed for gated models.
import os
from getpass import getpass

NGROK_TOKEN = os.environ.get("NGROK_TOKEN", "").strip()
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()

if not NGROK_TOKEN:
    NGROK_TOKEN = getpass("ngrok authtoken: ").strip()
if not NGROK_TOKEN:
    raise ValueError("An ngrok authtoken is required to expose the server publicly")

# Register the token through pyngrok's API rather than interpolating it into a
# shell command line; the token is saved to ngrok's config file, which the server
# process started in Cell 4 reads as well.
from pyngrok import ngrok

ngrok.set_auth_token(NGROK_TOKEN)
print("‚úÖ Ngrok configured")

# Login to Hugging Face (optional - needed for some gated models).
# Deliberately best-effort: any failure here is reported and the notebook continues.
try:
    from huggingface_hub import login
    if HF_TOKEN:
        login(token=HF_TOKEN)
        print("‚úÖ Hugging Face authenticated")
    else:
        print("‚ö†Ô∏è Hugging Face token not provided (skipping login)")
except Exception as e:
    print(f"‚ö†Ô∏è HF login skipped: {e}")

In [None]:
# Cell 3: Create Flask Server with Llama Model
%%writefile llama_server.py

from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import logging

# Root logging at INFO level, plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Flask application with CORS enabled on it.
app = Flask(__name__)
CORS(app)

# Module-level handles assigned by load_model(); None until then.
model = tokenizer = None

def load_model():
    """Load the tokenizer and the 4-bit quantized model into the module globals.

    The global ``model`` and ``tokenizer`` are assigned only after every loading
    step has succeeded, so a failed load never leaves one set without the other.

    Returns:
        bool: True when both objects were loaded, False when any step raised
        (the error is logged together with its traceback).
    """
    global model, tokenizer
    logger.info("üîÑ Loading FortyMiles Llama-3 Food/Nutrition Model...")

    try:
        # Use the 10-epoch trained model (best performance)
        model_name = "fortymiles/Llama-3-8B-sft-lora-food-nutrition-10-epoch"

        logger.info("Loading tokenizer...")
        loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)

        # generate() is called with pad_token_id, so make sure one exists:
        # reuse the EOS token when the tokenizer defines no pad token.
        if loaded_tokenizer.pad_token is None:
            loaded_tokenizer.pad_token = loaded_tokenizer.eos_token

        # 4-bit NF4 weights with double quantization; matmuls run in float16.
        logger.info("Loading model with 4-bit quantization...")
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True
        )

        # NOTE(review): trust_remote_code=True lets Python code shipped in the model
        # repository run on this machine. Confirm this repo actually needs it;
        # otherwise it should be dropped.
        loaded_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )
        loaded_model.eval()

        device = next(loaded_model.parameters()).device
    except Exception as e:
        # logger.exception keeps the same message but also records the traceback.
        logger.exception(f"‚ùå Failed to load model: {e}")
        return False

    # Publish both objects together now that nothing else can fail.
    model, tokenizer = loaded_model, loaded_tokenizer
    logger.info(f"‚úÖ Llama-3 Food/Nutrition Model loaded successfully on {device}!")
    return True

@app.route('/health', methods=['GET'])
def health():
    """Return a JSON status report: load state, model name, and model device."""
    if model:
        # Device of the first parameter stands in for the whole model.
        device_label = str(next(model.parameters()).device)
    else:
        device_label = "not loaded"

    report = {
        "status": "healthy",
        "model_loaded": model is not None,
        "model_name": "fortymiles/Llama-3-8B-sft-lora-food-nutrition-10-epoch",
        "device": device_label,
    }
    return jsonify(report)

# Ceiling applied to the caller-supplied max_tokens. The endpoint is exposed
# publicly through ngrok, so an unbounded value would let one request occupy the
# GPU indefinitely.
_MAX_NEW_TOKENS_LIMIT = 4096

# Text after the last occurrence of this marker is treated as the answer.
_RESPONSE_MARKER = "### Response:"


def _parse_generation_params(data):
    """Validate and normalize the JSON payload sent to /generate.

    Args:
        data: The decoded request body; expected to be a dict.

    Returns:
        tuple: ``(params, None)`` on success, where ``params`` is a dict with the
        keys ``prompt`` (str), ``max_tokens`` (int, capped at
        ``_MAX_NEW_TOKENS_LIMIT``), ``temperature`` (float) and ``top_p`` (float);
        or ``(None, message)`` describing the first problem found.
    """
    if not isinstance(data, dict):
        return None, "Request body must be a JSON object"

    prompt = data.get('prompt', '')
    if not isinstance(prompt, str) or not prompt.strip():
        return None, "'prompt' must be a non-empty string"

    # Coerce explicitly: JSON numbers may arrive as int where a float is needed.
    try:
        max_tokens = int(data.get('max_tokens', 1024))
        temperature = float(data.get('temperature', 0.7))
        top_p = float(data.get('top_p', 0.9))
    except (TypeError, ValueError):
        return None, "'max_tokens', 'temperature' and 'top_p' must be numbers"

    if max_tokens < 1:
        return None, "'max_tokens' must be at least 1"
    # Written as negated comparisons so that NaN is rejected as well.
    if not temperature >= 0:
        return None, "'temperature' must be >= 0"
    if not 0 < top_p <= 1:
        return None, "'top_p' must be in the range (0, 1]"

    params = {
        "prompt": prompt,
        "max_tokens": min(max_tokens, _MAX_NEW_TOKENS_LIMIT),
        "temperature": temperature,
        "top_p": top_p,
    }
    return params, None


@app.route('/generate', methods=['POST'])
def generate():
    """Generate a completion for the prompt in the JSON request body.

    Request JSON:
        prompt (str, required), max_tokens (int, default 1024),
        temperature (float, default 0.7; 0 selects greedy decoding),
        top_p (float, default 0.9).

    Returns:
        200 with ``{"status": "success", "response": ..., "response_length": ...}``;
        400 with ``{"error": ...}`` when the request is malformed;
        500 with ``{"error": ...}`` when the model is not loaded or generation fails.
    """
    if model is None:
        return jsonify({"error": "Model not loaded"}), 500

    # silent=True returns None for a missing or unparsable JSON body instead of
    # raising, so bad requests are answered with 400 rather than 500.
    params, problem = _parse_generation_params(request.get_json(silent=True))
    if problem is not None:
        return jsonify({"error": problem}), 400

    try:
        prompt = params["prompt"]
        max_tokens = params["max_tokens"]
        temperature = params["temperature"]
        top_p = params["top_p"]

        logger.info(f"üìù Generating (max_tokens={max_tokens})...")

        # Tokenize; prompts longer than 2048 tokens are truncated.
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            max_length=2048,
            truncation=True
        )

        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Sampling needs a strictly positive temperature; treat 0 as a request for
        # deterministic (greedy) decoding and leave the sampling knobs out.
        sampling = temperature > 0
        gen_kwargs = {
            "max_new_tokens": max_tokens,
            "do_sample": sampling,
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "repetition_penalty": 1.1,
        }
        if sampling:
            gen_kwargs["temperature"] = temperature
            gen_kwargs["top_p"] = top_p

        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)

        # The output sequence starts with the prompt tokens. Decode only what comes
        # after them instead of string-matching the prompt in the decoded text,
        # which breaks when the prompt was truncated or does not round-trip exactly.
        prompt_len = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_len:],
            skip_special_tokens=True
        ).strip()

        # If the model emitted the marker itself, keep only the text after the last one.
        if _RESPONSE_MARKER in response:
            response = response.split(_RESPONSE_MARKER)[-1].strip()

        logger.info(f"‚úÖ Generated {len(response)} characters")

        return jsonify({
            "status": "success",
            "response": response,
            "response_length": len(response)
        })

    except Exception as e:
        # Same message as before; logger.exception adds the traceback to the log.
        logger.exception(f"‚ùå Error: {e}")
        return jsonify({"error": str(e)}), 500

if __name__ == '__main__':
    # Script entry point: load the model, open the ngrok tunnel, then serve.
    if load_model():
        port = 5000
        tunnel = ngrok.connect(port)
        # Current pyngrok returns an NgrokTunnel object whose .public_url holds
        # the bare URL; printing the object itself would put its repr into the
        # copy-paste line below. Older versions returned the URL string directly,
        # hence the fallback.
        public_url = getattr(tunnel, "public_url", tunnel)

        print("\n" + "="*60)
        print("üåê PUBLIC URL (COPY THIS):")
        print(f"   {public_url}")
        print("="*60)
        print("\nüìù Update your local ml_recommender.py:")
        print('   USE_COLAB = True')
        print(f'   COLAB_API_URL = "{public_url}"')
        print("\nüöÄ Server starting...\n")
        # Blocks until the server is stopped; listens on all interfaces.
        app.run(host='0.0.0.0', port=port)
    else:
        print("‚ùå Failed to load model")
        # Exit non-zero so a failed start is distinguishable from a clean shutdown.
        raise SystemExit(1)

In [None]:
# Cell 4: Start Server (KEEP THIS RUNNING)
# Runs llama_server.py as a child process: it loads the model, opens the ngrok
# tunnel, prints the public URL, and then serves requests with Flask on port 5000.
# This cell will run continuously. Don't stop it!
# Model download: ~16GB (first time only, then cached)
# Loading time: 3-5 minutes
!python llama_server.py