In [None]:
# Natural Language to SQL Demo with Fine-tuned Mistral-7B
# ===================================================

# Install required packages
!pip install transformers accelerate google-cloud-bigquery ipywidgets torch



Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Mistral 7B API Setup in Google Colab
# This notebook will:
# 1. Mount Google Drive to access your model
# 2. Set up the Mistral 7B model
# 3. Create a Flask API to serve the model
# 4. Make the API accessible through a public URL

# Step 1: Install necessary dependencies
!pip install flask flask-cors pyngrok transformers accelerate bitsandbytes
!pip install -q torch>=2.0.0
!pip install -q peft

# Step 2: Mount Google Drive so files stored there are reachable from the runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Step 3: Import required libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
import os
import gc

# Step 4: Load the model and tokenizer
# Imported here so this step is self-contained; BitsAndBytesConfig ships with
# the transformers package that is already imported above.
from transformers import BitsAndBytesConfig

model_path = "/content/drive/MyDrive/mistral7b_sql_model"

# Load in 4-bit quantization for memory efficiency on the A100.
# Passing `load_in_4bit=True` directly to `from_pretrained` is deprecated;
# the supported way is a BitsAndBytesConfig handed over as `quantization_config`.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",  # let accelerate decide where the weights are placed
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Step 5: Create the Flask application that will serve the model
app = Flask(import_name=__name__)
# Allow cross-origin requests on every route
CORS(app=app)

# Text generation endpoint

# Upper bound on the number of new tokens a single request may ask for.
# The API is reachable through a public URL, so an unbounded value would let
# any caller occupy the GPU for an arbitrarily long time.
MAX_NEW_TOKENS_LIMIT = 2048


def _parse_generation_request(data):
    """Validate the request payload and extract the generation settings.

    Args:
        data: The decoded JSON body of the request (or None if there was none).

    Returns:
        A tuple ``(prompt, max_new_tokens, temperature, top_p)``. Missing keys
        fall back to ``''``, ``512``, ``0.7`` and ``0.9`` respectively.

    Raises:
        ValueError: If the body is not a JSON object, or a field has the wrong
            type or is out of range.
    """
    if not isinstance(data, dict):
        raise ValueError("request body must be a JSON object")

    prompt = data.get('prompt', '')
    if not isinstance(prompt, str):
        raise ValueError("'prompt' must be a string")

    max_new_tokens = data.get('max_length', 512)
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(max_new_tokens, bool) or not isinstance(max_new_tokens, int):
        raise ValueError("'max_length' must be an integer")
    if not 1 <= max_new_tokens <= MAX_NEW_TOKENS_LIMIT:
        raise ValueError(
            f"'max_length' must be between 1 and {MAX_NEW_TOKENS_LIMIT}"
        )

    temperature = data.get('temperature', 0.7)
    top_p = data.get('top_p', 0.9)
    for field_name, value in (('temperature', temperature), ('top_p', top_p)):
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            raise ValueError(f"'{field_name}' must be a number")
    if temperature < 0:
        raise ValueError("'temperature' must not be negative")
    if not 0 < top_p <= 1:
        raise ValueError("'top_p' must be greater than 0 and at most 1")

    return prompt, max_new_tokens, float(temperature), float(top_p)


@app.route('/generate', methods=['POST'])
def generate():
    """Generate text for the prompt supplied in the JSON request body.

    Request JSON keys (all optional): ``prompt``, ``max_length`` (used as the
    number of new tokens to generate), ``temperature`` and ``top_p``.
    A ``temperature`` of 0 selects greedy decoding instead of sampling.

    Returns:
        200 with ``{'generated_text': ..., 'prompt': ...}`` on success,
        400 with ``{'error': ...}`` when the payload is invalid,
        500 with ``{'error': ...}`` when generation itself fails.
    """
    try:
        # silent=True yields None for a missing or malformed body instead of
        # raising, so the problem is reported as a client error (400) below.
        prompt, max_new_tokens, temperature, top_p = _parse_generation_request(
            request.get_json(silent=True)
        )
    except ValueError as e:
        return jsonify({'error': str(e)}), 400

    try:
        # Tokenize input and move it to the device the model was placed on
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        generation_kwargs = {
            'max_new_tokens': max_new_tokens,
            # Passed explicitly so generate() does not have to fall back to the
            # EOS token (and log a notice about it) on every request.
            'pad_token_id': tokenizer.eos_token_id,
        }
        if temperature > 0:
            generation_kwargs.update(
                do_sample=True, temperature=temperature, top_p=top_p
            )
        else:
            # Sampling requires a strictly positive temperature; treat 0 as
            # a request for deterministic (greedy) decoding.
            generation_kwargs['do_sample'] = False

        # Generate response
        with torch.no_grad():
            output = model.generate(**inputs, **generation_kwargs)

        # Decode and return generated text
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        response = {
            'generated_text': generated_text,
            'prompt': prompt
        }

        return jsonify(response)

    except Exception as e:
        # Record the full traceback server-side; the client only gets the message.
        app.logger.exception("Text generation failed")
        return jsonify({'error': str(e)}), 500

# Liveness probe endpoint
@app.route('/health', methods=['GET'])
def health():
    """Return a fixed JSON payload with the service status and model name."""
    status_payload = {'status': 'ok', 'model': 'Mistral 7B SQL'}
    return jsonify(status_payload)

# Step 6: Set up ngrok for public URL access
def start_ngrok(port=5000):
    """Open an ngrok tunnel to the local API and return its public URL.

    The auth token is read from the ``NGROK_AUTHTOKEN`` environment variable.
    Credentials must never be written into the notebook itself: notebooks are
    routinely shared and committed, which exposes the token to every reader.
    Any token that was ever stored in a notebook should be revoked and
    replaced in the ngrok dashboard.

    Args:
        port: Local port the tunnel forwards to. Defaults to 5000.

    Returns:
        The public URL of the tunnel as a string.
    """
    auth_token = os.environ.get("NGROK_AUTHTOKEN")
    if auth_token:
        ngrok.set_auth_token(auth_token)
    else:
        print(" * NGROK_AUTHTOKEN is not set; starting ngrok without an auth token")

    # Start ngrok tunnel to the requested port
    public_url = ngrok.connect(port).public_url
    print(f" * ngrok tunnel started at {public_url}")
    print(f" * API accessible at {public_url}/generate")
    return public_url

# Step 7: Main function
if __name__ == '__main__':
    # Start ngrok tunnel
    public_url = start_ngrok()

    try:
        # Run Flask app; this call blocks until the server is stopped
        app.run(host='0.0.0.0', port=5000)
    finally:
        # Close the tunnel once the server is no longer running, so that
        # re-running this cell does not leave a stale public URL pointing
        # at the runtime.
        ngrok.disconnect(public_url)

Collecting flask-cors
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Collecting pyngrok
  Downloading pyngrok-7.2.7-py3-none-any.whl.metadata (9.4 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-c

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

 * ngrok tunnel started at https://c754-104-154-139-77.ngrok-free.app
 * API accessible at https://c754-104-154-139-77.ngrok-free.app/generate
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [06/May/2025 23:06:34] "GET /health HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [06/May/2025 23:07:02] "GET /health HTTP/1.1" 200 -
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
INFO:werkzeug:127.0.0.1 - - [06/May/2025 23:07:09] "POST /generate HTTP/1.1" 200 -
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
INFO:werkzeug:127.0.0.1 - - [06/May/2025 23:07:12] "POST /generate HTTP/1.1" 200 -
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
INFO:werkzeug:127.0.0.1 - - [06/May/2025 23:07:19] "POST /generate HTTP/1.1" 200 -
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
INFO:werkzeug:127.0.0.1 - - [06/May/2025 23:07:32] "POST /generate HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [06/May/2025 23:07:57] "GET /health HTTP/1.1" 200 -
