In [None]:
!pip -q install "transformers>=4.43" "accelerate>=0.33" "bitsandbytes>=0.43" "autoawq>=0.2.7" torch --extra-index-url https://download.pytorch.org/whl/cu121
!pip -q install fastapi uvicorn pyngrok 

In [None]:
import os

from huggingface_hub import login

# Do not paste the access token into the notebook: read it from the HF_TOKEN
# environment variable, and fall back to the interactive login prompt when the
# variable is not set.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    login()

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Hub id of the base model that the next cell loads in 4-bit.
base_id = "mistralai/Mistral-Nemo-Instruct-2407"

# Tokenizer for the same checkpoint (fast implementation requested).
tok = AutoTokenizer.from_pretrained(
    base_id,
    use_fast=True,
)

In [None]:
# bitsandbytes settings: 4-bit weights, bfloat16 as the compute dtype.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base model with that quantization config; device placement is
# delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    base_id,
    quantization_config=bnb_cfg,
    device_map="auto",
)

In [None]:
# Best-effort export of the quantized model and its tokenizer. A failure is
# reported but deliberately does not stop the notebook.
save_dir = "mistral_nemo_instruct_2407_4bit_bnb"
try:
    model.save_pretrained(save_dir, safe_serialization=True)
    tok.save_pretrained(save_dir)
except Exception as err:
    print("Save 4-bit not supported in this env:", err)
else:
    print("Saved quantized model to:", save_dir)

In [None]:
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Absolute location that the reload cell below checks for a saved 4-bit checkpoint.
save_dir = "/kaggle/working/mistral_nemo_instruct_2407_4bit_bnb"

def load_model_and_tokenizer(path_or_id):
    """Load a tokenizer and a 4-bit quantized causal LM from the same location.

    Args:
        path_or_id: Passed unchanged as the first argument of both
            ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    compute_dtype = torch.bfloat16
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=compute_dtype,
    )

    # Tokenizer first, then the model, mirroring the order callers unpack them in.
    tokenizer = AutoTokenizer.from_pretrained(path_or_id, use_fast=True)

    model_kwargs = {
        "quantization_config": quant_cfg,
        "device_map": "auto",
        "torch_dtype": compute_dtype,
    }
    causal_lm = AutoModelForCausalLM.from_pretrained(path_or_id, **model_kwargs)

    return tokenizer, causal_lm

In [None]:
# Hub id used when no local checkpoint directory exists. It is spelled out here
# (instead of relying on a variable from an earlier cell) so this cell also works
# right after a kernel restart; previously the name was undefined and the else
# branch raised NameError.
fallback_hub_id = "mistralai/Mistral-Nemo-Instruct-2407"

if Path(save_dir).exists():
    print(f"Loading local 4-bit checkpoint: {save_dir}")
    tok, model = load_model_and_tokenizer(save_dir)
else:
    print("Local 4-bit checkpoint not found (saving likely not supported in this env).")
    print(f"Falling back to: {fallback_hub_id}")
    tok, model = load_model_and_tokenizer(fallback_hub_id)

In [None]:
def generate_text(prompt, max_new_tokens=1200, temperature=0.7, top_p=0.95, do_sample=False):
    """Generate text for ``prompt`` using the notebook-level ``tok`` and ``model``.

    Args:
        prompt: Input text. It is truncated to at most 3000 tokens.
        max_new_tokens: Upper bound on the number of generated tokens. The
            default is 1200, the value the earlier implementation always used
            regardless of what the caller passed.
        temperature: Sampling temperature; forwarded only when ``do_sample``
            is true.
        top_p: Nucleus-sampling threshold; forwarded only when ``do_sample``
            is true.
        do_sample: Whether to sample. Defaults to False because the earlier
            implementation never enabled sampling.

    Returns:
        The first sequence returned by ``model.generate``, decoded with
        special tokens skipped. NOTE(review): for decoder-only models this
        sequence normally starts with the prompt tokens - confirm whether
        callers expect the prompt to be echoed.
    """
    # truncation=True is required for max_length to have any effect, and the
    # encoded tensors must live on the same device as the model.
    inputs = tok(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=3000,
    ).to(model.device)

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tok.eos_token_id,
    }
    if do_sample:
        # Sampling knobs are meaningless (and trigger warnings) without sampling.
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p

    outputs = model.generate(**inputs, **gen_kwargs)
    return tok.decode(outputs[0], skip_special_tokens=True)

In [None]:
import os

# Secrets come from the environment so they are never stored in the notebook.
# Both fall back to an empty string, which is what the notebook used before.
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN", "")
# Shared secret that clients must send as "Authorization: Bearer <API_KEY>".
API_KEY = os.environ.get("API_KEY", "")

In [None]:
from fastapi import FastAPI, Request, HTTPException
import uvicorn, threading, time, socket
from pyngrok import ngrok, conf
import nest_asyncio

import secrets

app = FastAPI()


@app.post("/generate")
async def gen(req: Request):
    """Handle POST /generate: authenticate, read the prompt, return the model output.

    The request must carry ``Authorization: Bearer <API_KEY>`` and a JSON object
    body; the optional ``"prompt"`` key (default ``""``) is passed to
    ``generate_text``.

    Returns:
        ``{"response": <generated text>}``.

    Raises:
        HTTPException: 401 when the authorization header does not match;
            400 when the body is not valid JSON, is not a JSON object, or
            ``"prompt"`` is not a string.
    """
    supplied = req.headers.get("authorization") or ""
    expected = f"Bearer {API_KEY}"
    # Constant-time comparison so response timing does not leak the key.
    if not secrets.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Unauthorized")

    try:
        data = await req.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        raise HTTPException(status_code=400, detail="Request body must be valid JSON")
    if not isinstance(data, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    prompt = data.get("prompt", "")
    if not isinstance(prompt, str):
        raise HTTPException(status_code=400, detail="'prompt' must be a string")

    return {"response": generate_text(prompt)}

In [None]:
def free_port():
    """Return a port number that the OS reported as free at the time of the call.

    A socket is bound to port 0 so the OS assigns an unused port; the socket is
    then closed and the number returned. The context manager closes the socket
    even when ``bind`` or ``getsockname`` raises.

    Returns:
        The port number as an ``int``. Nothing reserves it after the probe socket
        is closed, so another process could take it before the caller binds.
    """
    with socket.socket() as probe:
        probe.bind(("", 0))
        return probe.getsockname()[1]

# Pick the port the API server will listen on.
port = free_port()

# Register the ngrok auth token on pyngrok's default config, then open a tunnel
# to that port and show its public address.
ngrok_config = conf.get_default()
ngrok_config.auth_token = NGROK_TOKEN
tunnel = ngrok.connect(port)
public_url = tunnel.public_url
print("Your public URL:", public_url)

# def run(): 
#     uvicorn.run(app, host="0.0.0.0", port=port)

def run():
    """Serve ``app`` with uvicorn on ``0.0.0.0:port`` using a dedicated event loop.

    Reads the module-level ``app`` and ``port``. A fresh asyncio loop is created
    and installed as the current loop before the server coroutine is run on it;
    the call blocks until ``server.serve()`` finishes. The loop is closed
    afterwards, including when serving ends with an exception.
    """
    import asyncio
    from uvicorn import Config, Server

    config = Config(app=app, host="0.0.0.0", port=port, log_level="info")
    server = Server(config)

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(server.serve())
    finally:
        # Release the loop's resources once serving stops.
        loop.close()


# nest_asyncio.apply()
# Run the server in a daemon thread so the notebook cell returns immediately.
server_thread = threading.Thread(target=run, daemon=True)
server_thread.start()
# Short pause - presumably to give the server time to start listening.
time.sleep(1)