In [1]:
!pip install fastapi uvicorn unsloth transformers
!pip install pyngrok

Collecting fastapi
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting unsloth
  Downloading unsloth-2025.3.16-py3-none-any.whl.metadata (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.1-py3-none-any.whl.metadata (6.2 kB)
Collecting unsloth_zoo>=2025.3.14 (from unsloth)
  Downloading unsloth_zoo-2025.3.15-py3-none-any.whl.metadata (17 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.17-py3-none-any.whl.metadata (9.5 kB)
Collecting datasets>=2.16.0 (fro

Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3


In [2]:
!ngrok authtoken 2uX3OmBHngy9xgiZVxqRgPenm2h_3afotyJ8T7k2n92SH3dLv

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [107]:
%%writefile app.py
# (Paste the code here)
from fastapi.middleware.cors import CORSMiddleware
from fastapi import FastAPI, Request
from pydantic import BaseModel
import torch
from unsloth import FastLanguageModel
from transformers import AutoTokenizer




model, tokenizer = FastLanguageModel.from_pretrained(
    model_name='/content/model',  # Load from local folder
    max_seq_length=2048,
    load_in_4bit=True,
    dtype=None
)

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins (not recommended for production)
    allow_credentials=True,
    allow_methods=["*"],  # Allows all HTTP methods (GET, POST, etc.)
    allow_headers=["*"],  # Allows all headers
)

class InputData(BaseModel):
    text: str

@app.post("/predict")
async def predict(data: InputData):
    try:
        # Validate input
        if not data.text or len(data.text.strip()) == 0:
            raise ValueError("Input text cannot be empty")

        messages = [
            {"role": "system", "content": "You are a lawyer with expert knowledge in legal cases. Generate the arguments for the given case."},
            {"role": "user", "content": data.text}
        ]

        # Use CPU if CUDA is not available
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Apply tokenization with proper error handling
        try:
            inputs = tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(device)
        except Exception as e:
            raise ValueError(f"Tokenization failed: {str(e)}")

        # Create attention mask
        attention_mask = inputs.ne(tokenizer.pad_token_id).long()

        # Generate output
        outputs = model.generate(
        input_ids=inputs,
        attention_mask=attention_mask,
        max_new_tokens=2048,
        use_cache=True,
        temperature=0.7,
        min_p=0.1,
        # streaming=True  # Enables token-by-token streaming
        )


        # Decode outputs safely
        decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Extract response with better error handling
        decoded_output = decoded_outputs[0]
        if "assistant" in decoded_output:
            response = decoded_output.split("assistant", 1)[1].strip()
        else:
            response = decoded_output

        return {"prediction": response}

    except Exception as e:
        # Detailed error logging
        error_details = traceback.format_exc()
        print(f"Error: {e}\n{error_details}")

        # Return appropriate error response
        raise HTTPException(status_code=500, detail=str(e))

# Test endpoint to verify server is running
@app.get("/test")
async def test():
    return {"status": "API is running"}

Overwriting app.py


In [103]:
!killall ngrok

In [104]:
!pkill -f uvicorn

In [108]:
import threading
import uvicorn
from pyngrok import ngrok
import time

def run_uvicorn():
  !uvicorn app:app --host 0.0.0.0 --port 8000 --reload

# Run Uvicorn in a separate thread
threading.Thread(target=run_uvicorn, daemon=True).start()

# Wait briefly for the server to start
time.sleep(3)

# Start ngrok
public_url = ngrok.connect(8000).public_url
print(f"Public URL: {public_url}")


[32mINFO[0m:     Will watch for changes in these directories: ['/content']
[32mINFO[0m:     Uvicorn running on [1mhttp://0.0.0.0:8000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started reloader process [[36m[1m13613[0m] using [36m[1mStatReload[0m
Public URL: https://afe6-35-233-253-104.ngrok-free.app


In [110]:
!lsof -i:8000

COMMAND   PID USER   FD   TYPE DEVICE SIZE/OFF NODE NAME
uvicorn 13613 root    3u  IPv4 353971      0t0  TCP *:8000 (LISTEN)
python3 13619 root    3u  IPv4 353971      0t0  TCP *:8000 (LISTEN)
