In [1]:
!pip install fastapi uvicorn unsloth transformers
!pip install pyngrok



In [2]:
import os
from getpass import getpass

from pyngrok import ngrok

# Take the ngrok authtoken from the NGROK_AUTHTOKEN environment variable, or
# prompt for it without echo, so the secret is never stored in the notebook.
ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [18]:
%%writefile app.py
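# FastAPI app that serves the model stored in /content/model over HTTP
# (POST /predict for generation, GET /test as a liveness check).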

import traceback

from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel
import torch
# Keep unsloth ahead of transformers so unsloth's import-time patches apply.
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# Load the model and its tokenizer once, at import time, so every request
# reuses the same instances.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name='/content/model',  # Load from local folder
    max_seq_length=2048,
    load_in_4bit=True,
    dtype=None,  # None lets the loader choose the dtype
)

# Switch the model to Unsloth's inference mode before serving generate() calls.
FastLanguageModel.for_inference(model)

# FastAPI application instance. The route decorators below register their
# handlers on it, and uvicorn is started against it as "app:app".
app = FastAPI()

# Request body schema for POST /predict.
class InputData(BaseModel):
    # Case description; sent to the model as the user message of the chat prompt.
    text: str

@app.post("/predict")
async def predict(data: InputData):
    """Generate legal arguments for the case text in the request body.

    The text is wrapped in a fixed system/user chat prompt, tokenized with the
    tokenizer's chat template and passed to ``model.generate``. Only the newly
    generated tokens are decoded, so the prompt is never echoed back.

    Args:
        data: Request body; ``data.text`` is the case description.

    Returns:
        ``{"prediction": <generated text>}``.

    Raises:
        HTTPException: 400 when ``data.text`` is empty or whitespace-only;
            500 when tokenization or generation fails.
    """
    # Blank input is a client error, so report it as 400 rather than 500.
    if not data.text or not data.text.strip():
        raise HTTPException(status_code=400, detail="Input text cannot be empty")

    try:
        messages = [
            {"role": "system", "content": "You are a lawyer with expert knowledge in legal cases. Generate the arguments for the given case."},
            {"role": "user", "content": data.text}
        ]

        # Use CPU if CUDA is not available
        device = "cuda" if torch.cuda.is_available() else "cpu"

        try:
            inputs = tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(device)
        except Exception as e:
            raise ValueError(f"Tokenization failed: {str(e)}") from e

        # A single, unpadded prompt: every position is a real token. Deriving
        # the mask from pad_token_id fails when it is None and can hide real
        # tokens when the pad id doubles as another special token.
        attention_mask = torch.ones_like(inputs)

        # NOTE: generate() is a blocking call inside an async handler, so the
        # event loop serves one request at a time while it runs.
        outputs = model.generate(
            input_ids=inputs,
            attention_mask=attention_mask,
            max_new_tokens=2048,
            use_cache=True,
            do_sample=True,  # temperature/min_p only take effect when sampling
            temperature=0.7,
            min_p=0.1
        )

        # Slice off the prompt tokens instead of splitting the decoded text on
        # the word "assistant", which also matches that word inside the prompt.
        prompt_length = inputs.shape[-1]
        generated_tokens = outputs[0, prompt_length:]
        response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        return {"prediction": response}

    except Exception as e:
        # Log the full traceback server-side; the client only gets the message.
        error_details = traceback.format_exc()
        print(f"Error: {e}\n{error_details}")

        raise HTTPException(status_code=500, detail=str(e)) from e

# Liveness check: responds with a fixed status payload once the app is serving.
@app.get("/test")
async def test():
    status_payload = {"status": "API is running"}
    return status_payload

Overwriting app.py


In [17]:
# Terminate every running ngrok process before a new tunnel is opened.
!killall ngrok

In [19]:
import subprocess
import time
import urllib.request

from pyngrok import ngrok

PORT = 8000
# app.py loads the model at import time, so startup can take far longer than
# a few seconds; wait for the server to answer instead of sleeping blindly.
STARTUP_TIMEOUT_S = 300

# Start uvicorn in the background, keeping the handle so the process can be
# inspected and stopped later (server_process.terminate()).
server_process = subprocess.Popen(
    ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", str(PORT)]
)

# Poll the liveness endpoint until the server responds, it dies, or we time out.
deadline = time.monotonic() + STARTUP_TIMEOUT_S
while True:
    if server_process.poll() is not None:
        raise RuntimeError(
            f"uvicorn exited during startup with code {server_process.returncode}"
        )
    try:
        with urllib.request.urlopen(f"http://127.0.0.1:{PORT}/test", timeout=2):
            break
    except OSError:
        # Connection refused / timed out: the server is not listening yet.
        if time.monotonic() > deadline:
            server_process.terminate()
            raise TimeoutError(
                f"Server did not become ready within {STARTUP_TIMEOUT_S} seconds"
            )
        time.sleep(1)

# Start ngrok only once the API is actually reachable
public_url = ngrok.connect(PORT).public_url
print(f"Public URL: {public_url}")


Public URL: https://a9a7-35-247-150-255.ngrok-free.app
