In [2]:
!pip install fastapi uvicorn transformers torch bitsandbytes pydantic


Collecting fastapi
  Downloading fastapi-0.114.2-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.30.6-py3-none-any.whl.metadata (6.6 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting starlette<0.39.0,>=0.37.2 (from fastapi)
  Downloading starlette-0.38.5-py3-none-any.whl.metadata (6.0 kB)
Collecting h11>=0.8 (from uvicorn)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading fastapi-0.114.2-py3-none-any.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.0/94.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.30.6-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0

In [3]:
!pip install fastapi uvicorn transformers pydantic torch bitsandbytes
!pip install git+https://github.com/huggingface/transformers  # For remote code support

!pip install pyngrok


Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-ybgf7dgb
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-ybgf7dgb
  Resolved https://github.com/huggingface/transformers to commit 763548427d028878f4d4d8fb6f0be57cc3915fbd
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.45.0.dev0-py3-none-any.whl size=9734743 sha256=a19ded1a28e7a140e6dd8d0cab4ebe41db34a3f7efa14b23594d08445d3a5248
  Stored in directory: /tmp/pip-ephem-wheel-cache-ccwinru0/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformers

Public URL: NgrokTunnel: "https://a31d-34-125-8-92.ngrok-free.app" -> "http://localhost:5000"


In [1]:
!pip install nest_asyncio




In [8]:
import nest_asyncio
import asyncio
from pyngrok import ngrok
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from fastapi.middleware.cors import CORSMiddleware

# Apply nest_asyncio to allow nested event loops: the notebook kernel already runs
# an asyncio loop, and the server below is started with loop.run_until_complete()
nest_asyncio.apply()


In [None]:
# Initialize FastAPI app
app = FastAPI()

# CORS configuration.
# Credentials (cookies, Authorization headers) are deliberately NOT allowed:
# a wildcard origin must not be combined with credentialed requests, and this API
# defines no authentication, so there is nothing for credentials to carry.
# If credentialed requests are ever needed, replace "*" in allow_origins with an
# explicit list of trusted origins and only then set allow_credentials=True.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins; adjust as needed
    allow_credentials=False,
    allow_methods=["*"],  # Allows all methods; adjust as needed
    allow_headers=["*"],  # Allows all headers; adjust as needed
)

# Quantization settings: 4-bit "nf4" weights, float16 compute, nested (double) quantization.
# device_map is consumed later when the model is loaded.
use_4bit = True
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = "float16"
use_nested_quant = True
device_map = "auto"

# Turn the dtype name into the corresponding torch dtype object, then hand all
# settings to bitsandbytes as keyword arguments.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    **{
        "load_in_4bit": use_4bit,
        "bnb_4bit_quant_type": bnb_4bit_quant_type,
        "bnb_4bit_compute_dtype": compute_dtype,
        "bnb_4bit_use_double_quant": use_nested_quant,
    }
)

# Load model with quantization configuration
model_name = 'avi2135/avigpt2'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# Keep the key/value cache enabled for serving. `use_cache = False` is a
# training-time setting (it is required with gradient checkpointing); during
# generation it makes every new token recompute attention over the whole sequence.
model.config.use_cache = True

# Load tokenizer
# NOTE(review): trust_remote_code=True allows Python code shipped in the tokenizer
# repository to run on this machine. The model above loads without it, so it is
# probably unnecessary here — confirm and remove if the tokenizer loads without it.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token for padding
# (presumably the checkpoint defines no pad token of its own — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Input model for the API
class QuestionRequest(BaseModel):
    """Request body accepted by the POST /ask endpoint."""
    # Text that the endpoint forwards to generate_response() as the prompt
    question: str

# Function to generate response for a given input
def _extract_answer(decoded, start_marker="[/]"):
    """Reduce decoded model output to the answer text.

    When `start_marker` occurs in `decoded`, keep only what follows its last
    occurrence and cut that at the first full stop. Without the marker the
    text is returned as is. Surrounding whitespace is stripped in both cases.
    """
    if start_marker in decoded:
        decoded = decoded.split(start_marker)[-1]  # Get text after "[/]"
        decoded = decoded.split(".")[0]  # Get text before first full stop
    return decoded.strip()


def generate_response(user_input, max_new_tokens=80):
    """Generate an answer for `user_input` with the module-level model and tokenizer.

    Args:
        user_input: Prompt text. None, empty and whitespace-only values are
            answered with "" without calling the model.
        max_new_tokens: Upper bound on the number of tokens generated *after*
            the prompt. The previous `max_length=80` limit counted the prompt
            as well, so a long question left no room for an answer (recent
            transformers versions reject such a call outright).

    Returns:
        The post-processed answer string (see `_extract_answer`), or "" when
        there is nothing to answer.
    """
    if not user_input or not user_input.strip():
        return ""

    inputs = tokenizer(user_input, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # `early_stopping` was dropped: it only applies to beam search, which is not used here.
        # Passing the pad token explicitly avoids generate() having to guess one.
        pad_token_id=tokenizer.pad_token_id,
        # Reuse cached keys/values between decoding steps instead of recomputing them.
        use_cache=True,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return _extract_answer(response)

# Only one generation runs at a time (as before), but waiting requests no longer
# freeze the server: the lock is awaited instead of blocking the event loop.
_generation_lock = asyncio.Lock()


# FastAPI endpoint for asking questions
@app.post("/ask")
async def ask_question(req: QuestionRequest):
    """Answer the question in the request body.

    generate_response() is a long, blocking call. Calling it directly inside an
    `async def` endpoint stalls the event loop, so no other request (including
    CORS preflights) is served until it returns. It is therefore run in a
    worker thread, and the event loop stays responsive meanwhile.

    Returns:
        A JSON object of the form {"answer": <generated text>}.
    """
    async with _generation_lock:
        answer = await asyncio.to_thread(generate_response, req.question)
    return {"answer": answer}

# Launch FastAPI using Uvicorn and Ngrok
def start_ngrok(port=5000):
    """Open an ngrok tunnel to a local port and print it.

    Args:
        port: Local port to expose. Defaults to 5000, the port the Uvicorn
            server in this notebook listens on.

    Returns:
        The tunnel object returned by ngrok.connect(). Note that this is a
        tunnel object, not a plain URL string; it is what gets printed.
    """
    tunnel = ngrok.connect(port)
    print(f"Public URL: {tunnel}")
    return tunnel

async def start_server_async(host="0.0.0.0", port=5000):
    """Serve the FastAPI app with Uvicorn until the server exits.

    Args:
        host: Interface to bind. The default "0.0.0.0" listens on all interfaces.
        port: TCP port to listen on. Defaults to 5000; it must match the port
            the ngrok tunnel forwards to.
    """
    server = uvicorn.Server(uvicorn.Config(app, host=host, port=port))
    await server.serve()

def start_server():
    """Run the server coroutine on the current event loop and block until it completes."""
    asyncio.get_event_loop().run_until_complete(start_server_async())

# Start ngrok and server.
# The tunnel is opened first because start_server() does not return until the
# server coroutine finishes, so nothing placed after it in this cell would run.
start_ngrok()
start_server()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-4' coro=<start_server_async() done, defined at <ipython-input-7-b0929d3b664d>:85> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-b0929d3b664d>", line 95, in <cell line: 94>
    start_server()
  File "<ipython-input-7-b0929d3b664d>", line 92, in start_server
    loop.run_until_complete(start_server_async())
  File "/usr/local/lib/python3.10/dist-packages/nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "/usr/local/lib/python3.10/dist-packages/nest_asyncio.py", line 133, in _run_once
    handle._run()
  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/usr/lib/python3.10/asyncio/tasks.py", line 315, in __wa

Public URL: NgrokTunnel: "https://f211-34-125-8-92.ngrok-free.app" -> "http://localhost:5000"


INFO:     Started server process [15262]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:5000 (Press CTRL+C to quit)


INFO:     103.93.240.40:0 - "OPTIONS /ask HTTP/1.1" 200 OK




INFO:     103.93.240.40:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "OPTIONS /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "OPTIONS /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "OPTIONS /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "OPTIONS /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "OPTIONS /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "OPTIONS /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "OPTIONS /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "POST /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "OPTIONS /ask HTTP/1.1" 200 OK
INFO:     103.93.240.40:0 - "POST /ask HTTP/1.1" 200 OK
