In [1]:
!pip install fastapi uvicorn nest-asyncio pyngrok python-multipart chromadb PyPDF langchain-community huggingface_hub langchain_huggingface

Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Collecting chromadb
  Downloading chromadb-1.0.17-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting PyPDF
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp311-cp311-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting opentelemetry-api>=1.2.0 (

In [2]:
import torch

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA

In [5]:
from fastapi import FastAPI, UploadFile,File
from fastapi.responses import JSONResponse
import nest_asyncio
from pyngrok import ngrok
import uvicorn

In [6]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by imported
# libraries — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [32]:
import tempfile, shutil, os
def document_loader(uploaded_file):
    """Load an uploaded PDF into documents via ``PyPDFLoader``.

    ``PyPDFLoader`` is constructed from a filesystem path, so the upload
    stream is first copied into a temporary ``.pdf`` file. That file is
    always deleted before this function returns or raises.

    Args:
        uploaded_file: Readable binary file-like object holding the PDF
            (the ``/answer`` endpoint passes ``UploadFile.file``).

    Returns:
        Whatever ``PyPDFLoader(path).load()`` returns for the copied file.

    Raises:
        Any exception raised while copying the stream or parsing the PDF is
        propagated unchanged; the temporary file is removed in that case too.
    """
    # delete=False: the file has to outlive the `with` block below so the
    # loader can reopen it by path. Removal is handled by the `finally`.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp_file_path = tmp_file.name
    try:
        # Close (and flush) the handle before the loader opens the path.
        with tmp_file:
            shutil.copyfileobj(uploaded_file, tmp_file)
        return PyPDFLoader(tmp_file_path).load()
    finally:
        # Runs on success and on failure, so a corrupt upload cannot leave
        # temporary files behind.
        os.remove(tmp_file_path)

In [8]:
def text_splitter(data, chunk_size=1000, chunk_overlap=20):
  """Split loaded documents into chunks with ``RecursiveCharacterTextSplitter``.

  Args:
    data: Documents to split, in whatever form
      ``RecursiveCharacterTextSplitter.split_documents`` accepts.
    chunk_size: Value passed as the splitter's ``chunk_size``; lengths are
      measured with ``len``. Defaults to the previously hard-coded 1000.
    chunk_overlap: Value passed as the splitter's ``chunk_overlap``.
      Defaults to the previously hard-coded 20.

  Returns:
    The chunks returned by ``split_documents(data)``.
  """
  # Named `splitter` (not `text_splitter`) so the local does not shadow this
  # function's own name.
  splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
      length_function=len,
  )
  return splitter.split_documents(data)

In [9]:
def embeddings():
  """Create the embedding model used to index document chunks.

  Returns:
    A ``HuggingFaceEmbeddings`` instance for
    ``sentence-transformers/all-MiniLM-L6-v2`` configured with
    ``normalize_embeddings=True``, targeting ``'cuda'`` when
    ``torch.cuda.is_available()`` is true and ``'cpu'`` otherwise.
  """
  # Decide the device up front so the constructor call stays declarative.
  if torch.cuda.is_available():
    target_device = 'cuda'
  else:
    target_device = 'cpu'

  return HuggingFaceEmbeddings(
      model_name="sentence-transformers/all-MiniLM-L6-v2",
      model_kwargs={'device': target_device},
      encode_kwargs={'normalize_embeddings': True},
  )

In [10]:
def get_llm(model_id="facebook/KernelLLM", max_new_tokens=200):
    """Build the LangChain LLM wrapper around a local text-generation pipeline.

    Args:
        model_id: Hugging Face model identifier used for BOTH the tokenizer
            and the model, so the two can no longer drift apart. Defaults to
            the previously hard-coded ``"facebook/KernelLLM"``.
        max_new_tokens: Generation cap forwarded to the pipeline. Defaults to
            the previously hard-coded 200.

    Returns:
        A ``HuggingFacePipeline`` wrapping the ``"text-generation"`` pipeline.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_id)

    # NOTE(review): `truncation=True` is given without a max length; the
    # notebook output shows transformers warning that it falls back to no
    # truncation. Kept as-is to preserve behaviour.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        truncation=True,
    )
    return HuggingFacePipeline(pipeline=pipe)

In [11]:
def vector_database(chunks):
  """Index ``chunks`` in a fresh Chroma collection and return the store.

  Each call uses its own uniquely named collection. Without an explicit
  name every call would target the same default collection of the
  in-process Chroma client, so chunks from previously uploaded PDFs would
  remain retrievable alongside the new document's chunks.

  Args:
    chunks: Document chunks to embed and store, as accepted by
      ``Chroma.from_documents``.

  Returns:
    The ``Chroma`` vector store built from ``chunks``.
  """
  import uuid  # local import: only needed to name the collection

  embedding_model = embeddings()
  # "upload-" + 32 hex chars: alphanumeric at both ends, only [a-z0-9-]
  # inside, and 39 characters long.
  collection_name = f"upload-{uuid.uuid4().hex}"
  return Chroma.from_documents(
      chunks,
      embedding=embedding_model,
      collection_name=collection_name,
  )

In [12]:
def retriever(file):
  """Run the full ingest pipeline for one uploaded file.

  The file is loaded with ``document_loader``, split with ``text_splitter``,
  indexed with ``vector_database``, and the resulting store is exposed
  through ``as_retriever()``.

  Args:
    file: The uploaded file object handed to ``document_loader``.

  Returns:
    The object returned by the vector store's ``as_retriever()``.
  """
  # load -> split -> index, each stage feeding the next
  store = vector_database(text_splitter(document_loader(file)))
  doc_retriever = store.as_retriever()
  print("retriever is ready")
  return doc_retriever

In [13]:
# Load the LLM once at notebook level; retriever_qa reads this module-level
# `llm` on every call instead of loading the model again.
llm = get_llm()
def retriever_qa(query,retriever_obj):
  """Answer ``query`` with a RetrievalQA chain over ``retriever_obj``.

  A new chain is assembled on every call from the module-level ``llm`` and
  the supplied retriever, using ``chain_type='stuff'`` and without returning
  source documents.

  Args:
    query: The question passed to the chain's ``invoke``.
    retriever_obj: Retriever the chain uses to fetch context.

  Returns:
    Whatever ``invoke(query)`` returns (the ``/answer`` endpoint reads its
    ``"result"`` key).
  """
  chain_options = {
      'retriever': retriever_obj,
      'return_source_documents': False,
      'chain_type': 'stuff',
  }
  return RetrievalQA.from_chain_type(llm, **chain_options).invoke(query)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

Device set to use cuda:0


In [37]:
# Register the ngrok auth token with the local ngrok configuration.
# Replace the placeholder string with your own token before running.
# NOTE(review): never save a real token in the notebook; consider reading it
# from an environment variable or a prompt instead.
!ngrok config add-authtoken "Paste your ngrok auth token"

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [34]:
from fastapi.middleware.cors import CORSMiddleware
# Origins allowed to call this API from a browser.
# NOTE(review): presumably local front-end dev servers — confirm the ports.
_cors_origins = ["http://127.0.0.1:5500", "http://localhost:5000"]

# CORS policy: credentials allowed, any method and any header accepted.
_cors_policy = dict(
    allow_origins=_cors_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app = FastAPI()
app.add_middleware(CORSMiddleware, **_cors_policy)


In [35]:
from fastapi import Form
# Single-slot cache: the retriever built for the most recently indexed upload,
# together with that upload's filename, so follow-up questions about the same
# file skip re-indexing.
cached_filename = None
cached_retriever = None

@app.post("/answer")
async def answer(question: str = Form(...), file: UploadFile = File(None)):
    """Answer ``question`` using the uploaded document as context.

    Form fields:
        question: The question to answer (required).
        file: Optional upload. When present and its filename differs from the
            cached one, a new retriever is built and cached. When absent, the
            previously cached retriever is reused.

    Returns:
        ``JSONResponse`` with a single ``"answer"`` key: either the extracted
        answer text, or a prompt to upload a file when nothing is cached yet.

    NOTE(review): the cache is keyed on the filename only, so two different
    files with the same name share one cache entry.
    """
    global cached_filename, cached_retriever
    if file:
        if cached_filename != file.filename:
            # Build first, publish second. If indexing raises, the cache keeps
            # describing the previous file instead of pairing the new filename
            # with a stale (or missing) retriever, which would make the next
            # request with the same filename skip the rebuild.
            new_retriever = retriever(file.file)
            cached_retriever = new_retriever
            cached_filename = file.filename
    elif cached_retriever is None:
        return JSONResponse(content={"answer": "Please upload a file first."})

    qa_output = retriever_qa(query=question, retriever_obj=cached_retriever)
    # Keep only the text after the last "Helpful Answer:" marker (the whole
    # string when the marker is absent).
    final_answer = qa_output["result"].split("Helpful Answer:")[-1].strip()

    return JSONResponse(content={"answer": final_answer})

In [36]:
# Jupyter/Colab setup: allow uvicorn to run inside the notebook's event loop.
nest_asyncio.apply()

# Close any ngrok process left over from an earlier run, then open a new
# tunnel to the port the API will listen on.
ngrok.kill()
server_port = 8000
public_url = ngrok.connect(server_port)
print("Public URL:", public_url)

# Start serving the API on the tunnelled port.
uvicorn.run(app, port=server_port)

Public URL: NgrokTunnel: "https://94467b8c5841.ngrok-free.app" -> "http://localhost:8000"


INFO:     Started server process [5852]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-8' coro=<Server.serve() done, defined at /usr/local/lib/python3.11/dist-packages/uvicorn/server.py:69> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/main.py", line 580, in run
    server.run()
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/server.py", line 67, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 92, in run_until_complete
  

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


retriever is ready
INFO:     103.179.246.169:0 - "POST /answer HTTP/1.1" 200 OK
INFO:     103.179.246.169:0 - "POST /answer HTTP/1.1" 200 OK
INFO:     103.179.246.169:0 - "POST /answer HTTP/1.1" 200 OK
INFO:     103.179.246.169:0 - "POST /answer HTTP/1.1" 200 OK
INFO:     103.179.246.169:0 - "POST /answer HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Finished server process [5852]
