In [21]:
!pip install langchain_openai python-dotenv streamlit langchain_community langserve fastapi uvicorn sse_starlette bs4 chromadb faiss-cpu gradio pypdf src



In [38]:
# fix numpy in colab
import numpy
from IPython.display import clear_output

# fix triton in colab
!export LC_ALL="en_US.UTF-8"
!export LD_LIBRARY_PATH="/usr/lib64-nvidia"
!export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
!ldconfig /usr/lib64-nvidia

!git clone https://github.com/dvmazur/mixtral-offloading.git --quiet
!cd mixtral-offloading && pip install -q -r requirements.txt
!huggingface-cli download lavawolfiee/Mixtral-8x7B-Instruct-v0.1-offloading-demo --quiet --local-dir Mixtral-8x7B-Instruct-v0.1-offloading-demo

clear_output()

In [44]:
import sys

sys.path.append("mixtral-offloading")
import torch
from torch.nn import functional as F
from hqq.core.quantize import BaseQuantizeConfig
from huggingface_hub import snapshot_download
from IPython.display import clear_output
from tqdm.auto import trange
from transformers import AutoConfig, AutoTokenizer
from transformers.utils import logging as hf_logging
from transformers import TextStreamer

from src.build_model import OffloadConfig, QuantConfig, build_model

In [24]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import WebBaseLoader
import bs4
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from openai import OpenAI

import gradio as gr
import random
import time
import re

from google.colab import userdata

In [42]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget in the notebook (its output is the VBox below).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
# client = OpenAI(api_key=userdata.get('OPENAPIKEY'))


Initialize LLM

In [43]:
# Identifiers used by this cell and by generate():
#   model_name            - repo the tokenizer is loaded from (see generate()).
#   quantized_model_name  - repo the model config is read from.
#   state_path            - local directory handed to build_model(); it matches the
#                           --local-dir used by the huggingface-cli download above.
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
quantized_model_name = "lavawolfiee/Mixtral-8x7B-Instruct-v0.1-offloading-demo"
state_path = "Mixtral-8x7B-Instruct-v0.1-offloading-demo"

config = AutoConfig.from_pretrained(quantized_model_name)

# The model is built on the first CUDA device; generate() moves its inputs here too.
device = torch.device("cuda:0")

##### Change this to 5 if you have only 12 GB of GPU VRAM #####
offload_per_layer = 4
# offload_per_layer = 5
###############################################################

num_experts = config.num_local_experts

# Per layer, `offload_per_layer` experts are counted into offload_size and the
# remaining (num_experts - offload_per_layer) into main_size.
# NOTE(review): presumably "main" means resident on the GPU and "offload" means
# kept in host memory — confirm against the mixtral-offloading documentation.
offload_config = OffloadConfig(
    main_size=config.num_hidden_layers * (num_experts - offload_per_layer),
    offload_size=config.num_hidden_layers * offload_per_layer,
    buffer_size=4,
    offload_per_layer=offload_per_layer,
)


# Quantization settings passed as attn_config: 4 bits, group size 64.
attn_config = BaseQuantizeConfig(
    nbits=4,
    group_size=64,
    quant_zero=True,
    quant_scale=True,
)
# Override the group size stored under "scale_quant_params" in the generated config.
attn_config["scale_quant_params"]["group_size"] = 256


# Quantization settings passed as ffn_config: 2 bits, group size 16.
ffn_config = BaseQuantizeConfig(
    nbits=2,
    group_size=16,
    quant_zero=True,
    quant_scale=True,
)
quant_config = QuantConfig(ffn_config=ffn_config, attn_config=attn_config)


# Build the model from the local checkpoint; `model` is the global used by generate().
model = build_model(
    device=device,
    quant_config=quant_config,
    offload_config=offload_config,
    state_path=state_path,
)



config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]



Loading experts:   0%|          | 0/32 [00:00<?, ?it/s]

Calling LLM

In [59]:
import warnings
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_tokenizer(name):
  """Load the tokenizer for `name` once and reuse it on later calls."""
  return AutoTokenizer.from_pretrained(name)


def generate(user_input):
  """Ask the global `model` for a reply to a single user message.

  The reply is streamed to stdout through a TextStreamer while it is being
  produced and is also returned as a string. Sampling is enabled, so repeated
  calls with the same input may return different text.

  Args:
    user_input: the user's message; it is wrapped into a one-turn chat with
      the tokenizer's chat template.

  Returns:
    The decoded reply only: prompt tokens and special tokens are excluded.
  """
  # https://colab.research.google.com/github/dvmazur/mixtral-offloading/blob/master/notebooks/demo.ipynb#scrollTo=Zf4GkspecSm8

  warnings.filterwarnings("ignore", category=FutureWarning, message=".*resume_download.*")

  # Loading the tokenizer is expensive, so it is cached instead of being
  # re-created for every chat message.
  tokenizer = _load_tokenizer(model_name)
  streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

  user_entry = dict(role="user", content=user_input)
  input_ids = tokenizer.apply_chat_template([user_entry], return_tensors="pt").to(device)

  # Every call starts a fresh conversation (no cached keys/values), so the
  # attention mask simply covers the whole prompt.
  attention_mask = torch.ones_like(input_ids)

  result = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    streamer=streamer,
    do_sample=True,
    temperature=0.9,
    top_p=0.9,
    max_new_tokens=200,
    pad_token_id=tokenizer.eos_token_id,
    # Hidden states are not requested: nothing here reads them.
    return_dict_in_generate=True,
  )

  # With return_dict_in_generate=True the token ids live under `.sequences`;
  # the output object itself cannot be decoded. Drop the prompt prefix so only
  # the newly generated tokens are turned into text.
  new_tokens = result.sequences[:, input_ids.size(1):]
  response = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
  return response

Code for Web link

In [None]:
def web_link(link=None,query=None):
  """Index a web page, or answer a query from the page indexed earlier.

  Args:
    link: URL to fetch. When given, the page is downloaded, split into chunks
      and embedded into the global vector store `db`; `query` is ignored.
    query: question to look up in the previously indexed page (used only when
      `link` is empty).

  Returns:
    "Link Read" after a successful load, the error text if loading failed,
    a hint when no page has been indexed yet, "No Data" when there is nothing
    to search for or nothing was found, otherwise the best matching chunk
    followed by the first other chunk whose text differs from it.
  """
  global documents
  global db

  if link:
    # Invalidate any earlier index first so that a failed load can never be
    # answered from a stale page.
    db = None
    try:
      # Only elements with this class attribute are parsed.
      # NOTE(review): looks like the MediaWiki article container — confirm
      # before using this on other sites.
      loader = WebBaseLoader(web_path=(link,), bs_kwargs=dict(parse_only = bs4.SoupStrainer(
        # class_ = ("post-header", "post-content")
        class_ = ("mw-content-ltr mw-parser-output")
      )))

      webpagedoc = loader.load()

      text_splitter = RecursiveCharacterTextSplitter(chunk_size = 150, chunk_overlap = 50)
      documents = text_splitter.split_documents(webpagedoc)

      # vector embedding and storage
      db = Chroma.from_documents(documents, OpenAIEmbeddings(openai_api_key = userdata.get('OPENAPIKEY')))

      return "Link Read"

    except Exception as e:
      # The text is shown to the user in the chat window.
      return str(e)

  # Query mode. `db` may not exist at all (no link sent yet), so look it up
  # without triggering a NameError.
  store = globals().get("db")
  if store is None or isinstance(store, str):
    return "No web page loaded yet. Please send a link first."
  if not query:
    return "No Data"

  result = store.similarity_search(query)
  if not result:
    return "No Data"

  response = result[0].page_content
  first = response
  # Append the first hit whose text differs from the top hit, then stop.
  for hit in result[1:]:
    if hit.page_content != first:
      response += "\n" + hit.page_content
      break

  return response

Code for PDF file

In [61]:
def pdf_file(path, message):
  """Return the chunk of a PDF that best matches `message`.

  The PDF is loaded, split into chunks of about 1000 characters and embedded
  into a Chroma store on every call; only the first 50 chunks are indexed.

  Args:
    path: location of the uploaded PDF; may be empty when nothing has been
      submitted yet.
    message: the user's question, used as the similarity-search query.

  Returns:
    The text of the top matching chunk, a hint asking for an upload when
    `path` is empty, or "No Data" when the search returns nothing.
  """
  if not path:
    return "Please upload a file and press Submit first."

  pages = PyPDFLoader(path).load()

  text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 50)
  chunks = text_splitter.split_documents(pages)

  # Only the first 50 chunks are embedded.
  store = Chroma.from_documents(chunks[:50], OpenAIEmbeddings(openai_api_key = userdata.get('OPENAPIKEY')))

  matches = store.similarity_search(message)
  if not matches:
    return "No Data"

  # TODO: feed `message` plus the retrieved context to the LLM (generate())
  # instead of returning the raw chunk.
  return matches[0].page_content

inferred 

Code for Text file

In [None]:
def text_file(path, message):
  """Return the chunk of a text file that best matches `message`.

  The file is loaded, split into chunks of about 2000 characters and embedded
  into a Chroma store on every call.

  Args:
    path: location of the uploaded text file; may be empty when nothing has
      been submitted yet.
    message: the user's question, used as the similarity-search query.

  Returns:
    The text of the top matching chunk, a hint asking for an upload when
    `path` is empty, or "No Data" when the search returns nothing.
  """
  if not path:
    return "Please upload a file and press Submit first."

  text_documents = TextLoader(path).load()
  text_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000, chunk_overlap = 200)
  chunks = text_splitter.split_documents(text_documents)

  store = Chroma.from_documents(chunks, OpenAIEmbeddings(openai_api_key = userdata.get('OPENAPIKEY')))

  matches = store.similarity_search(message)
  if not matches:
    return "No Data"

  # TODO: feed `message` plus the retrieved context to the LLM (generate())
  # instead of returning the raw chunk.
  return matches[0].page_content

Interface

In [62]:
########################### GLOBAL VARIABLES ####################################
# Mode chosen with the radio button; set by change_mode():
#   None = nothing selected yet, 1 = PDF file, 2 = text file, 3 = web link.
user_mode = None
path_file = None    #path to the current file uploaded (set by process_files())

############################## FUNCTIONS ########################################
def echo(message, history):
    """Route one chat message to the handler for the currently selected mode.

    Args:
        message: the input query typed by the user.
        history: the chat history as a list of [user text, bot text] lists;
            it is not used for routing.

    Returns:
        The selected handler's reply, a prompt string when no mode has been
        chosen or no file has been submitted for a file mode, or None when
        user_mode holds an unrecognised value.
    """
    # No mode
    if user_mode is None:
        return "Please select an option"

    # web link
    if user_mode == 3:
        # A message containing a URL means "index this page"; anything else
        # is treated as a question about the page indexed before.
        matchCon = re.search(r'https?://\S+', message)
        if matchCon:
            return web_link(link=matchCon.group(0))
        return web_link(query=message)

    # Both file modes need a file that was confirmed with the Submit button.
    if user_mode in (1, 2) and not path_file:
        return "Please upload a file and press Submit first."

    # text file mode
    if user_mode == 2:
        return text_file(path_file, message) #path to current .txt file uploaded for this mode

    # pdf file mode
    if user_mode == 1:
        return pdf_file(path_file, message)  #path to current .pdf file uploaded for this mode

#---------------------------------------------------------------------------------------------#

def process_files(argument):
    """Remember the submitted upload and hand back a visible, editable file widget.

    Args:
        argument: the value of the file input at the moment Submit was clicked;
            it is stored in the global `path_file` unchanged.

    Returns:
        A gr.File labelled "Upload File" that is visible and interactive.
    """
    global path_file
    path_file = argument

    upload_widget = gr.File(label="Upload File", visible=True, interactive=True)
    return upload_widget

#---------------------------------------------------------------------------------------------#

def change_mode(choice):
    """Record the selected mode and show or hide the upload controls.

    Args:
        choice: label picked in the radio group. "Web Link" selects mode 3,
            "Text File" mode 2, and anything else mode 1.

    Returns:
        A (file widget, Submit button, Cancel button) tuple; all three are
        hidden for the web-link mode and visible for the file modes.
    """
    global user_mode

    if choice == "Web Link":
        user_mode = 3
    elif choice == "Text File":
        user_mode = 2
    else:
        user_mode = 1

    # Only the two file modes need the upload widget and its buttons.
    needs_upload = user_mode != 3
    if needs_upload:
        file_widget = gr.File(label="Upload File",visible=True)
    else:
        file_widget = gr.File(visible=False)
    submit_widget = gr.Button("Submit", visible=needs_upload)
    cancel_widget = gr.Button("Cancel", visible=needs_upload)
    return file_widget, submit_widget, cancel_widget

#--------------------------------------------------------------------------------------------#

def cancel_upload():
    """Discard the current upload and reset the file widget.

    Forgets the stored `path_file` so later chat messages cannot be answered
    from a cancelled file, and returns a file widget update whose value is
    explicitly cleared.

    Returns:
        A gr.File labelled "Upload File", visible and interactive, with
        value=None.
    """
    global path_file
    path_file = None
    # value=None is passed explicitly: without it the update leaves the
    # previously selected file in the widget.
    return gr.File(value=None, label="Upload File", visible=True, interactive=True)

############################## INTERFACE CODE ###################################
# Build the page: a mode selector and upload controls on top, the chat below.
with gr.Blocks() as demo:
    #set up radio element and file input
    with gr.Row():
      radio = gr.Radio(
          ["PDF File", "Text File", "Web Link"], label="Select Mode"
      )

      # input = gr.Interface(process_files,inputs='files',outputs=None)
      with gr.Column():
        # Hidden until change_mode() selects a file mode.
        file_input = gr.File(label="Upload File", visible=False,interactive=True)

        #set up buttons
        with gr.Row():
            submit_btn = gr.Button("Submit", visible=False)
            cancel_btn = gr.Button("Cancel", visible=False)


    #add radio element event listener
    # change_mode() returns updates for the three outputs, in this order.
    radio.change(fn=change_mode, inputs=radio, outputs=[file_input,submit_btn,cancel_btn])

    # Set up submit button to process files
    submit_btn.click(fn=process_files, inputs=[file_input], outputs=[file_input])

    # Set up cancel button to clear file input
    cancel_btn.click(fn=cancel_upload, inputs=None, outputs=file_input)

    #set up Chat Interface
    # echo() receives every chat message and dispatches on the selected mode.
    gr.ChatInterface(
        fn=echo,
        title="Doc Bot",
    )

# debug=True keeps the cell running so errors and logs appear in the cell output.
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://aec8acc544384db240.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


training process, and the knowledge database it uses for augmentation. The capabilities of the attacker include the ability to manipulate the knowledge database used by the LLM.

[]
The key threat here is that the
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://aec8acc544384db240.gradio.live




In [None]:
def predict(input):
  """Run the global `model` on `input` and return its prediction.

  The original cell was not valid Python (nameless `def`, `//` comment, and a
  misspelled result variable), so it failed on Run All.

  Args:
    input: value handed to `model.predict` unchanged.

  Returns:
    Whatever `model.predict(input)` returns.
  """
  # TODO: vectorize the input
  output = model.predict(input)

  return output