In [1]:
!pip install -qqq bitsandbytes==0.40.0 --progress-bar off
!pip install -qqq torch==2.0.1 --progress-bar off
!pip install -qqq transformers==4.30.0 --progress-bar off
!pip install -qqq accelerate==0.21.0 --progress-bar off
!pip install -qqq xformers==0.0.20 --progress-bar off
!pip install -qqq einops==0.6.1 --progress-bar off
!pip install -qqq langchain==0.0.233 --progress-bar off
!pip install unstructured --progress-bar off
!pip install chromadb --progress-bar off
!pip install --qqq sentence_transformers --progress-bar off

Collecting unstructured
  Downloading unstructured-0.8.1-py3-none-any.whl (1.4 MB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Collecting msg-parser (from unstructured)
  Downloading msg_parser-1.2.0-py2.py3-none-any.whl (101 kB)
Collecting pdf2image (from unstructured)
  Downloading pdf2image-1.16.3-py3-none-any.whl (11 kB)
Collecting pdfminer.six (from unstructured)
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
Collecting pypandoc (from unstructured)
  Downloading pypandoc-1.11-py3-none-any.whl (20 kB)
Collecting python-docx (from unstructured)
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-pptx (from unstructured)
  Downloading python-pptx-0.6.21.tar.gz (10.1 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Collecting olefile>=0.

In [2]:
import warnings
from typing import List

import torch
from langchain import PromptTemplate
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.llms import HuggingFacePipeline
from langchain.schema import BaseOutputParser
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    pipeline,
    GenerationConfig,
    TextStreamer
)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cpu.so
/usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [3]:
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

Load the model

In [4]:
MODEL_NAME = "facebook/opt-350m"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, trust_remote_code=True, load_in_4bit=True, device_map="auto"
)

model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

ValueError: ignored

In [5]:
generation_config

GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 2,
  "eos_token_id": 2,
  "pad_token_id": 1,
  "transformers_version": "4.30.0"
}

Note that we're loading the model in 8-bit mode. This will reduce the memory footprint and speed up the inference. We're also using the device_map parameter to load the model on the GPU.

Small Inference

In [6]:
Question = ("Which programming languages are more suited for beginners? ")

# prompt = f"""
#  ## Instruction : {Question}
#  ## Response :
#  """.strip()

prompt = """
The following is a friendly conversation between a human and an AI. The AI is
talkative and provides lots of specific details from its context.

Current conversation:

Human: Who is Spiderman?
AI:
""".strip()

In [7]:
input_ids = tokenizer(prompt,return_tensors='pt').input_ids
input_ids = input_ids.to(model.device)
# with torch.inference_model(output = model.generate(input=input_ids, temperature = 0.7, max_new_tokens = 1000))
output = model.generate(input_ids, temperature=0.7, max_new_tokens=1000)

In [8]:
print(input_ids)

tensor([[    2,   133,   511,    16,    10,  5192,  1607,   227,    10,  1050,
             8,    41,  4687,     4,    20,  4687,    16, 50118, 26594,  3693,
             8,  1639,  3739,     9,  2167,  1254,    31,    63,  5377,     4,
         50118,  1437, 50118, 42124,  1607,    35, 50118,  1437, 50118, 33837,
            35,  3394,    16, 15749,   397,   116, 50118, 15238,    35]],
       device='cuda:0')


In [9]:
print(tokenizer.decode(output[0]))

</s>The following is a friendly conversation between a human and an AI. The AI is
talkative and provides lots of specific details from its context.
 
Current conversation:
 
Human: Who is Spiderman?
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Spiderman.
AI: Sp

In [10]:
generation_config

GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 2,
  "eos_token_id": 2,
  "pad_token_id": 1,
  "transformers_version": "4.30.0"
}

Embed the documents

In [21]:
!pip install sentence_transformers
import sentence_transformers
# embeddings = HuggingFaceEmbeddings(
#     model_name = "intfloat/e5-base-v2"
# )
embeddings = HuggingFaceEmbeddings()
#     model_name = "facebook/opt-350m"
# embeddings = HuggingFaceEmbeddings(
#     model_name="sentence-transformers/all-MiniLM-L6-v2"
# )
# embeddings = HuggingFaceEmbeddings(
#     model_name = "facebook/opt-350m"

# )

NotImplementedError: ignored

We'll use a custom configuration for the text generation:

In [None]:
generation_config = model.generation_config
generation_config.temperature = 0
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = 256
generation_config.use_cache = False
generation_config.repetition_penalty = 1.7
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id
generation_config

Setting the temperature to 0 to get deterministic results.  We'll also set the repetition_penalty to 1.7 to reduce the chance (but not completely remove the occurrences) of the model repeating itself.

In [None]:
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, use_multiprocessing=False)

Trying using the huggingface pipeline

In [None]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2048,
    temperature=0,
    top_p=0.95,
    repetition_penalty=1.15,
    generation_config=generation_config,
    streamer=streamer,
    batch_size=1
)

In [None]:
llm=HuggingFacePipepine(pipeline=pipe)

In [None]:
response= llm(prompt  )

Loading the data

In [None]:
Path = "/content/pdfs"
loader = UnstructuredPDFLoader("/content/pdfs/bd.pdf")
loader

In [None]:
documents = loader.load_and_split()
documents

In [None]:
text_splitter = CharacterTextSplitter(chunk_size= 1000, chunk_overlap=20)
texts = text_splitter.split_documents(documents)

In [None]:
db = Chroma.from_documents(texts,embeddings)

In [None]:
# db.similarity_search()

##Conversational Chain

In [None]:
template= """
  You are a product analyst chatbot that is taling to product managers. use only the chat history and the following information
  {context}
  to answer in a helpful manner to the question. if you don't know the answer - say that you don't know.
  keep your replies compassionate and informative
  {chat history}
  ## Input : {question}
  ## Response :
""".strip()

In [None]:
prompt = PromptTemplate(input_variables=["context", "question", "chat_history"], template = template)

In [None]:
memory = ConversationBufferMemory(
    memory_key="chat_history",
    human_prefix="## Input",
    ai_prefix="## Response",
    output_key="answer",
    return_messages=True
)

In [None]:
chain = ConversationalretrievalChain.from_llm(
    llm,
    chain_type="stuff",
    retriever=db.as_retriever(),
    memory=memory,
    combine_docs_chain_kwargs={"prompt":prompt},
    return_source_documents=True,
    verbose=True
)

In [None]:
question = "What are rhe features of Brand Protector?"
answer = chain(question)

##QA Chain with Memory

In [None]:
memory = ConversationBufferMemory(
    memory_key="chat_history",
    human_prefix="## Input",
    ai_prefix="## Response",
    input_key="question"
    output_key="output_text",
    return_messages=False
)

In [None]:
chain = load_qa_chain(
    llm,
    chain_type="stuff",
    prompt=prompt,
    memory=memory,
    verbose=True
)

In [None]:
question = "What are rhe features of Brand Protector?"
docs = db.similarity_search(question)
answer = chain({"input_documents":docs, "question":question})

##Support Chatbot

In [None]:
DEFAULT_TEMPLATE = """
  ## Instruction : You are a product analyst chatbot that is taling to product managers. use only the chat history and the following information
  {context}
  to answer in a helpful manner to the question. if you don't know the answer - say that you don't know.
  keep your replies compassionate and informative
  {chat history}
  ## Input : {question}
  ## Response :
""".strip()

class Chatbot:
    def __init__(
        self,
        text_pipeline: HuggingFacePipeline,
        embeddings: HuggingFaceEmbeddings,
        documents_dir:Path,
        prompt_template: str = DEFAULT_TEMPLATE,
        verbose: bool = True
    ):
        prompt = PromptTemplate(
            input_variables=["context", "question", "chat_history"],
            template=prompt_template,
        )
        self.chain = self._create_chain(text_pipeline, prompt, verbose)
        self.db = self._embed_data(documents_dir, embeddings)

    def _create_chain(
        self,
        text_pipeline: HuggingFacePipeline,
        prompt= PromptTemplate,
        verbose: bool = True:
    ):
        memory = ConversationBufferMemory(
          memory_key="chat_history",
          human_prefix="## Input",
          ai_prefix="## Response",
          input_key="question"
          output_key="output_text",
          return_messages=False
        )

        return load_qa_chain(
            text_pipeline,
            chain_type = "stuff",
            prompt=prompt,
            memory=memory,
            verbose=verbose
        )

    def _embed_data(
        self, documents_dir: Path, embeddings: HuggingFaceEmbeddings
    ) -> Chroma:
        loader = loader = UnstructuredPDFLoader("/content/pdfs/bd.pdf")
        documents = loader.load_and_split()
        text_splitter = CharacterTextSplitter(chunk_size= 1000, chunk_overlap=20)
        texts = text_splitter.split_documents(documents)
        return Chroma.from_documents(texts, embeddings)


    def __call__(self, user_input: str) -> str:
      docs = self,db,similarity_search(user_input)
      return self.chain.run({"input_documents": docs, "question": user_input})

In [None]:
chatbot = Chatbot(llm, embeddings,"/content/pdfs/bd.pdf")

In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

while True:
  user_input = input("You: ")
  if user_input.lower() in ["bye", "goodbye"]:
      break
  answer = chatbot(user_input)
  print()