In [42]:
from pypdf import PdfReader
import torch
import PyPDF2
from io import BytesIO
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import gradio as gr
import time

from langchain.memory import ConversationBufferMemory


from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain.document_loaders import PyPDFDirectoryLoader

CHUNK_SIZE = 1000
# Using HuggingFaceEmbeddings with the chosen embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",model_kwargs = {"device": "cuda"})

# transformer model configuration
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)


def load_llm():

    model_id = "Deci/DeciLM-6b"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id,
                                             trust_remote_code=True,
                                             device_map = "auto",
                                             quantization_config=quant_config)
    pipe = pipeline("text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    temperature=0,
                    num_beams=5,
                    no_repeat_ngram_size=4,
                    early_stopping=True,
                    max_new_tokens=50,
                )
    
    llm = HuggingFacePipeline(pipeline=pipe)

    return llm

In [2]:
pdf_paths = "/home/brianmutea/Gradio-app-Chat-with-Multiple-PDFs-using-LangChain-an-DeciLM-6b-instruct/PDFs"

loader = PyPDFDirectoryLoader(
    path= pdf_paths,
    glob="*.pdf"
)
documents=loader.load()

In [3]:
documents

[Document(page_content='Moûsai : Efficient Text-to-Music Diffusion Models\nFlavio Schneider∗\nETH Zürich\nflavio.schneider.97@gmail.comOjasv Kamal∗\nIIT Kharagpur\nkamalojasv2000@gmail.com\nZhijing Jin†\nMPI for Intelligent Systems & ETH Zürich\njinzhi@ethz.chBernhard Schölkopf†\nMPI for Intelligent Systems\nbs@tue.mpg.de\nAbstract\nRecent years have seen the rapid development\nof large generative models for text; however,\nmuch less research has explored the connec-\ntion between text and another “language” of\ncommunication – music . Music, much like text,\ncan convey emotions, stories, and ideas, and\nhas its own unique structure and syntax. In\nour work, we bridge text and music via a text-\nto-music generation model that is highly ef-\nficient, expressive, and can handle long-term\nstructure. Specifically, we develop Moûsai , a\ncascading two-stage latent diffusion model that\ncan generate multiple minutes of high-quality\nstereo music at 48kHz from textual descrip-\ntions. Moreov

In [4]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE,
                                                chunk_overlap=100)

splits = text_splitter.split_documents(documents)

# length of all splits

print(f"We have, {len(splits)} chunks in memory")

We have, 119 chunks in memory


In [5]:
vectorstore_db = FAISS.from_documents(splits, embeddings)

In [6]:
retriever = vectorstore_db.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [7]:
retrieved_relevant_docs = retriever.get_relevant_documents(
    "What is a Transformer?"
)

print(f"Retrieved documents: {len(retrieved_relevant_docs)}")
f"Page content of first document:\n {retrieved_relevant_docs[0].page_content}"

Retrieved documents: 6


'Page content of first document:\n Figure 1: The Transformer - model architecture.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\nrespectively.\n3.1 Encoder and Decoder Stacks\nEncoder: The encoder is composed of a stack of N= 6 identical layers. Each layer has two\nsub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-\nwise fully connected feed-forward network. We employ a residual connection [ 11] around each of\nthe two sub-layers, followed by layer normalization [ 1]. That is, the output of each sub-layer is\nLayerNorm( x+ Sublayer( x)), where Sublayer( x)is the function implemented by the sub-layer\nitself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding\nlayers, produce outputs of dimension dmodel = 512 .'

In [8]:
retrieved_relevant_docs = retriever.get_relevant_documents(
    "What is a melody?"
)

print(f"Retrieved documents: {len(retrieved_relevant_docs)}")
f"Page content of first document:\n {retrieved_relevant_docs[0].page_content}"

Retrieved documents: 6


'Page content of first document:\n repetitiveness of the melody. A more rhyth-\nmic and repetitive melody is considered better,\nwhile the opposite is true for a less rhythmic\nmelody.\n7.Harmoniousness involves multiple notes\nplayed together to support the melody. Evalu-\nate if these notes are in sync and enhance the\nFigure 7: Mel spectrogram comparison between the true\nsamples (top) and the auto-encoded samples (bottom);\ncf. text.\neffect of the melody. Higher scores should be\ngiven for good harmony and lower for poor\nharmony.\n8.It is recommended view youtube videos: this\nor this short video explaining melody and har-\nmony\n9.This folder also contains a spreadsheet by the\nname “Response_Task_1.xlsx”. Remember\nto provide ratings (out of 5) for each aspect\nof your evaluation in the file against appropri-\nate folder number. Feel free to listen to each\nsample as many times before rating them.\nD Exploring Variations of the Model\nArchitecture and Training Setup\nD.1 High-F

In [9]:
custom_prompt_template = """You are an assistant for question-answering tasks. Use the uploaded files to answer the question at the end.
If you don't know the answer just say you do not know and do not try to make up the answer. Keep the answer as concise as possible.
Context= {context}
History = {history}
Question= {question}
Helpful Answer:
"""

prompt = PromptTemplate(template=custom_prompt_template,
                        input_variables=["question", "context", "history"])

In [45]:
qa_chain_with_memory = RetrievalQA.from_chain_type(llm=load_llm(), chain_type='stuff',
                                                   retriever = vectorstore_db.as_retriever(),
                                                   return_source_documents = True,
                                                   chain_type_kwargs = {"prompt": prompt,
                                                                        "memory": ConversationBufferMemory(
                                                                            input_key="question",
                                                                            memory_key="history",
                                                                            return_messages=True)})

Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.09s/it]


In [44]:
query = "What is a decoder in transformers?"
qa_chain_with_memory.run({"query": query})

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'The decoder is the part of the Transformer model that generates the answer to the question. It is composed of an encoder and a decoder.\nThe encoder is responsible for encoding the question into a representation that can be used by the'

In [35]:
qa_chain_with_memory({"query": "What are its components?"})["result"]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'1. Encoder\n2. Decoder\n3. Self-Attention\n4. Multi-Head Attention\n5. Residual Connection\n6. Layer Normalization\n7. Fully Connected Layers\n'

In [38]:
qa_chain_with_memory({"query": "What is a chair"})

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'query': 'What is a chair',
 'result': 'A chair is a piece of furniture with a flat bottom and raised sides, used for sitting.',
 'source_documents': [Document(page_content='the Eleventh International Conference on Language Re-\nsources and Evaluation (LREC 2018) , Miyazaki, Japan.\nEuropean Language Resources Association (ELRA).\nJeanette Bicknell. 2002. Can music convey semantic\ncontent? a kantian approach. The Journal of Aesthetics\nand Art Criticism , 60(3):253–261.\nZalán Borsos, Raphaël Marinier, Damien Vincent,\nEugene Kharitonov, Olivier Pietquin, Matthew Shar-\nifi, Olivier Teboul, David Grangier, Marco Tagliasac-\nchi, and Neil Zeghidour. 2022. AudioLM: A lan-\nguage modeling approach to audio generation. CoRR ,\nabs/2209.03143.\nNicolas Boulanger-Lewandowski, Yoshua Bengio, and\nPascal Vincent. 2012. Modeling temporal dependencies\nin high-dimensional sequences: Application to poly-\nphonic music generation and transcription.\nTom Brown, Benjamin Mann, Nick Ryder, Melanie\