# Workflow

![qa_flow](qa_flow.jpeg)

# Installing necessary modules


In [1]:
%pip install openai chromadb langchain tiktoken sentence_transformers InstructorEmbedding unstructured

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m71.7/73.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading chromadb-0.4.4-py3-none-any.whl (402 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m402.6/402.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.0.249-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━

# Loading the .txt files with DirectoryLoader


In [4]:
from langchain.document_loaders import DirectoryLoader

# Pick up every .txt file under the knowledge folder (the glob descends into subfolders).
loader = DirectoryLoader(
    '/content/drive/MyDrive/knowledge/',
    glob="**/*.txt",
)
data = loader.load()

# Splitting the data


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded documents into chunks (chunk_size=500) that share
# chunk_overlap=100 with their neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
all_splits = text_splitter.split_documents(data)

# Using HuggingFace embeddings

In [1]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Instructor-XL embedding model; the "device" kwarg requests CUDA.
embedding_model_kwargs = {"device": "cuda"}
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
    model_kwargs=embedding_model_kwargs,
)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


# Converting the chunks to embeddings and storing them with ChromaDB

In [6]:
from langchain.vectorstores import Chroma

# Build the Chroma vector store from the split documents, embedding them
# with the Instructor model.
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=instructor_embeddings,
)

# Initializing the HuggingFaceHub model "bigscience/bloom"

In [7]:
import os
from getpass import getpass

from langchain.chains import RetrievalQA
from langchain import PromptTemplate, HuggingFaceHub, LLMChain

# Keep the Hugging Face token out of the notebook source: take it from the
# HUGGINGFACEHUB_API_TOKEN environment variable, and only prompt for it
# (without echoing) when that variable is unset or empty.
hf_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass(
    "Hugging Face Hub API token: "
)

# Hosted BLOOM model accessed through the Hugging Face Hub wrapper.
llm = HuggingFaceHub(
    repo_id="bigscience/bloom",
    model_kwargs={"temperature": 0.1, "min_length": 1000},
    huggingfacehub_api_token=hf_api_token,
)

# Retrieving the best-matching documents using RetrievalQA

In [24]:
# "stuff"-type RetrievalQA chain over the vector store's retriever; the
# source documents are requested alongside each answer.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
)

# Answering the questions and saving the answers to an Excel file

In [22]:
import pandas as pd

# Read the question sheet and pull its "Question" column out as a list.
questions_path = "/content/drive/MyDrive/knowledge/SampleQuestions.xlsx"
df = pd.read_excel(questions_path)
questions = list(df['Question'])

In [28]:
# Add an "answers" column, initialised to the empty string on every row.
df["answers"] = ""

In [29]:
# Ask the QA chain each question and store its answer on the matching row.
# df.at writes directly into the frame. The chained form
# df['answers'][count] = ... assigns through an intermediate Series, which
# triggers SettingWithCopyWarning and leaves df unchanged under Copy-on-Write.
# NOTE(review): `row` is used as an index label, so this assumes df still has
# the default 0..n-1 index it got when it was read in.
for row, question in enumerate(questions):
  result = qa_chain({"query": question})
  df.at[row, 'answers'] = result['result']

In [33]:
# Write the frame (questions plus the filled-in answers column) to an Excel workbook.
output_path = "/content/drive/MyDrive/branceasnwers.xlsx"
df.to_excel(output_path)

# For chatting we can use ConversationalRetrievalChain

In [9]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# Conversation memory stored under the "chat_history" key, returned as messages.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)

# Conversational chain built from the same LLM and vector-store retriever,
# with the memory attached.
retriever = vectorstore.as_retriever()
chat = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory,
)

# To check accuracy, we can embed the actual answers and compute their cosine similarity with the predicted answers