In [1]:
!pip3 install --upgrade pip

In [2]:
!pip install ipywidgets

In [3]:
!pip3 install ragas==0.1.7

In [4]:
import subprocess
import sys

# Install pysqlite3-binary with the pip of the interpreter running this kernel
# (sys.executable); check_call raises if the installation fails.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "pysqlite3-binary"]
)
# Import pysqlite3, then re-register that module object under the name "sqlite3"
# so that any `import sqlite3` executed after this cell resolves to pysqlite3.
# Modules that imported sqlite3 before this cell ran keep their old reference.
# NOTE(review): presumably a workaround for chromadb (imported further down)
# needing a newer SQLite than the system one — confirm.
__import__("pysqlite3")
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")


In [5]:
!pip install -U transformers sentence-transformers

In [6]:
import os

# Credentials must come from the environment, never from the notebook source.
# setdefault keeps a key that is already exported and only falls back to the
# empty placeholder when the variable is unset (plain assignment used to
# overwrite a valid key with "").
os.environ.setdefault("OPENAI_API_KEY", "")
os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', "")

In [7]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
#from ragas.langchain.evalchain import RagasEvaluatorChain
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.chat_models.huggingface import ChatHuggingFace

In [8]:
import uuid
from langchain.retrievers.multi_vector import SearchType
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document

In [9]:
from langchain_chroma import Chroma
import chromadb
import chromadb.config

In [10]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

In [11]:
from langchain.storage import InMemoryByteStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.prompts import PromptTemplate
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [12]:
from ragas import evaluate
from ragas.metrics import answer_relevancy, context_relevancy, context_recall, faithfulness

In [13]:
from langchain.llms import HuggingFaceHub
from langchain.chat_models import ChatOpenAI


In [14]:
from datasets import Dataset, load_dataset
from tqdm import tqdm
import re
import ast
import pandas as pd

### Settings

In [15]:
# Experiment settings.
#   model: Hugging Face model id used for generation. Ids listed as alternatives:
#          microsoft/Phi-3-mini-4k-instruct, mistralai/Mistral-7B-Instruct-v0.2,
#          meta-llama/Meta-Llama-3-8B-Instruct
#   type:  'multi' or 'single' (selects the retriever in the Retrieval section)
config = dict(
    model='meta-llama/Meta-Llama-3-8B-Instruct',
    type='single',
)

### Document Settings

In [16]:
# Parse the source PDF into LangChain documents and show how many were produced.
loader = PyPDFLoader(file_path="./pdfdata_sum.pdf")
documents = loader.load()
len(documents)

In [17]:
# Split the loaded documents into non-overlapping chunks (target size 1000)
# and show how many chunks resulted.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
texts = text_splitter.split_documents(documents)
len(texts)

In [18]:
# Sentence-embedding model used to index the chunks; it runs on the GPU and
# its output vectors are normalized.
embeddings = HuggingFaceEmbeddings(
    model_name='jhgan/ko-sbert-nli',
    model_kwargs=dict(device='cuda'),
    encode_kwargs=dict(normalize_embeddings=True),
)

# Index the chunks in a Chroma vector store.
# (A FAISS index was tried previously: FAISS.from_documents(texts, embeddings).)
db = Chroma.from_documents(documents=texts, embedding=embeddings)

### Model Settings

In [19]:
# Text-generation pipeline for the model selected in `config`.
# Sampling is disabled (do_sample=False), generation is capped at 1024 new
# tokens, and a light repetition penalty is applied.
llm = HuggingFacePipeline.from_model_id(
    task="text-generation",
    model_id=config['model'],
    device=0,
    pipeline_kwargs=dict(
        max_new_tokens=1024,
        repetition_penalty=1.03,
        do_sample=False,
    ),
)

### MultiVector Retrieval

In [20]:
def smaller_chunks(docs):
    """Build a MultiVectorRetriever over small child chunks of ``docs``.

    Every document in ``docs`` gets a fresh UUID. The document is split into
    child chunks (``chunk_size=400``), each child is tagged with its parent's
    UUID under the ``"doc_id"`` metadata key and added to a Chroma collection
    named ``"full_documents"``. The parent documents themselves are stored in
    an in-memory byte store under the same UUIDs.

    Args:
        docs: A re-iterable sequence of documents (it is traversed more than
            once) accepted by ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The populated ``MultiVectorRetriever`` (``search_kwargs={"k": 1}``).
    """
    id_key = "doc_id"

    # The vectorstore to use to index the child chunks
    vectorstore = Chroma(
        collection_name="full_documents",
        embedding_function=HuggingFaceEmbeddings(
            model_name='jhgan/ko-sbert-nli',
            model_kwargs={'device': 'cuda'},
            encode_kwargs={'normalize_embeddings': True},
        ),
    )

    # The retriever (empty to start); parent documents live in the byte store
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        byte_store=InMemoryByteStore(),
        id_key=id_key,
        search_kwargs={"k": 1},
    )

    doc_ids = [str(uuid.uuid4()) for _ in docs]

    # The splitter to use to create smaller chunks
    child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

    sub_docs = []
    for parent_id, doc in zip(doc_ids, docs):
        children = child_text_splitter.split_documents([doc])
        for child in children:
            child.metadata[id_key] = parent_id
        # Extend once per parent, after all children are tagged. Extending
        # inside the loop above indexed every child once per sibling.
        sub_docs.extend(children)

    retriever.vectorstore.add_documents(sub_docs)
    retriever.docstore.mset(list(zip(doc_ids, docs)))

    return retriever

### Retrieval

In [22]:
# Build the retriever selected by config['type']; both variants use MMR search.
if config['type'] == 'single':  # plain vector store
    retriever = db.as_retriever(search_type='mmr', search_kwargs={"k": 1})

elif config['type'] == 'multi':  # multi-vector retriever
    retriever = smaller_chunks(texts)
    retriever.search_type = SearchType.mmr

else:
    # Fail loudly: without this branch `retriever` would stay undefined, or
    # silently keep the value from an earlier run of this cell.
    raise ValueError(
        f"config['type'] must be 'single' or 'multi', got {config['type']!r}"
    )

In [23]:
# Echo the active experiment settings so they are recorded next to the outputs below.
print(config)

In [24]:
# Prompt template for the RAG chain. The text is spelled out line by line so
# that the trailing spaces and the final "Answer:\n" are explicit;
# extract_answer() further down searches for that "Answer:\n" marker.
template = (
    "You are an assistant for question-answering tasks. \n"
    "Use the following pieces of retrieved context to answer the question. \n"
    "If you don't know the answer, just say that you don't know. \n"
    "Use two sentences maximum and keep the answer concise.\n"
    "Answer should be written in KOREAN. \n"
    "\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:\n"
)

prompt = PromptTemplate.from_template(template)

In [25]:
# Setup RAG pipeline
def _format_docs(docs):
    """Join the page_content of the retrieved documents into one plain-text context."""
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever output is piped through _format_docs so that {context} gets the
# chunk text itself. Passing the retriever directly would interpolate the repr
# of a list of Document objects (metadata, escaped newlines) into the prompt.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

### Preparing Evaluation Data

In [26]:
# Load the evaluation CSV with the Hugging Face "csv" builder; the cells below
# read its rows from the 'train' split.
testset = load_dataset(path="csv", data_files="dataset_sum.csv")

In [27]:
# Pull the evaluation columns out of the 'train' split once, rather than
# re-reading the ground_truth column on every loop iteration.
train_split = testset['train']
questions = train_split['question']
context = train_split['contexts']

# One single-element list per question; a missing ground truth becomes the
# literal string 'None'.
ground_truths = [
    ['None' if ground_truth is None else ground_truth]
    for ground_truth in train_split['ground_truth']
]

In [28]:
# Each entry of the CSV 'contexts' column is a Python literal stored as text;
# parse every entry back into a Python object and display the result.
contexts = list(map(ast.literal_eval, context))
contexts

In [29]:
# Display the questions read from the evaluation CSV.
questions

In [30]:
# Display the ground truths (one single-element list per question).
ground_truths

### Preparing for Evaluation

In [31]:
def extract_answer(text):
    """Return the text that follows the first ``"Answer:\\n"`` marker in ``text``.

    The capture is as short as possible: it stops before the first blank line,
    before the word ``Question``, or at the end of the string, whichever comes
    first.

    Args:
        text: Raw model output to search.

    Returns:
        The captured answer string (possibly empty), or ``None`` when ``text``
        contains no ``"Answer:\\n"`` marker.
    """
    found = re.search(r'Answer:\n(.*?)(?=\n\n|Question|$)', text, re.DOTALL)
    return found.group(1) if found else None

In [32]:
# Inference: run every question through the RAG chain and keep the extracted answer.
answers = []
for query in tqdm(questions, desc="Processing queries"):
    query_context_answer = rag_chain.invoke(query)
    answer = extract_answer(query_context_answer)
    if answer is None:
        # No "Answer:\n" marker in the output (e.g. the pipeline returned only
        # the generated continuation). Fall back to the raw output so that no
        # None ends up in the evaluation dataset.
        answer = query_context_answer.strip()
    answers.append(answer)
    print(query_context_answer)
answers

In [None]:
# Assemble the evaluation records column-wise.
# NOTE(review): `contexts` is the column parsed from the CSV, not the documents
# returned by `retriever` — confirm that this is what the context metrics
# should be computed on.
data = dict(
    question=questions,
    answer=answers,
    contexts=contexts,
    ground_truths=ground_truths,
)

### Save the generated set as cached data

In [None]:
# Tabular view of the generated set (one row per question).
df_data = pd.DataFrame(data)
df_data

In [None]:
# Cache file name: "testset_<first 6 characters of the model id>_<type>.csv".
name = "".join(("testset_", config['model'][:6], "_", config['type'], ".csv"))
name

In [None]:
# Write the generated set to the file named in the previous cell, without the index column.
# NOTE(review): `name` is reassigned in the Evaluation section; running this cell
# after that one would write to the results file name instead.
df_data.to_csv(name,index=False)

In [None]:
# Wrap the column dict in a Hugging Face Dataset, the input passed to evaluate() below.
dataset = Dataset.from_dict(data)

### Evaluation

In [None]:
# Score the generated set with the four ragas metrics imported above.
# raise_exceptions=False is passed so that evaluate() is asked not to raise on
# failures.
result = evaluate(
    dataset=dataset,
    metrics=[context_relevancy, context_recall, faithfulness, answer_relevancy],
    raise_exceptions=False,
)

# Per-row scores as a DataFrame, displayed as the cell output.
df = result.to_pandas()
df

In [None]:
# Display the object returned by evaluate().
result

In [None]:
# Results file name: "<first 6 characters of the model id>_<type>.csv".
name = "".join((config['model'][:6], "_", config['type'], ".csv"))
name

In [None]:
# Write the per-row metric scores to the file named in the previous cell, without the index column.
df.to_csv(name,index=False)