## Notebook experiment: PDF loading, chunking, and hybrid (FAISS + BM25) retrieval

In [44]:
from typing import List, Optional, Sequence

import pymupdf4llm
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.parsers import RapidOCRBlobParser
from langchain_community.document_loaders.parsers import TesseractBlobParser
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser


In [32]:
# Configure the PDF loader: images are handed to RapidOCRBlobParser and
# tables are requested in markdown form.
# NOTE(review): the path below is absolute and machine-specific; consider
# deriving it from a configurable data directory so the notebook is portable.
loader = PyMuPDFLoader(
    file_path="/Users/arunekambaram/Desktop/RAG-ChatBot/data/GPT-4_VS_Human_translators (1).pdf",
    mode="single",  # presumably yields one Document for the whole file; confirm in loader docs
    extract_images=True,
    images_parser=RapidOCRBlobParser(),
    images_inner_format="html-img",
    extract_tables="markdown",
)


In [34]:
docs = loader.load()

# Fail with a readable message instead of a bare IndexError if nothing loaded.
assert docs, "loader.load() returned no documents"

# Print only a bounded preview; dumping the full page_content floods the
# notebook output and hides the narrative.
print(docs[0].page_content[:1000])

GPT-4 vs. Human Translators: A Comprehensive Evaluation of
Translation Quality Across Languages, Domains, and Expertise Levels
Jianhao Yan1,2∗
Pingchuan Yan3∗
Yulong Chen4∗
Judy Li5
Xianchao Zhu5
Yue Zhang2,6, 1 Zhejiang University
2 School of Engineering, Westlake University
3 University College London
4 University of Cambridge
5 Lan-Bridge Group
6 Institute of Advanced Technology, Westlake Institute for Advanced Study
elliottyan37@gmail.com
Abstract
This study comprehensively evaluates the
translation quality of Large Language Mod-
els (LLMs), specifically GPT-4, against hu-
man translators of varying expertise lev-
els across multiple language pairs and do-
mains. Through carefully designed annota-
tion rounds, we find that GPT-4 performs
comparably to junior translators in terms of
total errors made but lags behind medium
and senior translators. We also observe the
imbalanced performance across different lan-
guages and domains, with GPT-4’s transla-
tion capability gradually weake

In [37]:
# Text splitter used to cut the loaded document into retrieval-sized chunks.
# Separators are tried in the listed order: paragraph break, line break,
# sentence end, then single space.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,     # maximum chunk length (characters by default; confirm in LangChain docs)
    chunk_overlap=150,  # amount of text shared between neighbouring chunks
    separators=["\n\n", "\n", ".", " "],
)

# The original variable name was misspelled; keep it bound so that any cell
# still referring to `spiltter` continues to work.
spiltter = splitter

In [42]:
# Split the first loaded Document into chunks.
chunks = spiltter.split_documents([docs[0]])

# Report the chunk count and show a short sample rather than echoing every
# Document repr, which produces an unreadably large cell output.
print(f"{len(chunks)} chunks")
chunks[:3]

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-07-08T00:23:43+00:00', 'source': '/Users/arunekambaram/Desktop/RAG-ChatBot/data/GPT-4_VS_Human_translators (1).pdf', 'file_path': '/Users/arunekambaram/Desktop/RAG-ChatBot/data/GPT-4_VS_Human_translators (1).pdf', 'total_pages': 16, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-07-08T00:23:43+00:00', 'trapped': '', 'modDate': 'D:20240708002343Z', 'creationDate': 'D:20240708002343Z'}, page_content='GPT-4 vs. Human Translators: A Comprehensive Evaluation of\nTranslation Quality Across Languages, Domains, and Expertise Levels\nJianhao Yan1,2∗\nPingchuan Yan3∗\nYulong Chen4∗\nJudy Li5\nXianchao Zhu5\nYue Zhang2,6,\x001 Zhejiang University\n2 School of Engineering, Westlake University\n3 University College London\n4 University of Cambridge\n5 Lan-Bridge Group\n6 Institute of Advanced Technology, Westlake Institute for Advanced Study\nell

In [47]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-4.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [49]:
# Sentence-embedding model wrapper, identified by its model name.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)



In [None]:
def get_retriever(
    chunks: List[Document],
    embedding_model: Optional[HuggingFaceEmbeddings] = None,
    k: int = 4,
    weights: Sequence[float] = (0.5, 0.5),
) -> EnsembleRetriever:
    """Build a hybrid retriever combining a FAISS vector store with BM25.

    Args:
        chunks: Documents to index in both the FAISS store and the BM25
            retriever.
        embedding_model: Embedding model used to build the FAISS index. When
            omitted, a ``HuggingFaceEmbeddings`` instance for
            ``"all-MiniLM-L6-v2"`` is created, matching the original behaviour.
        k: Number of results requested from each underlying retriever.
        weights: Two weights passed to ``EnsembleRetriever``, in the order
            (FAISS retriever, BM25 retriever).

    Returns:
        An ``EnsembleRetriever`` wrapping the FAISS and BM25 retrievers.

    Raises:
        ValueError: If ``chunks`` is empty or ``weights`` does not contain
            exactly two values.
    """
    # Both indexes need at least one document; fail early with a clear message
    # instead of an obscure error from deep inside the index builders.
    if not chunks:
        raise ValueError("`chunks` must contain at least one Document")
    if len(weights) != 2:
        raise ValueError("`weights` must contain exactly two values (FAISS, BM25)")

    # Allow callers to reuse an already-loaded model instead of re-creating it
    # on every call.
    if embedding_model is None:
        embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    vectorstore = FAISS.from_documents(chunks, embedding_model)
    dense_retriever = vectorstore.as_retriever(search_kwargs={"k": k})

    bm25_retriever = BM25Retriever.from_documents(chunks)
    bm25_retriever.k = k

    return EnsembleRetriever(
        retrievers=[dense_retriever, bm25_retriever],
        weights=list(weights),
    )
