In [1]:
!pip install requests pymupdf openai langchain langchain_community faiss-cpu tiktoken > /dev/null

In [25]:
import fitz  # PyMuPDF
import openai
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.chains import RetrievalQA
from google.colab import userdata
from langchain.chat_models import ChatOpenAI

# PDF text extraction helper (takes a file path)
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The ``page.get_text()`` result of each page, concatenated in page order.
        An empty string is returned when the document has no pages.
    """
    # Open through a context manager so the document is closed even if
    # extraction raises part-way through (the handle was previously leaked).
    with fitz.open(file_path) as doc:
        # join() builds the result once instead of re-allocating on every `+=`.
        return "".join(page.get_text() for page in doc)

## 2. RAG 모델 설정

In [26]:
# Set the OpenAI API key: the value is fetched with google.colab's userdata.get
# and exported through the OPENAI_API_KEY environment variable.
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

# Configure the text splitter
# NOTE(review): CharacterTextSplitter splits on its `separator` (presumably "\n\n" by
# default); if the extracted PDF text rarely contains it, chunks may end up larger
# than chunk_size — confirm against the actual chunk lengths.
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=50)  # chunk size and overlap settings

# Path to the local PDF file
pdf_file_path = '/content/2310.06825v1-1-6.pdf'  # change this path as appropriate

# Extract the text from the PDF
document_text = extract_text_from_pdf(pdf_file_path)

In [27]:
# Break the extracted text into small chunks with the configured splitter.
chunks = text_splitter.split_text(document_text)

# Each chunk becomes the page_content of its own Document.
documents = list(Document(page_content=piece) for piece in chunks)

# Create the embedding model and index the documents in a FAISS vector store.
embedding = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents=documents, embedding=embedding)

In [28]:
# Chat-style model (the current interface), created with temperature=0.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# Retrieval QA chain: the retriever is configured with k=2 over the FAISS store,
# and the chain is built with the "stuff" chain type.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 2}),
)

query = (
    "You are a helpful AI assistant tasked with summarizing a scientific paper. "
    "Summarize the main contributions, methods, and findings of the following paper "
    "in a concise paragraph."
)

# Chain.run() is deprecated in LangChain; invoke() takes the input as a dict and
# returns a dict whose "result" entry holds the generated answer.
summary = qa_chain.invoke({"query": query})["result"]
print(summary)

The paper introduces Mistral 7B, a 7-billion-parameter language model designed for high performance and efficiency in natural language processing. It outperforms the best open 13B model (Llama 2) across all benchmarks and surpasses the best released 34B model (Llama 1) in reasoning, mathematics, and code generation. Mistral 7B employs innovative techniques such as grouped-query attention (GQA) for faster inference and sliding window attention (SWA) to handle longer sequences efficiently. The model is fine-tuned to follow instructions, resulting in Mistral 7B – Instruct, which outperforms Llama 2 13B in human and automated benchmarks. The findings highlight that Mistral 7B achieves high performance while maintaining efficiency, suggesting a new approach to model design that balances capability and computational cost.
