In [1]:
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate
from dotenv import load_dotenv
from tqdm.notebook import tqdm
import time
import os
import json

In [2]:
# Load environment variables via python-dotenv, then read the API token that
# is later handed to the Hugging Face endpoint (None when ACCESS_TOKEN is unset).
load_dotenv()
access_token = os.getenv("ACCESS_TOKEN")

In [3]:
# Ordered separators handed to the text splitter: blank line, newline,
# single space, and finally the empty string.
SEPARATORS = ["\n\n", "\n", " ", ""]

In [4]:
# Splitter applied to every PDF page before the chunks are sent to the model.
# Chunks are capped at 1000 characters with a 100-character overlap, are
# whitespace-stripped, and carry their start index in the metadata.
text_splitter = RecursiveCharacterTextSplitter(
    separators=SEPARATORS,
    chunk_size=1000,
    chunk_overlap=100,
    strip_whitespace=True,
    add_start_index=True,
)

In [5]:
# Instruction prompt sent to the model for every chunk. `{context}` is the
# only placeholder; it is filled in when the chain is invoked with a
# `context` key.
template = """
You are a factual language model trained to convert bodies of text into a single, well-formed question and its corresponding answer.
When processing the text, prioritize factual information and avoid making claims of sentience or consciousness.
Present the question and answer in a single JSON object. 
context: {context}
"""
# Wrap the raw string in a PromptTemplate so it can be piped into the LLM.
prompt = PromptTemplate.from_template(template)

In [6]:
# Hosted instruct model used to generate one question/answer pair per chunk.
repo_id = "mistralai/Mistral-7B-Instruct-v0.3"

# NOTE(review): `max_length` is forwarded through `model_kwargs`; the endpoint
# wrapper also exposes a dedicated `max_new_tokens` field, which may be the
# intended way to cap the response length - confirm against the
# HuggingFaceEndpoint documentation for the installed version.
llm = HuggingFaceEndpoint(
    huggingfacehub_api_token=access_token,
    repo_id=repo_id,
    temperature=0.5,
    model_kwargs={"max_length": 128},
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/zeus/.cache/huggingface/token
Login successful


In [7]:
# Compose the prompt template and the endpoint LLM into a single runnable;
# it is invoked with a dict that supplies the `context` placeholder.
llm_chain = prompt | llm

In [8]:
def is_json(data):
    """Report whether ``data`` can be parsed as JSON.

    Args:
        data: Candidate JSON text (``str``, ``bytes`` or ``bytearray``).
            Any other type, including ``None``, is treated as "not JSON".

    Returns:
        ``True`` when ``json.loads(data)`` succeeds, ``False`` otherwise.
        Any valid JSON value counts (object, array, number, string, ...).
    """
    try:
        json.loads(data)
    except (ValueError, TypeError):
        # ValueError covers malformed text (json.JSONDecodeError subclasses
        # it); TypeError is raised for non-text input such as None, which
        # previously escaped and aborted the caller.
        return False
    return True

In [9]:
def get_pdf_list(path):
    """Return the names of the PDF files located directly inside ``path``.

    Args:
        path: Directory to scan (not searched recursively).

    Returns:
        A sorted list of file names (not full paths) whose extension is
        ``.pdf``, compared case-insensitively. Sub-directories and other
        file types are skipped. The list is empty when ``path`` does not
        exist or is not a directory.
    """
    # isdir (rather than exists) so that passing a regular file yields an
    # empty list instead of an error from os.listdir.
    if not os.path.isdir(path):
        return []
    return sorted(
        name
        for name in os.listdir(path)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(path, name))
    )

In [10]:
def get_text(pdf_path):
    """Extract the text of a PDF, one ``Document`` per page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``Document`` objects in page order, each holding the text
        extracted from one page as its ``page_content``.
    """
    # Read the file the caller asked for; the path used to be hard-coded, so
    # every call returned the same book regardless of ``pdf_path``.
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Extract inside the ``with`` block: the pages are pulled from the
        # still-open file handle, which is closed even if parsing raises.
        return [
            Document(page_content=page.extract_text())
            for page in reader.pages
        ]

In [19]:
# Generate one question/answer candidate per text chunk of every PDF found in
# ./pdfs and keep only the responses that parse as JSON.
pdf_dir = "./pdfs"
# Pause after each model call, in seconds.
# NOTE(review): presumably to stay under the endpoint's rate limit - confirm.
request_pause_seconds = 12

# Raw model responses (JSON text), in generation order.
outputs = []
files = get_pdf_list(pdf_dir)
for pdf in files:
    pdf_path = os.path.join(pdf_dir, pdf)
    book = get_text(pdf_path)
    # split_documents handles each page separately, so this produces the same
    # chunks as splitting page by page and concatenating the results.
    proccessed_docs = text_splitter.split_documents(book)
    for chunk in proccessed_docs:
        # Pass the chunk's text, not the Document object itself: formatting
        # the whole object would put its string representation (including
        # the metadata) into the prompt instead of the plain text.
        output = llm_chain.invoke({"context": chunk.page_content})
        if is_json(output):
            outputs.append(output)
        time.sleep(request_pause_seconds)

In [None]:
# Persist the collected model responses as a JSON array in outputs.json.
serialized_outputs = json.dumps(outputs)
with open('outputs.json', 'w') as out_file:
    out_file.write(serialized_outputs)