In [None]:
!pip install \
    "pinecone" \
    "langchain-pinecone" \
    "langchain-openai" \
    "langchain-text-splitters" \
    "langchain"

In [None]:
import os
import time
import logging
from os import path, makedirs, walk
from uuid import uuid4
from typing import List, Optional
from transformers import pipeline
from PyPDF2 import PdfReader, PdfWriter

from pinecone import Pinecone, ServerlessSpec, Index
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_pinecone import PineconeEmbeddings, PineconeVectorStore
from langchain_openai import ChatOpenAI
from timy import timer
from llama_parse import LlamaParse


# Configure root logging at INFO level. Note that basicConfig's default
# handler writes to stderr, not stdout.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Default values used by the configuration classes in this file.
DEFAULT_CLOUD = 'aws'
DEFAULT_REGION = 'us-east-1'
DEFAULT_TOP_K = 5
DEFAULT_NUM_WORKERS = 5
DEFAULT_LANGUAGE = 'pt'
DEFAULT_RESULT_TYPE = "markdown"
DEFAULT_PINECONE_MODEL_NAME = 'multilingual-e5-large'
# Default chat model for ChatOpenAIConfig. This name was referenced as a
# default argument but never defined, which raised NameError as soon as the
# class was declared.
# NOTE(review): confirm this is the model the project intends to use.
DEFAULT_OPENAI_MODEL = 'gpt-4o-mini'

# Prompt used by the document-combining chain; pulled from the LangChain hub
# when this cell runs.
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

def get_files_with_extension(folder_paths: List[str], extension: str) -> List[str]:
    """Recursively collect files whose names end with ``extension``.

    Folders are walked in the order given. Each match is returned as the
    containing directory joined with the file name.

    :param folder_paths: Root folders to search
    :param extension: Suffix a file name must end with (e.g. ``".pdf"``)
    :return: Paths of all matching files
    """
    matches: List[str] = []
    for top in folder_paths:
        for current_dir, _subdirs, names in walk(top):
            matches.extend(
                path.join(current_dir, name)
                for name in names
                if name.endswith(extension)
            )
    return matches

def create_vectors(data, embeddings):
    """Pair each record with its embedding and build one vector dict per pair.

    Every output dict carries the record's ``id``, the embedding's ``values``
    and the record's ``text`` under ``metadata``. Pairing stops at the shorter
    of the two inputs.
    """
    vectors = []
    for record, embedding in zip(data, embeddings):
        vectors.append(
            {
                "id": record["id"],
                "values": embedding["values"],
                "metadata": {"text": record["text"]},
            }
        )
    return vectors

def safe_execution(func):
    """Decorator that logs any exception raised by ``func`` and returns ``None``.

    The wrapped callable keeps its original metadata (``__name__``,
    ``__doc__``, ...). Callers must be prepared for a ``None`` result, which
    signals that ``func`` failed.

    :param func: Callable to protect
    :return: Wrapper with the same call signature as ``func``
    """
    # Function-scope import keeps this block self-contained; functools is stdlib.
    from functools import wraps

    @wraps(func)  # without this every decorated function would be named "wrapper"
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # logger.exception logs at ERROR level and appends the traceback,
            # so the swallowed failure stays diagnosable.
            logger.exception(
                "Error in %s: %s", getattr(func, "__name__", repr(func)), e
            )
            return None
    return wrapper

class BaseConfig:
    """Common configuration holder: an API key plus arbitrary named settings."""

    def __init__(self, api_key: str, **kwargs):
        """Store ``api_key`` and expose every extra keyword argument as an attribute."""
        self.api_key = api_key
        for name in kwargs:
            setattr(self, name, kwargs[name])


class PineconeConfig(BaseConfig):
    """Pinecone settings: API key, embedding model name, and cloud/region."""

    def __init__(
        self,
        api_key: str,
        model_name: str = DEFAULT_PINECONE_MODEL_NAME,
        cloud: str = DEFAULT_CLOUD,
        region: str = DEFAULT_REGION,
    ):
        """Forward all settings to ``BaseConfig``, which stores them as attributes."""
        extras = {"model_name": model_name, "cloud": cloud, "region": region}
        super().__init__(api_key=api_key, **extras)


class ChatOpenAIConfig(BaseConfig):
    """OpenAI chat settings: API key, sampling temperature, and model name."""

    def __init__(
        self,
        api_key: str,
        temperature: float = 0.0,
        model_name: str = DEFAULT_OPENAI_MODEL,
    ):
        """Forward all settings to ``BaseConfig``, which stores them as attributes."""
        extras = {"temperature": temperature, "model_name": model_name}
        super().__init__(api_key=api_key, **extras)


class LlamaConfig(BaseConfig):
    """LlamaParse settings: API key, result type, worker count, verbosity, language."""

    def __init__(
        self,
        api_key: str,
        result_type: str = DEFAULT_RESULT_TYPE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        verbose: bool = True,
        language: str = DEFAULT_LANGUAGE,
    ):
        """Forward all settings to ``BaseConfig``, which stores them as attributes."""
        extras = {
            "result_type": result_type,
            "num_workers": num_workers,
            "verbose": verbose,
            "language": language,
        }
        super().__init__(api_key=api_key, **extras)


class ConfigFactory:
    """Builds config objects, falling back to environment variables for API keys."""

    @staticmethod
    def create_pinecone_config(api_key: Optional[str] = None) -> PineconeConfig:
        """Return a ``PineconeConfig``; reads ``PINECONE_API_KEY`` when no key is given."""
        resolved_key = api_key if api_key else os.getenv('PINECONE_API_KEY')
        return PineconeConfig(api_key=resolved_key)

    @staticmethod
    def create_chat_openai_config(api_key: Optional[str] = None) -> ChatOpenAIConfig:
        """Return a ``ChatOpenAIConfig``; reads ``OPENAI_API_KEY`` when no key is given."""
        resolved_key = api_key if api_key else os.getenv('OPENAI_API_KEY')
        return ChatOpenAIConfig(api_key=resolved_key)

    @staticmethod
    def create_llama_config(api_key: Optional[str] = None) -> LlamaConfig:
        """Return a ``LlamaConfig``; reads ``LLAMA_API_KEY`` when no key is given."""
        resolved_key = api_key if api_key else os.getenv('LLAMA_API_KEY')
        return LlamaConfig(api_key=resolved_key)


def basename_without_extension(filepath: str):
    """Return the final path component of ``filepath`` with its last extension removed."""
    filename = path.basename(filepath)
    stem, _extension = path.splitext(filename)
    return stem

def split_pdf_in_chunks(input_pdf_path: str, output_folder: str, pages_per_chunk: int):
    """Split a PDF into consecutive files of at most ``pages_per_chunk`` pages.

    Chunks are written to ``output_folder`` (created if missing) and named
    ``<input basename>_chunk_<n>.pdf`` with ``n`` starting at 1.

    :param input_pdf_path: Path of the PDF to split
    :param output_folder: Directory that receives the chunk files
    :param pages_per_chunk: Maximum number of pages per chunk; must be >= 1
    :return: List of the written chunk paths, in page order
    :raises ValueError: If ``pages_per_chunk`` is less than 1
    """
    # Validate before touching the filesystem: 0 would only fail later inside
    # range(), and a negative value would silently produce no chunks at all.
    if pages_per_chunk < 1:
        raise ValueError(f"pages_per_chunk must be >= 1, got {pages_per_chunk}")

    # Ensure the output folder exists
    makedirs(output_folder, exist_ok=True)

    basename = basename_without_extension(input_pdf_path)

    # Load the PDF
    pdf_reader = PdfReader(input_pdf_path)
    total_pages = len(pdf_reader.pages)

    # Process the PDF in chunks; chunk numbers are 1-based
    output_paths = []
    chunk_starts = range(0, total_pages, pages_per_chunk)
    for chunk_number, start_page in enumerate(chunk_starts, start=1):
        # The last chunk may hold fewer pages than pages_per_chunk
        end_page = min(start_page + pages_per_chunk, total_pages)

        pdf_writer = PdfWriter()
        for page_num in range(start_page, end_page):
            pdf_writer.add_page(pdf_reader.pages[page_num])

        # Define the output path for this chunk
        filename = f'{basename}_chunk_{chunk_number}.pdf'
        output_path = path.join(output_folder, filename)

        # Save the chunk as a PDF
        with open(output_path, 'wb') as output_pdf:
            pdf_writer.write(output_pdf)

        output_paths.append(output_path)

    return output_paths

class PDFToMarkdownParser:
    """Wrapper around ``LlamaParse`` configured from a ``LlamaConfig``."""

    def __init__(self, llama_config: LlamaConfig):
        """
        Build the LlamaParse client from the given configuration.

        :param llama_config: Supplies api_key, result_type, num_workers,
            verbose and language for the parser
        """
        self.llama_config = llama_config
        self._llama_parser = LlamaParse(
            api_key=llama_config.api_key,
            result_type=llama_config.result_type,
            num_workers=llama_config.num_workers,
            verbose=llama_config.verbose,
            language=llama_config.language,
        )

    @timer()
    def parse(self, filename: str):
        """
        Parse a file with LlamaParse and return the loaded documents.

        The output format is whatever ``result_type`` was configured
        (``DEFAULT_RESULT_TYPE`` is "markdown").

        :param filename: Path of the file to parse
        :return: The value returned by ``LlamaParse.load_data``
        """
        # Two Hugging Face pipelines ("question-answering" and
        # "feature-extraction") used to be built here on every call and were
        # never used. They were removed because they only added model-loading
        # cost to each parse.
        parsed_data = self._llama_parser.load_data(filename)
        print(f"Parsing of data {filename} is ready!")
        return parsed_data

class MarkdownComprehender:
    """Stores text embeddings in Pinecone and answers questions over them with an OpenAI chat model."""

    def __init__(
        self,
        pinecone_config: PineconeConfig,
        chat_openai_config: ChatOpenAIConfig
    ):
        """
        Create the Pinecone client, the embedding model, the chat LLM and the
        document-combining chain.

        :param pinecone_config: Pinecone API key, embedding model, cloud and region
        :param chat_openai_config: OpenAI API key, model name and temperature
        """
        # Pinecone references
        self.pinecone_config = pinecone_config
        self.chat_openai_config = chat_openai_config
        self._pinecone = Pinecone(api_key=pinecone_config.api_key)
        self._embeddings = PineconeEmbeddings(
            model=pinecone_config.model_name,
            pinecone_api_key=pinecone_config.api_key
        )
        self.llm = ChatOpenAI(
            openai_api_key=self.chat_openai_config.api_key,
            model_name=self.chat_openai_config.model_name,
            temperature=self.chat_openai_config.temperature
        )
        self.combine_docs_chain = create_stuff_documents_chain(self.llm, retrieval_qa_chat_prompt)

    # Pinecone index
    def _get_pinecone_spec(self):
        """Return the ServerlessSpec built from the configured cloud and region."""
        return ServerlessSpec(
            cloud=self.pinecone_config.cloud,
            region=self.pinecone_config.region
        )

    @safe_execution
    def _get_index(self, index_name: str) -> Optional[Index]:
        """
        Return a handle to ``index_name``, creating the index first if it is missing.

        Because of ``@safe_execution`` any exception is logged and ``None`` is
        returned instead, so callers must check the result.
        """
        if index_name not in self._pinecone.list_indexes().names():
            self._pinecone.create_index(
                name=index_name,
                dimension=self._embeddings.dimension,
                metric="cosine",
                spec=self._get_pinecone_spec()
            )
            # Poll once per second until Pinecone reports the new index as ready.
            while not self._pinecone.describe_index(index_name).status['ready']:
                time.sleep(1)
        return self._pinecone.Index(index_name)

    def create_embeddings(self, texts: List[str]):
        """Embed ``texts`` as passages with the configured Pinecone inference model."""
        return self._pinecone.inference.embed(
            model=self.pinecone_config.model_name, inputs=texts, parameters={"input_type": "passage"}
        )

    def markdown_to_document(self, markdown_text: str):
        """
        Split ``markdown_text`` on level-2 headers and return only the FIRST chunk.

        Headers are kept in the chunk content (``strip_headers=False``).
        Raises IndexError if the splitter produces no chunks.
        """
        headers_to_split_on = [ ("##", "Header 2") ]

        markdown_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=headers_to_split_on, strip_headers=False
        )
        return markdown_splitter.split_text(markdown_text)[0]

    @timer()
    def upsert(self, namespace: str, index_name: str, texts: List[str]):
        """
        Embed ``texts`` and upsert them into ``index_name`` under ``namespace``.

        Each text gets a fresh UUID as its vector id and is stored in the
        vector's ``text`` metadata.

        :raises RuntimeError: If the index could not be obtained or created
        """
        data = [{"id": str(uuid4()), "text": text} for text in texts]
        embeddings = self.create_embeddings([d["text"] for d in data])
        vectors = create_vectors(data, embeddings)
        index = self._get_index(index_name)
        if index is None:
            # _get_index swallows its own errors; fail with a clear message
            # instead of an AttributeError on None.
            raise RuntimeError(
                f"Pinecone index '{index_name}' is unavailable; see the logged error."
            )
        return index.upsert(vectors=vectors, namespace=namespace)

    @timer()
    def query(self, namespace: str, index_name: str, query_str: str):
        """Queries Pinecone index and retrieves relevant documents."""
        # Called only for its side effect: make sure the index exists.
        self._get_index(index_name)
        doc_search = PineconeVectorStore(
            index_name=index_name,
            embedding=self._embeddings,
            namespace=namespace,
            # Pass the configured key explicitly; otherwise the vector store
            # only works when the key is also in the environment.
            pinecone_api_key=self.pinecone_config.api_key,
        )
        retriever = doc_search.as_retriever()
        retrieval_chain = create_retrieval_chain(retriever, self.combine_docs_chain)
        return retrieval_chain.invoke({"input": query_str})


class Chat:
    """Facade that parses PDFs, stores their text in Pinecone and answers questions over it."""

    def __init__(self, pinecone_config, llama_config, openai_config, index_name, namespace):
        """
        Initialize the Chat class with configuration details for Pinecone, Llama, and OpenAI.

        :param pinecone_config: Pinecone configuration object
        :param llama_config: Llama configuration object
        :param openai_config: OpenAI configuration object
        :param index_name: Name of the Pinecone index
        :param namespace: Namespace in the vector database
        """
        self.pinecone_config = pinecone_config
        self.llama_config = llama_config
        self.openai_config = openai_config
        self.index_name = index_name
        self.namespace = namespace
        self.comprehender = MarkdownComprehender(pinecone_config, openai_config)
        self.pdf_parser = PDFToMarkdownParser(llama_config)

    def parse_document(self, filename: str) -> List[str]:
        """
        Parse a PDF document and extract text chunks.

        :param filename: Path to the PDF file
        :return: List with the ``text`` of every parsed document
        """
        return [doc.text for doc in self.pdf_parser.parse(filename)]

    def upsert_texts(self, texts: List[str]) -> None:
        """
        Upsert text chunks into the vector database.

        An empty list is a no-op, so no embedding request is made for nothing.

        :param texts: List of text chunks
        """
        if not texts:
            return
        self.comprehender.upsert(self.namespace, self.index_name, texts)

    def query(self, system_query: str, human_query: str) -> dict:
        """
        Ask the same question with and without retrieved context.

        The "uninformed" answer comes from the bare LLM given the system and
        human messages. The "informed" answer comes from the retrieval chain,
        which receives only ``human_query``.

        :param system_query: System instruction for the uninformed LLM call
        :param human_query: The user's question
        :return: Dict with keys ``"uninformed"`` and ``"informed"``
        """
        # System instruction goes first so it frames the human message.
        messages = [("system", system_query), ("human", human_query)]

        uninformed_query = self.comprehender.llm.invoke(messages)
        informed_query = self.comprehender.query(self.namespace, self.index_name, human_query)

        return {
            "uninformed": uninformed_query,
            "informed": informed_query
        }

