In [None]:
import os
import faiss
import numpy as np
from pathlib import Path
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS as LangchainFAISS
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI

# Set your Azure OpenAI credentials.
# Both values are placeholders: replace them locally before running the cells
# below, and avoid committing real keys together with the notebook.
os.environ.update(
    {
        "AZURE_OPENAI_API_KEY": "your_azure_openai_api_key",
        "AZURE_OPENAI_ENDPOINT": "your_azure_openai_endpoint",
    }
)

def get_azure_openai_client():
    """Create an ``AzureOpenAI`` client configured from the environment.

    The API key and endpoint are read from the ``AZURE_OPENAI_API_KEY`` and
    ``AZURE_OPENAI_ENDPOINT`` environment variables; the API version is
    pinned to ``"2023-05-15"``.

    Returns:
        A newly constructed ``openai.AzureOpenAI`` instance (a fresh client
        is built on every call).
    """
    # Imported at call time rather than at module import.
    from openai import AzureOpenAI

    settings = {
        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
        "api_version": "2023-05-15",
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
    }
    return AzureOpenAI(**settings)

def get_embedding(text):
    """Return the ``text-embedding-ada-002`` embedding of *text*.

    Args:
        text: The input passed as ``input`` to the embeddings endpoint.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    client = get_azure_openai_client()
    response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002",
    )
    first_item = response.data[0]
    return first_item.embedding

def get_file_hash(file_path):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is hashed in fixed-size chunks so that large PDFs are never
    loaded into memory in one piece. The digest is identical to hashing the
    whole content at once.

    Args:
        file_path: Path of the file to hash; opened in binary mode.

    Returns:
        The 32-character lowercase hex digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import hashlib

    chunk_size = 1024 * 1024  # 1 MiB per read keeps memory use flat
    digest = hashlib.md5()
    with open(file_path, "rb") as file:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: file.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

def process_pdf(file_path):
    """Build (or load from cache) a FAISS vector store for a PDF.

    The cache directory is ``./cache/<md5 of the file>``. When an
    ``index.faiss`` file already exists there, the store is loaded from disk;
    if loading fails the PDF is reprocessed. Otherwise the PDF is split into
    chunks, each chunk is embedded with ``get_embedding``, the vectors are
    placed in an IVF-Flat index, and the resulting store is saved to the
    cache directory.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The LangChain FAISS vector store, or ``None`` if processing failed
        (the error is printed).
    """
    file_hash = get_file_hash(file_path)
    cache_dir = Path(f"./cache/{file_hash}")
    faiss_index_path = cache_dir / "index.faiss"

    if faiss_index_path.exists():
        print(f"Loading from cache: {cache_dir}")
        try:
            # NOTE(review): newer langchain releases reportedly require
            # allow_dangerous_deserialization=True here - confirm against the
            # installed version. A failure falls through to reprocessing.
            return LangchainFAISS.load_local(str(cache_dir), get_embedding)
        except Exception as e:
            print(f"Error loading cached index: {str(e)}. Reprocessing PDF.")
    else:
        print("Cache not found. Processing new PDF.")

    try:
        # Local import: the wrapper needs a real docstore object, not a dict.
        from langchain.docstore.in_memory import InMemoryDocstore

        # Load and split the PDF
        loader = PyPDFLoader(file_path)
        pages = loader.load_and_split()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        texts = text_splitter.split_documents(pages)
        if not texts:
            # Without this the failure would surface as an opaque IndexError.
            raise ValueError("no text chunks could be extracted from the PDF")

        # Generate embeddings (one request per chunk)
        embeddings = [get_embedding(text.page_content) for text in texts]
        embeddings_array = np.array(embeddings).astype('float32')
        vector_count = len(embeddings)
        dimension = len(embeddings[0])

        # Number of IVF clusters: 4 * sqrt(n), capped at 1024 and never more
        # than the number of vectors available for training.
        target_nlist = min(int(4 * np.sqrt(vector_count)), 1024)
        nlist = min(target_nlist, vector_count)
        if nlist < target_nlist:
            print(f"Warning: number of vectors ({vector_count}) is less than nlist ({target_nlist}). Setting nlist to {vector_count}.")

        # Create the IVF-Flat index once, after nlist is final, so the
        # quantizer is never shared with a discarded index.
        quantizer = faiss.IndexFlatL2(dimension)
        index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)

        # Train and add vectors to the index
        index.train(embeddings_array)
        index.add(embeddings_array)

        # Create FAISS vectorstore. Row i of the index maps to docstore id
        # str(i); the constructor requires this mapping explicitly.
        doc_ids = [str(position) for position in range(vector_count)]
        vectorstore = LangchainFAISS(
            embedding_function=get_embedding,
            index=index,
            docstore=InMemoryDocstore(dict(zip(doc_ids, texts))),
            index_to_docstore_id=dict(enumerate(doc_ids)),
        )

        # Save the vectorstore
        cache_dir.mkdir(parents=True, exist_ok=True)
        vectorstore.save_local(str(cache_dir))

        print(f"Vectorstore saved to {cache_dir}")

        return vectorstore

    except Exception as e:
        print(f"An error occurred while processing the PDF: {str(e)}")
        return None

def ask_question(question, vectorstore):
    """Answer *question* with a RetrievalQA chain over *vectorstore*.

    Relies on the module-level names ``llm`` (``quanthub.util.llm``) and
    ``openai`` (the client returned by ``llm.get_llm_client``), which are
    defined in other cells of this notebook.

    Args:
        question: The user's question.
        vectorstore: A LangChain FAISS store whose ``index`` accepts an
            ``nprobe`` setting (the IVF index built by ``process_pdf``).

    Returns:
        The ``"result"`` entry of the chain's response.
    """
    # The model object must not be bound to the local name ``llm``: doing so
    # makes ``llm`` local to the whole function, and evaluating
    # ``llm.GPT_4_OMNI_MODEL`` then raises UnboundLocalError.
    llm_gpt = AzureChatOpenAI(
        deployment_name=llm.GPT_4_OMNI_MODEL,
        openai_api_version="2023-07-01-preview",
        openai_api_key=openai.api_key,
        openai_api_type=openai.api_type,
        max_tokens=300,
        temperature=0.0
    )

    # Set the number of clusters to search
    vectorstore.index.nprobe = 10

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm_gpt,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    )
    # ``invoke`` matches the other ask_question cell in this notebook.
    response = qa_chain.invoke({"query": question})
    return response["result"]

# Build (or load) the vector store for the target PDF.
pdf_path = '/path/to/your/large/pdf/file.pdf'
vectorstore = process_pdf(pdf_path)

if not vectorstore:
    print("Failed to process the PDF. Please check the file path and try again.")
else:
    print("PDF processed successfully!")

    # Interactive loop: keep answering until the user types 'quit'
    # (compared case-insensitively).
    question = input("Ask a question about the PDF (or type 'quit' to exit): ")
    while question.lower() != 'quit':
        answer = ask_question(question, vectorstore)
        print(f"Answer: {answer}\n")
        question = input("Ask a question about the PDF (or type 'quit' to exit): ")

In [None]:
# Scratch cell: wrap an existing FAISS index in the LangChain vector store.
# NOTE(review): `index` and `texts` are local variables of process_pdf() and
# are not defined at notebook scope; create them in the session before
# running this cell.
from langchain.docstore.in_memory import InMemoryDocstore

# The wrapper needs a docstore object plus an explicit mapping from index row
# to docstore id; a plain dict keyed by int satisfies neither.
vectorstore = LangchainFAISS(
    embedding_function=get_embedding,
    index=index,
    docstore=InMemoryDocstore({str(i): doc for i, doc in enumerate(texts)}),
    index_to_docstore_id={i: str(i) for i in range(len(texts))},
)

In [None]:
#GBESE

In [None]:
import os
import fitz  # PyMuPDF
import numpy as np
from tqdm import tqdm
from pathlib import Path
import hashlib
import pickle
import logging
from quanthub.util import llm

# Module logger used by the functions below; the root logger is configured at
# INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

def get_file_hash(file_path):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The content is read in 1 MiB chunks, so hashing a large PDF does not
    require holding the whole file in memory. The digest equals that of
    hashing the entire content in one call.

    Args:
        file_path: Path of the file to hash; opened in binary mode.

    Returns:
        The 32-character lowercase hex digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    md5 = hashlib.md5()
    with open(file_path, "rb") as file:
        while True:
            block = file.read(1 << 20)
            if not block:  # empty read means end of file
                break
            md5.update(block)
    return md5.hexdigest()

def extract_text_from_page(page):
    """Return the stripped text of a PDF page.

    The plain ``"text"`` extraction is tried first. If it yields only
    whitespace, the page is re-read as ``"blocks"`` and element 4 of every
    block is joined with newlines.

    Args:
        page: An object exposing ``get_text(mode)``.

    Returns:
        The extracted text with surrounding whitespace removed; may be empty.
    """
    plain = page.get_text("text")
    if plain.strip():
        return plain.strip()

    # Nothing usable from plain extraction: fall back to block extraction.
    block_texts = (block[4] for block in page.get_text("blocks"))
    return "\n".join(block_texts).strip()

def process_pdf(pdf_path, openai_client, batch_size=10):
    """Embed every page of a PDF, caching the result on disk.

    Embeddings are cached as a pickle at
    ``./cache/<md5 of the file>/page_embeddings.pkl``. When that file exists
    it is loaded and returned; if it cannot be read, the PDF is reprocessed.

    Args:
        pdf_path: Path to the PDF file.
        openai_client: Client exposing ``embeddings.create(input=..., model=...)``.
        batch_size: Number of pages sent per embeddings request.

    Returns:
        A dict mapping 0-based page number to a NumPy array holding that
        page's embedding. Pages with no extractable text are omitted.
        ``None`` if processing failed (the error is logged).
    """
    file_hash = get_file_hash(pdf_path)
    cache_dir = Path(f"./cache/{file_hash}")
    cache_dir.mkdir(parents=True, exist_ok=True)
    embeddings_file = cache_dir / "page_embeddings.pkl"

    if embeddings_file.exists():
        logger.info(f"Loading embeddings from cache: {embeddings_file}")
        try:
            # NOTE: unpickling can execute arbitrary code; this is acceptable
            # only because the cache file is written by this function.
            with open(embeddings_file, "rb") as f:
                return pickle.load(f)
        except Exception:
            # A truncated or corrupt cache should not be fatal: rebuild it.
            logger.exception("Could not read cached embeddings; reprocessing PDF.")

    logger.info("Processing PDF and creating embeddings...")
    page_embeddings = {}

    try:
        # The context manager closes the document even if embedding fails.
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)

            for start in tqdm(range(0, total_pages, batch_size), desc="Processing pages"):
                batch_texts = []
                batch_pages = []

                for page_num in range(start, min(start + batch_size, total_pages)):
                    text = extract_text_from_page(doc[page_num])
                    if text:
                        batch_texts.append(text)
                        batch_pages.append(page_num)

                if not batch_texts:
                    continue

                embeddings = openai_client.embeddings.create(
                    input=batch_texts,
                    model="text-embedding-ada-002"
                )

                # Response items are paired positionally with the pages sent.
                for page_num, emb in zip(batch_pages, embeddings.data):
                    page_embeddings[page_num] = np.array(emb.embedding)

        # Save embeddings to disk
        with open(embeddings_file, "wb") as f:
            pickle.dump(page_embeddings, f)

        logger.info(f"Embeddings saved to {embeddings_file}")
        return page_embeddings

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"An error occurred while processing the PDF: {str(e)}")
        return None

# Main execution: embed every page of the target PDF and report the outcome.
pdf_path = '/path/to/your/large.pdf'

# Client obtained from the in-house wrapper; used for the embeddings calls.
openai = llm.get_llm_client(llm.GPT_35_16K_MODEL)

# Page number -> embedding; a falsy result is reported as a failure.
page_embeddings = process_pdf(pdf_path, openai)

print(
    f"Successfully processed {len(page_embeddings)} pages."
    if page_embeddings
    else "Failed to process the PDF."
)

In [None]:
from quanthub.util import llm # this is the openai model I use from within my network 
from langchain_openai import AzureChatOpenAI

# Client from the in-house wrapper; ask_question() reads its api_key and
# api_type attributes.
openai = llm.get_llm_client()


def ask_question(question, vectorstore):
    """Answer *question* with a "stuff" RetrievalQA chain over *vectorstore*.

    Args:
        question: The user's question.
        vectorstore: A store exposing ``as_retriever``; retrieval is
            configured with ``k=10``.

    Returns:
        The ``"result"`` entry of the chain's response.
    """
    chat_model = AzureChatOpenAI(
        deployment_name=llm.GPT_4_OMNI_MODEL,
        openai_api_version="2023-07-01-preview",
        openai_api_key=openai.api_key,
        openai_api_type=openai.api_type,
        max_tokens=300,
        temperature=0.0,
    )
    retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
    chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=retriever,
    )
    return chain.invoke({"query": question})["result"]


def get_answer(question):
    """Print the answer to *question* using the module-level ``vectorstore``.

    If ``vectorstore`` is falsy, a notice is printed instead and no question
    is asked.

    Args:
        question: The question forwarded to ``ask_question``.
    """
    if not vectorstore:
        print("PDF not processed successfully. Please check the file path and try again")
        return

    print(f"Answer: {ask_question(question, vectorstore)}")

In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objs as go
import pandas as pd
import plotly.express as px

# ... (keep the existing imports and data preparation code)

@app.callback(
    [Output('revenue-chart', 'figure'),
     Output('percentage-chart', 'figure')],
    [Input('market-dropdown', 'value'),
     Input('security-dropdown', 'value')]
)
def update_charts(market, securities):
    """Build the revenue and percentage-revenue figures for the dropdown selection.

    Args:
        market: Value of ``market-dropdown``; shown in both chart titles.
        securities: Value of ``security-dropdown``. NOTE(review): presumably
            consumed by the elided trace-building code - confirm.

    Returns:
        A pair of figure dicts (``{'data': ..., 'layout': ...}``) for
        ``revenue-chart`` and ``percentage-chart`` respectively.
    """
    # ... (keep the existing code for creating traces)

    legend_settings = dict(
        orientation='v',
        yanchor='top',
        y=1,
        xanchor='right',
        x=-0.05,
        bgcolor='rgba(255,255,255,0.8)',
        bordercolor='rgba(0,0,0,0.1)',
        borderwidth=1
    )

    # Styling shared by the x and y axes of both charts.
    axis_style = dict(
        showline=True,
        showgrid=True,
        gridcolor='lightgray',
        linecolor='black',
        linewidth=2,
        ticks='outside',
        tickfont=dict(family='Arial', size=12),
    )

    # Common layout settings. Axis styling is kept out of this dict on
    # purpose: unpacking a dict that contains ``xaxis``/``yaxis`` next to
    # explicit ``xaxis=``/``yaxis=`` keywords raises
    # "TypeError: got multiple values for keyword argument".
    common_layout = dict(
        showlegend=True,
        hovermode='closest',
        margin=dict(l=80, r=50, t=50, b=50),
        plot_bgcolor='white',
        legend=legend_settings,
        shapes=[
            # Black frame drawn around the whole plotting area.
            dict(
                type="rect",
                xref="paper",
                yref="paper",
                x0=0,
                y0=0,
                x1=1,
                y1=1,
                line=dict(color="black", width=2),
                fillcolor="rgba(0,0,0,0)",
            )
        ]
    )

    def build_layout(title_text, y_title):
        """Return a layout with the shared styling plus per-chart titles."""
        return go.Layout(
            title=dict(
                text=title_text,
                font=dict(family='Arial', size=24)
            ),
            xaxis=dict(title='Dates', **axis_style),
            yaxis=dict(title=y_title, **axis_style),
            **common_layout
        )

    revenue_layout = build_layout(
        f'Revenue Exposure - {market}',
        'Revenue'
    )
    percentage_layout = build_layout(
        f'Percentage Revenue Exposure - {market}',
        'Percentage of Revenue'
    )

    return {'data': revenue_traces, 'layout': revenue_layout}, {'data': percentage_traces, 'layout': percentage_layout}

# ... (keep the rest of the code unchanged)