In [1]:
import openai
import pymupdf4llm
import pymupdf.pro
import langchain
from langchain.text_splitter import MarkdownTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import OpenAIEmbeddings
from enum import Enum
from dotenv import load_dotenv
from openai import OpenAI
import os
import chromadb
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from uuid import uuid4

# Load variables from a local .env file into the process environment so that
# later cells can read API keys (e.g. DEEPSEEK_API_KEY) from it.
load_dotenv()

True

In [2]:
# Location of the PDF to index, and a chat-model handle configured up front.
path = "./uploads/ISLP.pdf"
model = ChatOpenAI(
    temperature=0.5,
    model="gpt-3.5-turbo",
)

In [3]:
# OpenAI-compatible client pointed at the DeepSeek endpoint.
# Fail fast (KeyError) when DEEPSEEK_API_KEY is missing: with os.getenv() a
# missing variable yields None, and the OpenAI client then falls back to
# OPENAI_API_KEY, which would be sent to the DeepSeek base URL.
client = OpenAI(
    api_key=os.environ["DEEPSEEK_API_KEY"],
    base_url="https://api.deepseek.com",
)


In [4]:
# Read the PDF at `path`; load() returns the loaded documents, each exposing
# `page_content` (consumed by the next cell).
loader = PyMuPDFLoader(file_path=path)
text = loader.load()

In [5]:
# Concatenate the page_content of every loaded document, separated by single spaces.
_text = ' '.join(page.page_content for page in text)

In [6]:
# Wrap the concatenated text in a single Document.
# NOTE(review): this rebinds `text`, which previously held the loader output,
# so the cell that builds `_text` by iterating over `text` is not expected to
# work if re-run after this one without reloading the PDF first.
text = Document(page_content=_text)

In [7]:
# Show only the size and a short preview; echoing the whole Document would
# write the entire concatenated PDF text into the cell output.
print(f"{len(text.page_content):,} characters")
text.page_content[:500]



In [8]:
# Split the single Document into chunks with chunk_size=4000 and no overlap;
# add_start_index=True asks the splitter to record where each chunk starts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=0,
    add_start_index=True,
)
chunks = text_splitter.split_documents([text])

In [9]:
# Open the 'islp' Chroma collection persisted under ./chromadb (created if it
# does not exist yet), embedding with OpenAI's text-embedding-3-small model.
islp_embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
vector_store = Chroma(
    collection_name='islp',
    embedding_function=islp_embeddings,
    persist_directory='./chromadb',
    create_collection_if_not_exists=True,
)

In [10]:
# Populate the collection only when it is still empty.
# Emptiness is checked with a metadata-only get(limit=1) instead of
# similarity_search(""), which would send an empty query string to the
# embeddings API on every run just to learn whether anything is stored.
# `existing_docs` now holds the ids of (at most one) stored record; only its
# truthiness is used here.
existing_docs = vector_store.get(limit=1)["ids"]

if existing_docs:
    print("Documents already exist in vector store")
else:
    # One fresh random id per chunk.
    uuids = [str(uuid4()) for _ in chunks]
    vector_store.add_documents(documents=chunks, ids=uuids)
    print("Documents added to vector store")

Documents already exist in vector store


In [11]:
# Retriever using maximal-marginal-relevance search: k=5 results chosen from
# fetch_k=10 candidates, with lambda_mult=0.25.
mmr_search_kwargs = {"k": 5, "fetch_k": 10, "lambda_mult": 0.25}
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

In [12]:
# System prompt that asks the model to reorganise retrieved chunks into a clean
# context (without answering the query). The raw template text lives in its own
# name so that `prompt` only ever refers to the ChatPromptTemplate in this cell.
# The last instruction bullet was previously cut off mid-sentence; it is now complete.
system_template = '''
You are a formatting and organization assistant.

Your job is to take the raw information retrieved by a RAG system (provided below) and process it to create a clear, well-structured, and logically ordered context. This context will be used by another model to answer a user query, so you must not answer the query yourself.

Instructions:
- Organize the information into sections or bullet points.
- Remove duplicates and irrelevant or conflicting data.
- Preserve technical or factual accuracy.
- Do not fabricate or infer missing information.
- Make the result easy for another model to read and use as direct context.
- Ensure all the information from the retrieved chunks is present in the structured context.

Below is the raw retrieved data:
---
{retrieved_chunks_here}
---

Return only the cleaned and structured context below.

Context:
'''

# Single-message template; `retrieved_chunks_here` is the only input variable.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_template),
    ]
)

In [13]:
# Retrieve chunks for a sample question. The (misspelled) variable name is kept
# because the following cell refers to it.
sample_query = "What is PCA?"
retreived_chunks = retriever.invoke(sample_query)

In [14]:
# Fill the template with the chunk text only. Formatting the list of Document
# objects directly would insert their Python repr (metadata, escaped newlines)
# into the prompt instead of readable text.
retrieved_text = "\n\n".join(doc.page_content for doc in retreived_chunks)

# NOTE(review): this rebinds `prompt` from the template to the formatted string
# (the next cell sends it as-is), so re-run the template cell before re-running
# this one.
prompt = prompt.format(retrieved_chunks_here=retrieved_text)

In [15]:
# Send the formatted prompt to the "deepseek-chat" model as a single system
# message and wait for the complete (non-streamed) reply.
chat_messages = [{"role": "system", "content": prompt}]
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=chat_messages,
    stream=False,
)

In [16]:
# Print the message text of the first returned choice.
first_choice = response.choices[0]
print(first_choice.message.content)

### **Principal Components Analysis (PCA) Overview**  
- **Definition**: PCA is an unsupervised learning technique used to summarize a large set of correlated variables into a smaller number of representative variables (principal components) that explain most of the variability in the original data.  
- **Purpose**:  
  - Data visualization (observations or variables).  
  - Data imputation (filling missing values).  
  - Derived variables for supervised learning.  

### **Key Concepts in PCA**  
1. **Principal Components**:  
   - Directions in feature space where the original data show high variability.  
   - Represent lines/subspaces closest to the data cloud.  
   - Computed via eigenvalue decomposition of the covariance matrix.  

2. **Low-Dimensional Representation**:  
   - PCA finds a low-dimensional approximation of the data that retains maximal variance.  
   - Useful for visualizing high-dimensional data (e.g., reducing 3D data to 2D).  

3. **Standardization**:  
   - Vari