In [1]:
# Install required packages
!pip install langchain langchain-community chromadb transformers torch sentence-transformers pypdf
!pip install huggingface_hub

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import necessary libraries
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModel
import torch
from langchain.embeddings import HuggingFaceEmbeddings
import os

In [3]:
# Data Ingestion from Google Drive
def load_pdfs_from_drive(directory_path):
    """Load every PDF that sits directly inside ``directory_path``.

    The directory is scanned non-recursively. Splitting into chunks is NOT
    done here; the function only concatenates what the loader returns.

    Args:
        directory_path: Path of the folder to scan for PDF files.

    Returns:
        A flat list holding the items returned by ``PyPDFLoader(path).load()``
        for each PDF, in file-name order. Empty when the folder has no PDFs.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory is missing
            or unreadable (e.g. ``FileNotFoundError``).
    """
    documents = []
    # os.listdir() gives no ordering guarantee; sorting keeps the document
    # (and therefore chunk) order reproducible between runs.
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        # Accept '.PDF' as well as '.pdf', and skip sub-directories that
        # merely carry a .pdf suffix (the loader would fail on them).
        if not filename.lower().endswith('.pdf') or not os.path.isfile(file_path):
            continue
        documents.extend(PyPDFLoader(file_path).load())
    return documents

# Folder on the mounted Drive that holds the source PDFs
drive_pdf_directory = '/content/drive/MyDrive/pdfs'
documents = load_pdfs_from_drive(directory_path=drive_pdf_directory)

# Break the loaded documents into overlapping chunks for retrieval
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
split_documents = text_splitter.split_documents(documents)

# Report how many chunks the splitter produced
print(f"Loaded and split {len(split_documents)} document chunks.")

Loaded and split 74 document chunks.


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Hugging Face Hub ID of the chat model used for answer generation
model_id = "deepseek-ai/deepseek-llm-7b-chat"

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",  # Explicitly use T4 GPU
    # NOTE(review): with the whole model mapped to "cuda" nothing should be
    # offloaded, so this folder is probably never written — confirm.
    offload_folder="/content/drive/MyDrive/quantum_tutor_offload",
    torch_dtype=torch.float16,  # Half-precision for T4 GPU
    low_cpu_mem_usage=True  # Minimize CPU memory usage
)

# Create generation pipeline
deepseek_llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    # temperature/top_p only take effect when sampling is enabled. Setting
    # do_sample here stops the notebook from depending on whatever the
    # checkpoint's generation_config.json happens to contain.
    do_sample=True,
    temperature=0.7,
    top_p=0.9
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.6k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Device set to use cuda


In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Load PDFs from Google Drive
def load_pdfs_from_drive(directory_path):
    """Load every PDF that sits directly inside ``directory_path``.

    The directory is scanned non-recursively and no chunking happens here;
    the function only concatenates what the loader returns.

    Args:
        directory_path: Path of the folder to scan for PDF files.

    Returns:
        A flat list holding the items returned by ``PyPDFLoader(path).load()``
        for each PDF, in file-name order. Empty when the folder has no PDFs.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory is missing
            or unreadable (e.g. ``FileNotFoundError``).
    """
    documents = []
    # Sort the listing: os.listdir() order is arbitrary, and a stable order
    # keeps the resulting chunk sequence reproducible.
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        # Case-insensitive extension match; ignore directories named like PDFs.
        if not filename.lower().endswith('.pdf') or not os.path.isfile(file_path):
            continue
        documents.extend(PyPDFLoader(file_path).load())
    return documents

# Drive folder that contains the PDFs to index
drive_pdf_directory = '/content/drive/MyDrive/pdfs'
documents = load_pdfs_from_drive(directory_path=drive_pdf_directory)

# Cut the loaded documents into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
split_documents = text_splitter.split_documents(documents)

# Show the resulting chunk count
print(f"Loaded and split {len(split_documents)} document chunks.")

Loaded and split 74 document chunks.


In [6]:
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import os

# Initialize lightweight embedding model for T4 GPU
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"batch_size": 32}  # Smaller batch for T4 GPU
)


def _stable_chunk_ids(chunks):
    """Return one deterministic ID per chunk: '<source>|<page>|<ordinal>'.

    ``ordinal`` counts the chunks already seen with the same (source, page)
    metadata, so IDs are unique within the list and do not depend on the
    order in which files were loaded. Missing metadata keys become ''.
    """
    seen = {}
    ids = []
    for chunk in chunks:
        key = (
            str(chunk.metadata.get("source", "")),
            str(chunk.metadata.get("page", "")),
        )
        ordinal = seen.get(key, 0)
        seen[key] = ordinal + 1
        ids.append(f"{key[0]}|{key[1]}|{ordinal}")
    return ids


# Create and persist ChromaDB vector store.
# Without explicit ids every run stores the chunks under fresh random IDs, so
# re-running this cell against the same persist_directory keeps appending
# duplicates. Deterministic IDs make a re-run overwrite the same records.
# NOTE(review): records written by earlier runs with random IDs are not
# removed by this — delete the directory once if a clean store is needed.
db = Chroma.from_documents(
    documents=split_documents,
    embedding=embeddings,
    ids=_stable_chunk_ids(split_documents),
    persist_directory="/content/drive/MyDrive/chroma_db"
)
# persist() is deprecated (see the warning in this cell's recorded output);
# the guard keeps the cell working if a later release removes the method.
if hasattr(db, "persist"):
    db.persist()
print("Vector store created and persisted.")

  embeddings = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store created and persisted.


  db.persist()


In [9]:
def rag_pipeline(query, db, llm, top_k=3):
    """Answer ``query`` using the ``top_k`` most similar chunks from ``db``.

    Args:
        query: The question to answer.
        db: Object exposing ``similarity_search(query, k=...)`` whose results
            carry a ``page_content`` attribute.
        llm: Callable invoked as ``llm(prompt, return_full_text=False)`` that
            returns a sequence of dicts with a ``'generated_text'`` key.
        top_k: How many chunks to retrieve as context.

    Returns:
        The ``'generated_text'`` of the first item returned by ``llm``.
    """
    # Fetch the nearest chunks and stack their text, one chunk per line break
    hits = db.similarity_search(query, k=top_k)
    passages = []
    for hit in hits:
        passages.append(hit.page_content)
    context = "\n".join(passages)

    prompt = f"""Answer the query based on the context. Be concise. If no info, say so.

Context:
{context}

Query: {query}

Answer:"""

    # return_full_text=False is forwarded to the generator unchanged
    generations = llm(prompt, return_full_text=False)
    first_generation = generations[0]
    return first_generation['generated_text']

# Smoke test: ask the pipeline for a short summary
query = "Summarize this paper in breif?"
response = rag_pipeline(query=query, db=db, llm=deepseek_llm)
print(f"Query: {query}\nResponse: {response}")

Query: Summarize this paper in breif?
Response:  This paper proposes a noise-tolerant variant of the trust region algorithm for unconstrained optimization. The variant uses a trust region size that increases with the magnitude of the gradient, and a penalty parameter that increases with the magnitude of the gradient norm. The paper provides a proof of convergence for the algorithm, and gives an analysis of its convergence rate. The paper also provides numerical examples to demonstrate the effectiveness of the algorithm.


In [10]:
# Ask about the motivation for the modified method
query = "What is the main motivation behind modifying classical trust region methods for noisy optimization?"
response = rag_pipeline(query=query, db=db, llm=deepseek_llm)
print(f"Query: {query}\nResponse: {response}")

Query: What is the main motivation behind modifying classical trust region methods for noisy optimization?
Response:  The main motivation behind modifying classical trust region methods for noisy optimization is to handle problems where the objective function, gradient, and possibly the Hessian, are subject to bounded, non-diminishing noise. This modification allows the method to adapt to the presence of noise in the optimization process and still achieve strong global convergence properties and eﬀectiveness in practice. By incorporating the noise level into the algorithm, the trust region method can adapt to the increased uncertainty caused by the noise and still converge to an optimal solution.


In [11]:
# Ask how Algorithm 1 differs from the classical approach
query = "Explain the key idea behind Algorithm 1. How does it differ from classical trust region methods?"
response = rag_pipeline(query=query, db=db, llm=deepseek_llm)
print(f"Query: {query}\nResponse: {response}")

Query: Explain the key idea behind Algorithm 1. How does it differ from classical trust region methods?
Response:  The key idea behind Algorithm 1 is to use a line search method, which iteratively refines the search direction and trust region radius to optimize a constrained optimization problem. It differs from classical trust region methods by using a line search method instead of a fixed step size. In classical trust region methods, a fixed step size is used, which may not be optimal for the problem at hand. Algorithm 1, on the other hand, dynamically adjusts the search direction and trust region radius during the line search, which can lead to faster convergence and more accurate solutions.


In [12]:
# Ask about the Cauchy step and its role in convergence
query = "What is a Cauchy step in the context of this paper, and why is it important for the convergence guarantees?"
response = rag_pipeline(query=query, db=db, llm=deepseek_llm)
print(f"Query: {query}\nResponse: {response}")

Query: What is a Cauchy step in the context of this paper, and why is it important for the convergence guarantees?
Response:  In the context of this paper, a Cauchy step refers to the step size computed based on the Cauchy step formula (9) and (10). The Cauchy step is important for the convergence guarantees because it ensures that the iteration process eventually drives the algorithm towards regions where the stationarity measure is small (i.e., comparable to the noise level), which is a necessary condition for global convergence. The reduction in the model provided by the Cauchy step (11) guarantees that the algorithm does not miss the global optimum by taking excessively small steps.


In [13]:
# Ask about adaptive noise reduction
query = "What does the paper suggest about reducing noise adaptively during optimization??"
response = rag_pipeline(query=query, db=db, llm=deepseek_llm)
print(f"Query: {query}\nResponse: {response}")

Query: What does the paper suggest about reducing noise adaptively during optimization??
Response:  The paper suggests reducing noise adaptively during optimization by running the trust region algorithm until it ceases to make significant progress. This is indicated by the algorithm not making substantial progress in the objective function and the accuracy in the function and gradient being increased. This process is repeated with the new value of ϵf in (7). This approach provides a disciplined way to achieve high accuracy in the solution using a noise-tolerant trust region algorithm.
