In [None]:
# Mount Google Drive at /content/drive. Later cells read the transcript PDFs through the
# relative path ./drive/MyDrive/PDSA_Transcripts (assumes the working directory is /content).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pymupdf qdrant-client[fastembed] langchain-google-genai

Collecting langchain-google-genai
  Downloading langchain_google_genai-2.0.9-py3-none-any.whl.metadata (3.6 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading langchain_google_genai-2.0.9-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Installing collected packages: filetype, langchain-google-genai
Successfully installed filetype-1.2.0 langchain-google-genai-2.0.9


In [None]:
import os
import re
import fitz  # PyMuPDF
import uuid
from tqdm import tqdm
from typing import Optional, Dict, List
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding, SparseTextEmbedding
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Preprocess Data

In [None]:
def _clean_page_text(text: str) -> str:
    """Remove page-number and rule lines from one page's text and collapse its whitespace."""
    # A line holding only "Page <n>" is page furniture, not transcript content.
    text = re.sub(r'^\s*Page \d+\s*$', '', text, flags=re.MULTILINE)
    # A line holding only dashes is a horizontal rule.
    text = re.sub(r'^\s*-+\s*$', '', text, flags=re.MULTILINE)
    # Collapse every whitespace run (newlines included) into a single space.
    return re.sub(r'\s+', ' ', text).strip()


def preprocess_pdf_text(pdf_path: str) -> str:
    """
    Extracts and preprocesses text from a PDF file, maintaining continuity across pages.

    Each page is cleaned separately (page-number lines and dash-only lines removed,
    whitespace collapsed); the pages are then joined with single spaces and words
    split by a hyphen followed by whitespace are re-joined.

    Args:
    pdf_path (str): Path to the PDF file.

    Returns:
    str: Preprocessed text from the PDF with maintained continuity, or "" if the
    file could not be opened or processed (the error is printed, not raised).
    """
    try:
        # The context manager closes the document on every exit path, including errors.
        with fitz.open(pdf_path) as doc:
            page_texts = [_clean_page_text(page.get_text()) for page in doc]

        # Skip pages that are empty after cleaning; joining them would put
        # double spaces back into the text.
        continuous_text = ' '.join(page_text for page_text in page_texts if page_text)

        # Fix hyphenation at end of lines/pages. Line breaks are already collapsed to
        # spaces here, so any "word- word" sequence is joined.
        continuous_text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', continuous_text)

        return continuous_text.strip()

    except Exception as e:
        # Best effort: one unreadable PDF must not abort a whole batch run.
        print(f"Error processing PDF {pdf_path}: {str(e)}")
        return ""

In [None]:
# Smoke-test the extractor on one Week 1 transcript; the bare `res` displays the cleaned text.
res = preprocess_pdf_text('./drive/MyDrive/PDSA_Transcripts/Week 1/Classes and Objects.pdf')
res

"Programming, Data Structures and Algorithms using Python Professor Madhavan Mukund Class and Objects So, continuing with our discussion of slightly more exotic aspects of Python, let us look at classes and objects. (Refer Slide Time: 0:15) So, most often classes and objects arise in the context of what are called abstract data types. So, we have data types as we know, in Python, we have lists, we have dictionaries. And when we have a data type, we have certain permitted operations on these. For a list, for example, you can append to it, or you can combine two lists using plus you can concatenate them, with a dictionary, you can create a new entry with the key, you can update it, and so on. You can get X, extract all the keys of a dictionary, extract all the values and so on. Now, sometimes we need to create our own data type. And this data type will typically have two parts; it will have some information that is stored in it. But there may also be some discipline or some required way 

In [None]:
def chunk_text(text: str, max_chunk_size: int = 1000) -> List[Dict[str, str]]:
    """
    Chunks a large text into smaller segments, ensuring sentences are not cut off.

    Sentences are detected by whitespace that follows '.', '!' or '?'. They are
    accumulated until adding the next one would exceed ``max_chunk_size``. A single
    sentence longer than ``max_chunk_size`` is never split; it is emitted as its
    own oversized chunk.

    Args:
    text (str): The input text to be chunked.
    max_chunk_size (int): The maximum size of each chunk (default: 1000 characters).

    Returns:
    List[Dict[str, str]]: A list of dictionaries with keys "chunk_index" (an int,
    starting at 0) and "text" (the stripped chunk). Empty or whitespace-only input
    yields an empty list; no chunk ever has empty text.
    """
    chunks = []
    current_chunk = ""

    def flush() -> None:
        """Append the accumulated text as the next chunk (if non-blank) and reset it."""
        nonlocal current_chunk
        chunk_body = current_chunk.strip()
        if chunk_body:
            chunks.append({
                "chunk_index": len(chunks),
                "text": chunk_body
            })
        current_chunk = ""

    # Split the text into sentences
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        # re.split yields a blank piece for empty input or trailing whitespace;
        # accumulating it would produce a chunk with no text.
        if not sentence.strip():
            continue

        # If adding this sentence would exceed the max chunk size and we already
        # have content, start a new chunk
        if current_chunk and len(current_chunk) + len(sentence) > max_chunk_size:
            flush()

        current_chunk += " " + sentence

        # Handles a single sentence that alone reaches max_chunk_size
        if len(current_chunk) >= max_chunk_size:
            flush()

    # Add any remaining text as the last chunk
    flush()

    return chunks

In [None]:
# Chunk the sample transcript with a 2000-character budget and display the resulting chunk dicts.
chunks = chunk_text(res,2000)
chunks

[{'chunk_index': 0,
  'text': 'Programming, Data Structures and Algorithms using Python Professor Madhavan Mukund Class and Objects So, continuing with our discussion of slightly more exotic aspects of Python, let us look at classes and objects. (Refer Slide Time: 0:15) So, most often classes and objects arise in the context of what are called abstract data types. So, we have data types as we know, in Python, we have lists, we have dictionaries. And when we have a data type, we have certain permitted operations on these. For a list, for example, you can append to it, or you can combine two lists using plus you can concatenate them, with a dictionary, you can create a new entry with the key, you can update it, and so on. You can get X, extract all the keys of a dictionary, extract all the values and so on. Now, sometimes we need to create our own data type. And this data type will typically have two parts; it will have some information that is stored in it. But there may also be some di

### Prepare Dataset

In [None]:
dataset = []
MAX_CHUNK_SIZE = 2000
TRANSCRIPTS_ROOT = './drive/MyDrive/PDSA_Transcripts'
WEEKS = range(1, 12)  # folders "Week 1" .. "Week 11"

# List every PDF exactly once, up front. The listing is sorted so the dataset order is
# the same on every run (os.listdir returns entries in arbitrary order).
pdf_files = []
for i in WEEKS:
    PDF_DIRECTORY = f'{TRANSCRIPTS_ROOT}/Week {i}/'
    for filename in sorted(os.listdir(PDF_DIRECTORY)):
        if filename.endswith('.pdf'):
            pdf_files.append((i, filename, os.path.join(PDF_DIRECTORY, filename)))

total_pdfs = len(pdf_files)

for i, filename, pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
    pdf_content = preprocess_pdf_text(pdf_path)
    if not pdf_content:
        # Extraction failed or the PDF has no text (preprocess_pdf_text already printed
        # any error); do not create records with nothing to embed.
        continue

    chunks = chunk_text(pdf_content, max_chunk_size=MAX_CHUNK_SIZE)

    for chunk in chunks:
        if not chunk['text']:
            continue  # never store an empty chunk; it would be sent to the embedding API
        dataset.append(
            {
                'id': str(uuid.uuid4()),
                'metadata': {
                    'chunk_idx': chunk['chunk_index'],
                    'week': i
                },
                'source': filename,
                'content': chunk['text']
            }
        )

Processing PDFs: 100%|██████████| 80/80 [02:56<00:00,  2.21s/it]


In [None]:
# Preview only the first few records; displaying the whole list would embed every
# chunk of every transcript in the notebook output.
dataset[:3]

[{'id': '62097995-61b8-4c82-9775-0559bdbe7076',
  'metadata': {'chunk_idx': 0, 'week': 1},
  'source': 'Classes and Objects.pdf',
  'content': 'Programming, Data Structures and Algorithms using Python Professor Madhavan Mukund Class and Objects So, continuing with our discussion of slightly more exotic aspects of Python, let us look at classes and objects. (Refer Slide Time: 0:15) So, most often classes and objects arise in the context of what are called abstract data types. So, we have data types as we know, in Python, we have lists, we have dictionaries. And when we have a data type, we have certain permitted operations on these. For a list, for example, you can append to it, or you can combine two lists using plus you can concatenate them, with a dictionary, you can create a new entry with the key, you can update it, and so on. You can get X, extract all the keys of a dictionary, extract all the values and so on. Now, sometimes we need to create our own data type. And this data type

## [Optional] Load/Save Dataset

In [None]:
import ast
import pandas as pd

# Set this to a CSV previously written from `dataset`; the cell is skipped while the
# path does not point to an existing file, so "Run All" keeps working.
DATASET_CSV_PATH = '<csv_path_here>'

if os.path.isfile(DATASET_CSV_PATH):
    df = pd.read_csv(DATASET_CSV_PATH)
    # DataFrame.to_csv writes the index as an unnamed first column by default; drop it
    # so it does not end up in every record (and later in every payload).
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    # CSV stores the nested `metadata` dict as text; parse it back into a dict.
    df['metadata'] = df['metadata'].apply(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )
    dataset = df.to_dict('records')
else:
    print(f"No dataset CSV found at {DATASET_CSV_PATH!r}; skipping load.")

In [None]:
import pandas as pd

# index=False keeps the DataFrame's positional index out of the file; otherwise it comes
# back as an extra "Unnamed: 0" column when the CSV is read again.
# Note: the nested `metadata` dict is written as text, so it has to be parsed back
# (e.g. with ast.literal_eval) after reading the CSV.
df = pd.DataFrame(dataset)
df.to_csv('dataset.csv', index=False)

# Setup Qdrant Collection

In [None]:
from google.colab import userdata

# Qdrant endpoint and API key are read from Colab's user secrets, so no credential is
# stored in the notebook itself.
QDRANT_CLIENT_URL = userdata.get('QDRANT_CLIENT_URL')
QDRANT_CLIENT_API_KEY = userdata.get('QDRANT_CLIENT_API_KEY')
# Name of the Qdrant collection that the later cells create, fill and query.
COLLECTION_NAME = "PDSA_Transcripts_All_Google"

In [None]:
# Google embedding model identifier; the same string is reused as the named-vector key
# of the Qdrant collection.
dense_model_name = "models/text-embedding-004"
embedding_model = GoogleGenerativeAIEmbeddings(model=dense_model_name,google_api_key=userdata.get('GOOGLE_API_KEY'))
# Embed a single chunk as a probe; its length is the vector size used to create the collection.
dense_embeddings = list(embedding_model.embed_documents([dataset[0]["content"]]))
len(dense_embeddings[0])

768

In [None]:
from qdrant_client import QdrantClient, models

# Generous timeout (seconds) for the Qdrant client.
client = QdrantClient(QDRANT_CLIENT_URL, api_key=QDRANT_CLIENT_API_KEY, timeout=600)

# Creating a collection that already exists fails, so guard the call to keep this cell
# re-runnable (e.g. after a kernel restart).
if not client.collection_exists(COLLECTION_NAME):
    client.create_collection(
        COLLECTION_NAME,
        vectors_config={
            # One named dense vector, keyed by the embedding model name and sized from
            # the probe embedding computed earlier.
            dense_model_name: models.VectorParams(
                size=len(dense_embeddings[0]),
                distance=models.Distance.COSINE,
            )
        }
    )

True

# Push To Collection

In [None]:
import time
import functools
from tqdm import tqdm
from collections import deque

# Google API limits
RATE_LIMIT = 150  # Max requests per minute allowed by enforce_rate_limit()
BATCH_SIZE = 10    # Number of points per Qdrant upload batch (embeddings are requested one document at a time)

# Request tracker: timestamps of the most recent RATE_LIMIT calls. The deque is bounded,
# so appending to a full tracker discards the oldest timestamp.
request_timestamps = deque(maxlen=RATE_LIMIT)

def enforce_rate_limit():
    """Block until another request fits within RATE_LIMIT calls per 60 seconds, then record it.

    When the tracker is full and its oldest entry is less than 60 seconds old, sleep
    for the remainder of that minute. The current time is always appended afterwards.
    """
    window_seconds = 60
    while len(request_timestamps) >= RATE_LIMIT:
        # Time left until the oldest tracked request falls out of the window.
        remaining = window_seconds - (time.time() - request_timestamps[0])
        if remaining <= 0:
            break
        time.sleep(remaining)
    request_timestamps.append(time.time())

def retry(exceptions, tries=3, delay=2, backoff=2):
    """Decorator factory: call the wrapped function up to ``tries`` times, rate-limited.

    Before every attempt ``enforce_rate_limit()`` is called. If an attempt raises one
    of ``exceptions``, the wrapper waits and tries again: 30 seconds when the error
    text contains "RATE_LIMIT_EXCEEDED" or "429", otherwise the current backoff delay.

    Args:
        exceptions: Exception class or tuple of classes that trigger a retry.
        tries: Maximum number of attempts; values below 1 are treated as 1 so the
            wrapped function is always called at least once.
        delay: Initial wait in seconds between attempts.
        backoff: Factor the wait is multiplied by after each failed attempt.

    Returns:
        A decorator. The decorated function returns the wrapped function's result.

    Raises:
        The exception of the last attempt, re-raised immediately (no wait) once all
        attempts are used up.
    """
    max_attempts = max(1, tries)

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            wait_time = delay
            for attempt in range(1, max_attempts + 1):
                try:
                    enforce_rate_limit()  # Apply rate limit before calling API
                    return func(*args, **kwargs)
                except exceptions as e:
                    if attempt == max_attempts:
                        # Out of attempts: surface the error now instead of sleeping first.
                        raise
                    if "RATE_LIMIT_EXCEEDED" in str(e) or "429" in str(e):
                        print("Rate limit exceeded. Waiting before retry...")
                        time.sleep(30)  # Cooldown for quota limit
                    else:
                        time.sleep(wait_time)  # Normal retry delay
                    wait_time *= backoff
        return wrapper
    return decorator

@retry((Exception,), tries=3, delay=2, backoff=2)
def create_embedding(data):
    """Embed the 'content' text of one dataset record with the Google embedding model.

    The record is sent as a one-element batch, and the result of ``embed_documents``
    is returned unchanged. Rate limiting and retries come from the ``retry`` decorator.
    """
    single_document_batch = [data['content']]
    return embedding_model.embed_documents(single_document_batch)

# Prepare batches
batches = [dataset[i:i + BATCH_SIZE] for i in range(0, len(dataset), BATCH_SIZE)]

MAX_UPLOAD_ATTEMPTS = 3
failed_batches = []  # batches that could not be uploaded after all attempts

# Initialize tqdm progress bar
with tqdm(total=len(dataset), desc="Uploading points") as pbar:
    for batch in batches:
        points = []
        for data in batch:
            # Loop-local name: the global `dense_embeddings` probe (used to size the
            # collection) must not be overwritten here.
            embeddings = create_embedding(data)
            points.append(
                models.PointStruct(
                    id=data["id"],
                    vector={dense_model_name: embeddings[0]},
                    payload=data
                )
            )

        # Upload batch, retrying a limited number of times
        for attempt in range(1, MAX_UPLOAD_ATTEMPTS + 1):
            try:
                enforce_rate_limit()  # Rate limit before uploading
                client.upload_points(COLLECTION_NAME, points=points)
                break
            except Exception as e:
                rate_limited = "RATE_LIMIT_EXCEEDED" in str(e) or "429" in str(e)
                if rate_limited:
                    print("Rate limit exceeded. Cooling down...")
                else:
                    print(f"Error uploading batch: {e}")
                # Only wait when another attempt follows.
                if attempt < MAX_UPLOAD_ATTEMPTS:
                    time.sleep(30 if rate_limited else 5)
        else:
            # No attempt succeeded: remember the batch instead of dropping it silently.
            failed_batches.append(batch)

        # Update progress bar
        pbar.update(len(batch))

if failed_batches:
    failed_points = sum(len(batch) for batch in failed_batches)
    print(
        f"Upload finished with errors: {failed_points} of {len(dataset)} points "
        f"in {len(failed_batches)} batch(es) were NOT uploaded (see `failed_batches`)."
    )
else:
    print("Upload complete!")

Uploading points: 100%|██████████| 856/856 [06:08<00:00,  2.32it/s]

Upload complete!





# Hybrid Search

In [None]:
# Example question; the following cells embed it and use the vector to search the collection.
query = "What is the difference between objects and classes?"

In [None]:
# Embed the question with the same model that produced the stored vectors.
dense_query_vector = embedding_model.embed_query(query)

# Candidate generation: the 20 nearest points on the dense named vector.
# Only this one prefetch branch is defined, so the fusion step receives a single candidate list.
dense_prefetch = models.Prefetch(
    query=dense_query_vector,
    using=dense_model_name,
    limit=20,
)
prefetch = [dense_prefetch]

# Combine the prefetched candidates with Reciprocal Rank Fusion and keep the top 10,
# returning each point's payload.
rrf_fusion = models.FusionQuery(fusion=models.Fusion.RRF)
results = client.query_points(
    COLLECTION_NAME,
    prefetch=prefetch,
    query=rrf_fusion,
    with_payload=True,
    limit=10,
)

In [None]:
# Plain dense search: query the named dense vector directly (no prefetch or fusion) and
# keep the 10 best matches with their payloads. This rebinds `results`.
results = client.query_points(
    COLLECTION_NAME,
    using=dense_model_name,
    query=dense_query_vector,
    with_payload=True,
    limit=10,
)

In [None]:
# Show the chunk text stored in the payload of the first returned point.
results.points[0].payload["content"]

'Programming, Data Structures and Algorithms using Python Professor Madhavan Mukund Class and Objects So, continuing with our discussion of slightly more exotic aspects of Python, let us look at classes and objects. (Refer Slide Time: 0:15) So, most often classes and objects arise in the context of what are called abstract data types. So, we have data types as we know, in Python, we have lists, we have dictionaries. And when we have a data type, we have certain permitted operations on these. For a list, for example, you can append to it, or you can combine two lists using plus you can concatenate them, with a dictionary, you can create a new entry with the key, you can update it, and so on. You can get X, extract all the keys of a dictionary, extract all the values and so on. Now, sometimes we need to create our own data type. And this data type will typically have two parts; it will have some information that is stored in it. But there may also be some discipline or some required way 