In [52]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
from io import BytesIO
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams, Distance
import re

# Initialize the model for vectorizing text
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text.append(page.get_text("text"))
        # Check for images in the page
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(BytesIO(image_bytes))
            # Use pytesseract to do OCR on the image
            text.append(pytesseract.image_to_string(image))
    return "\n".join(text)

# Function to split text into sentences
def split_text_into_sentences(text):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    return sentences

# Function to vectorize the text
def vectorize_text(sentences):
    return model.encode(sentences)

# Function to store vectors in Qdrant
def store_vectors_in_qdrant(vectors, texts, qdrant_client, collection_name):
    points = [
        PointStruct(
            id=i,
            vector=vector.tolist(),
            payload={"text": text}
        )
        for i, (vector, text) in enumerate(zip(vectors, texts))
    ]
    qdrant_client.upsert(
        collection_name=collection_name,
        points=points
    )

# Function to fetch vector from a given word input
def fetch_vector_from_word(word):
    vector = model.encode(word)
    return vector

# Function to fetch the closest word(s) from vector input
def fetch_word_from_vector(vector, qdrant_client, collection_name, top_k=1):
    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=vector.tolist(),
        limit=top_k,
        with_payload=True
    )
    return [hit.payload["text"] for hit in search_result]

# Function to fetch vector directly from Qdrant using word input
def fetch_vector_from_qdrant_by_word(word, qdrant_client, collection_name, top_k=1):
    vector = fetch_vector_from_word(word)
    closest_words = fetch_word_from_vector(vector, qdrant_client, collection_name, top_k=top_k)
    return closest_words

# Main function to process the PDF and store vectors
def process_pdf(pdf_path, qdrant_client, collection_name):
    # Extract text from PDF
    text = extract_text_from_pdf(pdf_path)
    sentences = split_text_into_sentences(text)
    
    # Vectorize the text
    vectors = vectorize_text(sentences)
    
    # Store vectors in Qdrant
    store_vectors_in_qdrant(vectors, sentences, qdrant_client, collection_name)

# Example usage
pdf_path = 'example.pdf'
collection_name = 'pdf_texts'

# Initialize Qdrant client
qdrant_client = QdrantClient(
    url="https://22d49f19-31bc-4841-9696-77d665b462be.us-east4-0.gcp.cloud.qdrant.io:6333", 
    api_key="YYT-1bIz6NjE0ZGY0f0zMMQH3uJe-rrB8707Q7OLLhBcQC8ZBH7hNg",
)

# Create a collection in Qdrant (if it doesn't exist)
qdrant_client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE)  # Adjust the size to match your vector dimensions
)

# Process the PDF and store vectors
process_pdf(pdf_path, qdrant_client, collection_name)



`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.


`recreate_collection` method is deprecated and will be removed in the future. Use `collection_exists` to check collection existence and `create_collection` instead.



In [53]:

word = "Venture"
closest_texts = fetch_vector_from_qdrant_by_word(word, qdrant_client, collection_name, top_k=5)
print(f"Closest text(s) in Qdrant to '{word}':")
for i in closest_texts:
    print(i)

Closest text(s) in Qdrant to 'Venture':
Additionally, participants 
have the opportunity to forge connections within the vibrant Venture 
Development Centre community, enabling them to explore avenues 
conducive to crafting a robust, customer-centric startup from inception.

• Certificate from Venture Development Center in collaboration with 
entrepreneurial bodies : Northeastern University-Center for Emerging 
Markets  (Boston, USA)  , Centrep-Malaysia , Tie Vizag, i-TBI, G-TEC 
• Access to advanced Maker Space, MURTI Lab and other facilities at 
GITAM.
Designed to cater to a diverse array of aspiring studentprenuers, this initiative 
welcomes individuals from various sectors who exhibit the drive and creativity 
to innovate, execute, and establish impactful ventures.
• Visit to AMTZ to gain exposure the bio -medical start-up ecosystem
• Extensive network of mentors for entrepreneurial ventures.
(Go To 
Market Strategy, Marketing Plan & Customer Acquisition) 
Day 7: Financial Manageme

In [54]:
# Work in Progress

import plotly.express as px
import pandas as pd

# Create a dummy vector with 384 dimensions
dummy_vector = [0] * 384

# Fetch all vectors from Qdrant
search_result = qdrant_client.search(
    collection_name=collection_name,
    query_vector=dummy_vector,
    limit=1000000,  # Set a high limit
    with_payload=True
)

# Extract vectors and texts from search result
vectors = []
texts = []
for hit in search_result:
    if hit.vector is not None:
        vectors.append(hit.vector)
        texts.append(hit.payload["text"])

print(texts)



[]
