In [None]:
!pip install docx2txt torch transformers python-pptx Pillow
!pip install llama-index llama-index-llms-groq==0.1.3 groq==0.4.2 llama-index-embeddings-huggingface==0.2.0

#### GEMINI FOR ALL MULTIMODAL DATA (TEXT + IMAGES)
!pip install llama-index-embeddings-gemini
!pip install 'google-generativeai>=0.3.0' matplotlib
!pip install llama-index-multi-modal-llms-gemini
!pip install llama-index-vector-stores-qdrant
!pip install llama-index-llms-gemini
!pip install llama-index-readers-file pymupdf

#######FOR IMAGES CLIP EMBEDDING
!pip install llama_hub
## VECTOR DB: QDRANT
!pip install llama-index-vector-stores-qdrant
!pip install llama-index-multi-modal-llms-ollama
!pip install qdrant-client pdfminer.six python-docx python-pptx pandas

In [None]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader, Document,
    StorageContext,
    ServiceContext,
    load_index_from_storage
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.llms.groq import Groq
# import os
# from dotenv import load_dotenv
# load_dotenv()
import warnings
warnings.filterwarnings('ignore')

In [None]:
########### SET CONFIGURATION, KEYS & GLOBAL VARIABLES
# Load configuration from the config.json file. Keys read by this notebook:
# 'groq_api_key', 'hf_token', 'directories', 'directories_images' and,
# optionally, 'google_api_key'.
import json
import os

with open('/content/config.json', 'r') as f:
    config = json.load(f)

#from google.colab import userdata
#GROQ_API_KEY = userdata.get('groq')

GROQ_API_KEY = config['groq_api_key']

# Credentials must never be hardcoded in the notebook source: the Google key is
# read from config.json ('google_api_key') or from an already-exported
# GOOGLE_API_KEY environment variable.
# NOTE(review): a literal key was previously committed on this line; it should
# be revoked/rotated in the Google console.
GOOGLE_API_KEY = config.get('google_api_key') or os.environ.get('GOOGLE_API_KEY')
if GOOGLE_API_KEY:
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
else:
    # print() rather than warnings.warn(): warnings are globally silenced above.
    print("GOOGLE_API_KEY is not configured; Gemini-based features will be unavailable.")

In [None]:
#### LOAD IMAGES AND BUILD A LIST OF DOCUMENTS, EACH CARRYING ITS CLIP IMAGE EMBEDDING
####### MULTIMODAL INDEX
#Create and Use Qdrant Vector Store
#Set up two collections in Qdrant: one for text embeddings and one for image embeddings.
import qdrant_client
from qdrant_client.models import VectorParams, Distance
from transformers import AutoTokenizer, AutoModel

# Define the schema for your collections
# One Qdrant collection per modality; the names are reused by the vector
# stores, the upsert calls and the reader cell further down.
text_collection_name = "text_collection"
image_collection_name = "image_collection"
# Hugging Face checkpoints: the text model is used for the 384-dim text
# collection and the CLIP model for the 512-dim image collection (sizes are set
# where the collections are created below).
hf_text_model_name='sentence-transformers/all-MiniLM-L6-v2'
hf_image_model_name='openai/clip-vit-base-patch32'
# Hugging Face access token, read from config.json.
hf_token=config['hf_token']
# Set up Qdrant client
# The client is opened on a filesystem path under /content.
client = qdrant_client.QdrantClient(path="/content/qdrant_storage1",)

# SAVE IMAGE EMBEDDINGS IN QDRANT

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import os
import uuid

from pdfminer.high_level import extract_text as extract_pdf_text
from docx import Document as DocxDocument
from pptx import Presentation
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client.http.models import PointStruct

# CLIP checkpoint used for image embeddings (same value as hf_image_model_name;
# the separate name is kept because later cells may reference it).
hf_model_name='openai/clip-vit-base-patch32'
# `use_auth_token` is deprecated in transformers in favour of `token`; the
# install cell does not pin transformers, so the supported keyword is used.
clip_processor = CLIPProcessor.from_pretrained(hf_model_name, token=config['hf_token'])
clip_model = CLIPModel.from_pretrained(hf_model_name, token=config['hf_token'])

# File extensions routed to the image pipeline and to the text pipeline.
image_extensions = {".png", ".jpg", ".jpeg", ".bmp", ".gif"}
text_extensions = {".pdf", ".ppt", ".pptx", ".txt", ".rtf",".csv",".xlsx",".html",".md",".docx",".xls"}

# Accumulators for the documents produced by the processing cells.
all_text_doclistWithembeddings=[]
all_image_doclistWithembeddings=[]

# JSON file mapping file path -> last processed modification time.
history_file = "/content/load_history.json"

# Tokenizer/model pair used by embed_text() for the text collection.
text_tokenizer = AutoTokenizer.from_pretrained(hf_text_model_name, token=hf_token)
text_model = AutoModel.from_pretrained(hf_text_model_name, token=hf_token)

# Create each collection only when it is missing.
# The previous unconditional recreate_collection() wiped all stored vectors on
# every run, while load_history.json kept marking the files as already
# processed -- so a second run skipped every file and left the collections
# empty. Keeping existing collections makes the incremental history meaningful.
_collection_dims = {
    text_collection_name: 384,   # text embedding size
    image_collection_name: 512,  # CLIP image embedding size
}
_existing_collections = {c.name for c in client.get_collections().collections}
for _collection, _dim in _collection_dims.items():
    if _collection not in _existing_collections:
        client.create_collection(
            collection_name=_collection,
            vectors_config=VectorParams(size=_dim, distance=Distance.COSINE),
        )

# Initialize Qdrant vector stores (after the collections are known to exist)
text_vector_store = QdrantVectorStore(
    client=client,
    collection_name=text_collection_name
)

image_vector_store = QdrantVectorStore(
    client=client,
    collection_name=image_collection_name
)

In [None]:


def load_history():
    """Return the persisted processing history, or an empty dict if none exists.

    The history is read as JSON from the module-level ``history_file`` path.
    """
    if not os.path.exists(history_file):
        return {}
    with open(history_file, 'r') as history_fp:
        return json.load(history_fp)

def save_history():
    """Serialize the module-level ``file_history`` mapping to ``history_file`` as JSON."""
    serialized = json.dumps(file_history)
    with open(history_file, 'w') as history_fp:
        history_fp.write(serialized)

def processimage_document(file_path):
    """Embed ``file_path`` if it is an image that still needs processing.

    Returns the document produced by ``embed_image``; returns ``None`` when the
    extension is not in ``image_extensions`` or ``should_process`` says the file
    does not need (re)processing.
    """
    _, suffix = os.path.splitext(file_path)
    if suffix.lower() not in image_extensions:
        return None
    if not should_process(file_path):
        return None
    return embed_image(file_path)

def embed_image(file_path):
    """Embed one image with CLIP, store the vector in Qdrant and return its document.

    Args:
        file_path: Path of the image file to embed.

    Returns:
        The first document loaded by ``SimpleDirectoryReader`` for the file, with
        ``metadata['embeddings']`` (the CLIP image features) and ``metadata['id']``
        (a fresh UUID string, also used as the Qdrant point id) attached.
    """
    # Open through a context manager so the underlying file handle is released
    # as soon as the processor has read the pixels (the original leaked it).
    with Image.open(file_path) as image:
        inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        embeddings = clip_model.get_image_features(**inputs)

    # Create a Document object for the same file
    print("reading ", file_path)
    document = SimpleDirectoryReader(input_files=[file_path]).load_data()

    # Attach embeddings and a point id to the document
    point_id = str(uuid.uuid4())
    document[0].metadata['embeddings'] = embeddings
    document[0].metadata['id'] = point_id

    # Prepare the Qdrant point: first (only) row of the feature tensor as a list
    point = PointStruct(
        id=point_id,
        vector=embeddings[0].tolist(),
        payload={"file_path": file_path}
    )
    client.upsert(
        collection_name=image_collection_name,
        points=[point]
    )
    print("Embedding saved for ", file_path)
    return document[0]

def extract_text_content(file_path):
    """Extract text from a document, embed it, store it in Qdrant and return the document.

    Dedicated extractors are used for ``.pdf``, ``.docx``, ``.pptx`` and
    ``.xlsx``. For every other extension (and whenever a dedicated extractor
    yields no text) the text loaded by ``SimpleDirectoryReader`` is used, so the
    stored embedding is never computed from an empty string just because the
    format has no dedicated extractor.

    Args:
        file_path: Path of the document to process.

    Returns:
        The first document loaded by ``SimpleDirectoryReader`` for the file, with
        ``metadata['embeddings']`` (output of ``embed_text``) and
        ``metadata['id']`` (a fresh UUID string, also the Qdrant point id).
    """
    extension = os.path.splitext(file_path)[1].lower()
    text = ""
    # The image lists returned by the extractors are not used here.
    if extension == ".pdf":
        text, _ = extract_pdf_content(file_path)
    elif extension == ".docx":
        text, _ = extract_docx_content(file_path)
    elif extension == ".pptx":
        text, _ = extract_pptx_content(file_path)
    elif extension == ".xlsx":
        text = extract_xlsx_content(file_path)

    print("reading ", file_path)
    document = SimpleDirectoryReader(input_files=[file_path]).load_data()
    print("prepared doc for ", file_path)

    # Fallback for formats without a dedicated extractor (.txt, .csv, .md, ...):
    # previously `text` stayed "" and an empty string was embedded and stored.
    if not text.strip():
        text = "\n".join(doc.text for doc in document)

    embeddings = embed_text(text)
    point_id = str(uuid.uuid4())
    document[0].metadata['embeddings'] = embeddings
    document[0].metadata['id'] = point_id

    # Prepare Qdrant PointStruct: first (only) row of the embedding as a list
    point = PointStruct(
        id=point_id,
        vector=embeddings[0].tolist(),
        payload={"file_path": file_path, "text": text}
    )
    client.upsert(
        collection_name=text_collection_name,
        points=[point]
    )
    print("Embedding with 384 saved for ",file_path)
    return document[0]

def extract_pdf_content(file_path):
    """Return ``(text, images)`` for a PDF file.

    Text comes from ``extract_pdf_text``; images from ``extract_images_from_pdf``.
    """
    pdf_text = extract_pdf_text(file_path)
    return pdf_text, extract_images_from_pdf(file_path)

def extract_docx_content(file_path):
    """Return ``(text, images)`` for a DOCX file.

    The text is every paragraph's text joined with newlines; images come from
    ``extract_images_from_docx``.
    """
    docx_file = DocxDocument(file_path)
    paragraph_texts = (paragraph.text for paragraph in docx_file.paragraphs)
    joined_text = "\n".join(paragraph_texts)
    return joined_text, extract_images_from_docx(file_path)

def extract_pptx_content(file_path):
    """Return ``(text, images)`` for a PPTX file.

    The text is the ``text`` of every shape that has one, in slide order,
    joined with newlines; images come from ``extract_images_from_pptx``.
    """
    deck = Presentation(file_path)
    shape_texts = []
    for slide in deck.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                shape_texts.append(shape.text)
    return "\n".join(shape_texts), extract_images_from_pptx(file_path)

def extract_xlsx_content(file_path):
    """Return the contents of an Excel workbook as a plain-text table.

    Args:
        file_path: Path of the ``.xlsx`` file.

    Returns:
        The string rendering (``DataFrame.to_string()``) of what
        ``pandas.read_excel`` loads from the file with its default arguments.
    """
    # `pd` was never imported anywhere in the notebook, so this function raised
    # NameError on first use; import pandas locally to keep the cell self-contained.
    import pandas as pd

    df = pd.read_excel(file_path)
    return df.to_string()

def extract_images_from_pdf(file_path):
    """Placeholder for PDF image extraction; currently always returns an empty list.

    TODO: implement image extraction from PDF.
    """
    return list()

def extract_images_from_docx(file_path):
    """Placeholder for DOCX image extraction; currently always returns an empty list.

    TODO: implement image extraction from DOCX.
    """
    return list()

def extract_images_from_pptx(file_path):
    """Placeholder for PPTX image extraction; currently always returns an empty list.

    TODO: implement image extraction from PPTX.
    """
    return list()

def embed_text(text):
    """Embed text with the module-level tokenizer/model using masked mean pooling.

    Args:
        text: A string (or a list of strings) to embed. Input is truncated to
            512 tokens by the tokenizer call below.

    Returns:
        A NumPy array with one row per input text.
    """
    inputs = text_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = text_model(**inputs)
    token_states = outputs.last_hidden_state
    # Weight each token by its attention mask so padding tokens do not dilute
    # the average. For a single string there is no padding, so the result is
    # the same plain mean as before; for a batch it is now correct per row.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    embeddings = summed / token_counts
    return embeddings.numpy()

def should_process(file_path):
    """Tell whether ``file_path`` is new or modified since it was last recorded.

    Returns ``False`` when the path does not exist or when the recorded
    modification time in ``file_history`` is not older than the current one.
    Otherwise records the current modification time in ``file_history`` and
    returns ``True``.
    """
    if not os.path.exists(file_path):
        return False
    modified_at = os.path.getmtime(file_path)
    if file_path in file_history and not (file_history[file_path] < modified_at):
        return False
    file_history[file_path] = modified_at
    return True

def parse_document(file_path):
    """Process ``file_path`` through the text pipeline when it is a supported file.

    Args:
        file_path: Path of the candidate document.

    Returns:
        The document returned by ``extract_text_content``, or ``None`` when the
        path is not a regular file or its extension is not in ``text_extensions``.
    """
    # The original tested the *global* `filepath` (a leftover loop variable from
    # another cell) instead of this function's own `file_path` argument, which
    # raised NameError on a fresh kernel or checked the wrong file.
    if not os.path.isfile(file_path):
        return None
    extension = os.path.splitext(file_path)[1].lower()
    if extension not in text_extensions:
        return None
    return extract_text_content(file_path)


In [None]:
### EMBED IMAGES
## READ HISTORY TO AVOID PROCESSING SAME FILE AGAIN
file_history = load_history()

for filename in os.listdir(config['directories_images']):
    filepath = os.path.join(config['directories_images'], filename)
    if not os.path.isfile(filepath):
        continue  # Skip non-file items
    getimagedocumentWIthEmbedding = processimage_document(filepath)
    # processimage_document returns None for non-image files and for files that
    # are unchanged since the last run; appending those Nones broke the node
    # parser downstream, so only real documents are collected.
    if getimagedocumentWIthEmbedding is not None:
        all_image_doclistWithembeddings.append(getimagedocumentWIthEmbedding)

## RECORD FILES ALREADY READ AND PROCESSED
save_history()
len(all_image_doclistWithembeddings)

In [None]:
## READ HISTORY TO AVOID PROCESSING SAME FILE AGAIN
file_history = load_history()
#### PROCESS FILES OTHER THAN IMAGES
all_text_doclistWithembeddings=[]
for dirfullpath in config['directories']:
    if not os.path.isdir(dirfullpath):
        print("Not a directory", dirfullpath)
        continue  # Skip non-dir items
    for filename in os.listdir(dirfullpath):
        extension = os.path.splitext(filename)[1].lower()
        if extension not in text_extensions:
            continue
        # Build the path once; the "/" concatenation is kept so the keys stay
        # identical to those already stored in the history file.
        candidate_path = dirfullpath+"/"+filename
        if not should_process(candidate_path):
            continue
        print("processing file", filename)
        parsed_document = parse_document(candidate_path)
        # parse_document returns None for entries that are not regular files
        # (e.g. a directory named "x.pdf"); keep Nones out of the document list.
        if parsed_document is not None:
            all_text_doclistWithembeddings.append(parsed_document)

save_history()
len(all_text_doclistWithembeddings)

In [None]:
### WITH EMBEDDINGS PREPARE NODES
from llama_index.core import Settings
from llama_index.core.node_parser import SemanticSplitterNodeParser

# `embed_model` was referenced but never defined anywhere in the notebook
# (NameError on a fresh kernel). Use the same text embedding model that the
# rest of the text pipeline is configured with.
embed_model = HuggingFaceEmbedding(model_name=hf_text_model_name)

splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
)
# Drop any None placeholders left by skipped files before splitting.
nodes_with_embeddings = splitter.get_nodes_from_documents(
    [doc for doc in all_text_doclistWithembeddings if doc is not None]
)
nodes_img_embedded = splitter.get_nodes_from_documents(
    [doc for doc in all_image_doclistWithembeddings if doc is not None]
)

In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank
# We choose a model with relatively high speed and decent accuracy.

#Integrate LlamaIndex for Multi-Model Vector Index
#Integrate LlamaIndex with the Qdrant vector store to create a multi-model vector index.
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
)
from llama_index.vector_stores.qdrant import QdrantVectorStore

# construct vector store and customize storage context
# NOTE(review): these storage contexts are created but not passed to the
# indexes below, so the indexes are built in memory; confirm whether they were
# meant to be backed by the Qdrant collections.
storage_context_images = StorageContext.from_defaults(
    vector_store=image_vector_store)

storage_context_text = StorageContext.from_defaults(
    vector_store=text_vector_store)

# `llm_groq_client` was used but never created. The model name can be
# overridden with an optional 'groq_model' entry in config.json.
llm_groq_client = Groq(
    model=config.get("groq_model", "llama-3.1-8b-instant"),
    api_key=GROQ_API_KEY,
)

service_context_emb_text = ServiceContext.from_defaults(
    embed_model=HuggingFaceEmbedding(model_name=hf_text_model_name), llm=llm_groq_client)

# Initialize the query engine
index_path = "/content/embedded_path_storage/"

#emd_index = VectorStoreIndex.from_documents(all_text_doclistWithembeddings,chunk_size=4096,nodes_parsers=nodes_with_embeddings)

# build the text index and query it first, so the text answer is produced even
# if the image index below cannot be built
emd_text_index = VectorStoreIndex(nodes_with_embeddings, service_context=service_context_emb_text)
# VectorStoreIndex exposes as_query_engine(); get_query_engine() does not exist.
query_engine = emd_text_index.as_query_engine()

response = query_engine.query("describe the parquet header components as per documents provided only?")
print(response)

# Image index.
# - The original passed the CLIP model as `llm`; it is not a LlamaIndex LLM, so
#   the Groq client is used instead.
# - The original passed an undefined `service_context`; the image service
#   context defined here is the evident intent.
# NOTE(review): HuggingFaceEmbedding is a text-embedding wrapper; presumably it
# cannot produce CLIP image embeddings from this checkpoint -- verify, and
# consider a dedicated multi-modal index/embedding for images.
service_context_emb_image = ServiceContext.from_defaults(
    embed_model=HuggingFaceEmbedding(model_name=hf_image_model_name), llm=llm_groq_client)
emd_images_index = VectorStoreIndex(nodes_img_embedded, service_context=service_context_emb_image)

In [None]:
###READER
%pip install llama-index-readers-qdrant

In [None]:
from llama_index.readers.qdrant import QdrantReader

from qdrant_client import QdrantClient
from qdrant_client.http.models import ScoredPoint

# Close the notebook's client before the reader opens the same storage path.
# After this point `client` can no longer be used by other cells.
try:
    client.close()
    print("Qdrant client closed successfully.")
except Exception as e:
    print(f"Error closing Qdrant client: {e}")
# Initialize Qdrant client pointing to the local storage
# Define QdrantReader using the correct initialization
reader = QdrantReader(path="/content/qdrant_storage1")
# NOTE: Required args are collection_name, query_vector.
# See the Python client: https://github.com/qdrant/qdrant_client
# for more details.
# An all-zero vector has no direction, so cosine similarity against it cannot
# rank anything; embed a real query with the same model that produced the
# stored text vectors (384 dimensions) instead.
query_vector = embed_text("describe the parquet header components")[0].tolist()

# Load data from Qdrant
documents = reader.load_data(
    collection_name="text_collection",
    query_vector=query_vector,
    limit=5
)

# Print loaded documents
for doc in documents:
    print(doc)

In [None]:
# Report the installed python-pptx version (useful when shape APIs differ).
import pptx
print("pptx version: {}".format(pptx.__version__))


pptx version: 0.6.23


In [None]:
import os
from docx import Document
from pptx import Presentation
import fitz  # PyMuPDF library for PDFs
from bs4 import BeautifulSoup  # For HTML parsing


def extract_images(filepath, output_folder):
    """
    Extracts embedded images from a document and saves them to ``output_folder``.

    Supported inputs are ``.docx``, ``.pptx`` and ``.pdf`` (extension matched
    case-insensitively). Each image is written with its original bytes and its
    native extension, named ``<document stem>_<descriptor>.<ext>`` where the
    descriptor is the image index (DOCX) or ``<slide/page number>_<image index>``
    (PPTX/PDF). Other formats only print a message.

    Args:
        filepath (str): Path to the document file.
        output_folder (str): Path to the folder where images will be saved
            (created if missing).
    """
    filename, ext = os.path.splitext(os.path.basename(filepath))
    ext = ext.lower()  # '.PDF' and '.pdf' must be treated alike
    os.makedirs(output_folder, exist_ok=True)

    def _save(image_bytes, image_ext, descriptor):
        # Write raw image bytes under the native extension (default: png).
        image_ext = (image_ext or "png").lstrip(".")
        target = os.path.join(output_folder, f"{filename}_{descriptor}.{image_ext}")
        with open(target, 'wb') as f:
            f.write(image_bytes)

    if ext == '.docx':
        # python-docx has no `inline_shapes.graphics`; embedded images are the
        # document part's relationships whose type is an image relationship.
        doc = Document(filepath)
        image_rels = [
            rel for rel in doc.part.rels.values()
            if "image" in rel.reltype and not rel.is_external  # linked images have no bytes
        ]
        for i, rel in enumerate(image_rels, start=1):
            image_part = rel.target_part
            image_ext = os.path.splitext(str(image_part.partname))[1]
            _save(image_part.blob, image_ext, i)

    elif ext == '.pptx':
        # `slide.shapes` has no `pictures` attribute; select Picture shapes.
        from pptx.shapes.picture import Picture

        prs = Presentation(filepath)
        for slide_idx, slide in enumerate(prs.slides, start=1):
            pictures = [shape for shape in slide.shapes if isinstance(shape, Picture)]
            for i, pic in enumerate(pictures, start=1):
                try:
                    image = pic.image
                except ValueError:
                    # Picture is linked rather than embedded: nothing to save.
                    continue
                _save(image.blob, image.ext, f"{slide_idx}_{i}")

    elif ext == '.pdf':
        # page.get_images() yields tuples whose first item is the image xref
        # (the original expected 6-item tuples and therefore saved nothing).
        with fitz.open(filepath) as doc:
            for page_idx in range(len(doc)):
                page = doc.load_page(page_idx)
                for i, image in enumerate(page.get_images(), start=1):
                    extracted = doc.extract_image(image[0])
                    if not extracted:
                        print(f"Could not extract image: {image}")
                        continue
                    _save(extracted["image"], extracted.get("ext"), f"{page_idx+1}_{i}")

    # Handle unsupported formats (xls, md, webp)
    elif ext in ('.xls', '.xlsx', '.md', '.webp'):
        print(f"Image extraction from '{ext}' format not directly supported yet.")

    else:
        print(f"File format '{ext}' not supported.")

# Example usage
# Run extract_images over every entry of /content/test_mix, writing all
# extracted images into a single shared output folder.
output_folder = "/content/extracted_images"
for filename in os.listdir("/content/test_mix"):
            # os.listdir returns bare names; join with the directory to get a usable path.
            filepath = os.path.join("/content/test_mix", filename)
            print("processing ", filename)
            extract_images(filepath, output_folder)


processing  UserManual_JointDeclaration_member.pdf
Unexpected data format for image: (13, 0, 225, 225, 8, 'DeviceRGB', '', 'Image13', 'FlateDecode')
Unexpected data format for image: (38, 0, 1426, 640, 8, 'DeviceRGB', '', 'Image38', 'FlateDecode')
Unexpected data format for image: (39, 0, 1430, 545, 8, 'DeviceRGB', '', 'Image39', 'FlateDecode')
Unexpected data format for image: (40, 0, 1332, 359, 8, 'DeviceRGB', '', 'Image40', 'FlateDecode')
Unexpected data format for image: (43, 0, 1426, 1812, 8, 'DeviceRGB', '', 'Image43', 'FlateDecode')
Unexpected data format for image: (46, 0, 1431, 1827, 8, 'DeviceRGB', '', 'Image46', 'FlateDecode')
Unexpected data format for image: (49, 0, 1430, 1252, 8, 'DeviceRGB', '', 'Image49', 'FlateDecode')
Unexpected data format for image: (62, 0, 1363, 1097, 8, 'DeviceRGB', '', 'Image62', 'FlateDecode')
Unexpected data format for image: (65, 0, 1426, 1205, 8, 'DeviceRGB', '', 'Image65', 'FlateDecode')
Unexpected data format for image: (66, 0, 1426, 718, 8

AttributeError: 'SlideShapes' object has no attribute 'pictures'

In [None]:
###CODE TO SHOW IMAGES IN NOTEBOOK
from PIL import Image
import matplotlib.pyplot as plt
import os


def plot_images(image_paths, max_images=9):
    """Show up to ``max_images`` existing image files in a 3-column grid.

    Args:
        image_paths: Iterable of image file paths; paths that are not existing
            files are skipped.
        max_images: Maximum number of images to draw (default 9).

    Nothing is drawn when no path points to an existing file.
    """
    existing_paths = [path for path in image_paths if os.path.isfile(path)][:max(max_images, 0)]
    if not existing_paths:
        return

    # The original used a fixed 2x3 grid while allowing up to 9 images, which
    # raised ValueError on the 7th subplot. Size the grid from the image count,
    # keeping the original 2-row layout for six images or fewer.
    n_cols = 3
    n_rows = max(2, -(-len(existing_paths) // n_cols))  # ceiling division

    plt.figure(figsize=(16, 9))
    for position, img_path in enumerate(existing_paths, start=1):
        # Context manager closes the file once imshow has copied the pixels.
        with Image.open(img_path) as image:
            plt.subplot(n_rows, n_cols, position)
            plt.imshow(image)
            plt.xticks([])
            plt.yticks([])
# show sources
from llama_index.core.response.notebook_utils import display_source_node

# "text_nodes"/"image_nodes" are only present in response.metadata for
# multi-modal query engines. The response above comes from a plain text query
# engine, where indexing those keys raised KeyError, so fall back to the
# response's source nodes for text and to no images.
_response_metadata = response.metadata or {}
for text_node in _response_metadata.get("text_nodes", response.source_nodes):
    display_source_node(text_node, source_length=200)
plot_images(
    [n.metadata["file_path"] for n in _response_metadata.get("image_nodes", [])]
)