In [None]:
#%pip install llama_index ftfy regex tqdm
#%pip install git+https://github.com/openai/CLIP.git
#%pip install torch torchvision
#%pip install matplotlib scikit-image
#%pip install -U qdrant_client
#%pip install wikipedia
#%pip install llama_index
#%pip install arxiv
#!pip install fitz
#!pip install frontend
#!pip install pymupdf
#!pip install pypdf


In [None]:
from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt
import os
import qdrant_client
from llama_index import (
    ServiceContext,
    SimpleDirectoryReader,
)
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index import VectorStoreIndex, StorageContext
from llama_index.indices.multi_modal.base import MultiModalVectorStoreIndex

# Folder that receives the downloaded Wikipedia text files (written as
# "<title>.txt" by the download cell further down).
data_path = Path("static/data_wiki")

### Only if you need to download data from the web

In [None]:
from pathlib import Path
import requests

# Wikipedia articles whose plain-text extract is saved as "<title>.txt"
# inside data_path.
wiki_titles = [
    "Natural_language_processing",
    "Vector_database",
    "Large_language_model",
    "batman",
    "Vincent van Gogh",
    "San Francisco",
    "iPhone",
    "Tesla Model S",
    "BTS",
    "Transformer",
]

# Create the target folder once, before the loop. parents=True is required
# because data_path is nested and its parent folder may not exist yet;
# exist_ok=True makes the cell safe to re-run.
data_path.mkdir(parents=True, exist_ok=True)

for title in wiki_titles:
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": True,
        },
        # Without a timeout a stalled connection blocks the cell forever.
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    print(title)

    page = next(iter(response.json()["query"]["pages"].values()))
    wiki_text = page.get("extract")
    if wiki_text is None:
        # The page entry carries no "extract" key, e.g. when the title does
        # not resolve to an article; skip it rather than abort the whole run.
        print(f"  no extract returned for {title!r}, skipping")
        continue

    # Explicit UTF-8: article text contains non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    with open(data_path / f"{title}.txt", "w", encoding="utf-8") as fp:
        fp.write(wiki_text)

In [None]:
import wikipedia
import urllib.request

# Running counter used as the id (and file stem) of every downloaded image.
image_uuid = 0
# image_metadata_dict maps image uuid -> {"filename": ..., "img_path": ...}
image_metadata_dict = {}
# Upper bound on the number of images downloaded per Wikipedia page.
MAX_IMAGES_PER_WIKI = 30

wiki_titles = [
    "Natural_language_processing",
    "Vector_database",
    "Large_language_model",
    "San Francisco",
    "Batman",
    "Vincent van Gogh",
    "iPhone",
    "Tesla Model S",
    "BTS band",
    "Transformer",
]

# image_path was used below without ever being defined, which made this cell
# fail with a NameError.
# NOTE(review): the folder is assumed to be the same "static/data_wiki"
# directory that the text files use and that the index is built from; confirm.
image_path = Path("static/data_wiki")
# parents=True because the folder is nested; exist_ok=True for safe re-runs.
image_path.mkdir(parents=True, exist_ok=True)


# Download images for wiki pages
# Assign a UUID to each image
for title in wiki_titles:
    images_per_wiki = 0
    print(title)
    try:
        list_img_urls = wikipedia.page(title).images
    except Exception as err:
        # Best effort: report the real reason (missing page, disambiguation,
        # network error, ...) and move on to the next title.
        print(f"Could not load images for Wikipedia page {title!r}: {err}")
        continue

    for url in list_img_urls:
        # Stop once exactly MAX_IMAGES_PER_WIKI images have been saved.
        if images_per_wiki >= MAX_IMAGES_PER_WIKI:
            break
        if not url.endswith((".jpg", ".png")):
            continue

        next_uuid = image_uuid + 1
        # Files keep the ".jpg" suffix even for PNG sources, as before.
        target_file = image_path / f"{next_uuid}.jpg"
        try:
            urllib.request.urlretrieve(url, target_file)
        except Exception as err:
            # Skip only this image; the id is not consumed, so the next
            # successful download reuses (and overwrites) the same file name.
            print(f"  failed to download {url}: {err}")
            continue

        # Record metadata only after the file is actually on disk, so the
        # dict never points at an image that was not downloaded.
        image_uuid = next_uuid
        image_file_name = title + "_" + url.split("/")[-1]
        # img_path could be s3 path pointing to the raw image file in the future
        image_metadata_dict[image_uuid] = {
            "filename": image_file_name,
            "img_path": "./" + str(target_file),
        }
        images_per_wiki += 1

In [None]:
!wget --user-agent "Mozilla" https://arxiv.org/pdf/2305.09288.pdf -O "static/data_wiki/TSC.pdf"
!wget --user-agent "Mozilla" https://arxiv.org/pdf/2402.04853.pdf -O "static/data_wiki/llmIR.pdf"
!wget --user-agent "Mozilla" https://arxiv.org/pdf/2305.09288.pdf -O "static/static/static/static/static/static/static/static/static/static/static/static/static/static/static/static/static/static/static/data_wiki/llama2.pdf"
import re

import arxiv

# Construct the default API client. It gets its own name so it is not
# confused with the Qdrant client that is also called `client` further down.
arxiv_client = arxiv.Client()

# Search for the 20 most recently submitted articles matching "retriever".
search = arxiv.Search(
    query="retriever",
    max_results=20,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)

# The PDFs are written next to the other source documents; create the folder
# first so the download does not fail on a fresh checkout.
pdf_dir = "./static/data_wiki"
os.makedirs(pdf_dir, exist_ok=True)

# The client yields results lazily, one per iteration.
for r in arxiv_client.results(search):
    print(r.title)
    # Paper titles can contain characters that are not valid in file names
    # (path separators, colons, line breaks, ...). Replace each run of them
    # with a single space; titles without such characters are unchanged.
    safe_title = re.sub(r'[\\/:*?"<>|\s]+', " ", r.title).strip()
    r.download_pdf(dirpath=pdf_dir, filename=f"{safe_title}.pdf")

### create or download the vector database

In [None]:

# Local Qdrant client backed by the "qdrant_db" path.
client = qdrant_client.QdrantClient(path="qdrant_db")

# One vector store per modality, both served by the same client:
# text chunks go to "text_collection", images to "image_collection".
text_store, image_store = (
    QdrantVectorStore(client=client, collection_name=collection_name)
    for collection_name in ("text_collection", "image_collection")
)




### To load the model 

In [None]:
# use Huggingface embeddings
from llama_index.embeddings import HuggingFaceEmbedding

# Text embedding model, loaded by name through HuggingFaceEmbedding.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Shared settings for indexing and retrieval: 1024-token chunks, the embedding
# model above, and no LLM configured (llm=None).
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=None,
    embed_model=embed_model,
)

# Attach an index to the existing text and image vector stores.
index = MultiModalVectorStoreIndex.from_vector_store(
    vector_store=text_store,
    service_context=service_context,
    image_vector_store=image_store,
)
# load index


## To create a new vector database for your retriever

In [None]:

# Storage context that routes text vectors to text_store and image vectors
# to image_store.
storage_context = StorageContext.from_defaults(
    vector_store=text_store,
    image_store=image_store,
)

# Read every file found under ./static/data_wiki/ ...
documents = SimpleDirectoryReader("./static/data_wiki/").load_data()

# ... and build the multi-modal index from those documents.
index = MultiModalVectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context,
)

### To load your model already created

In [None]:
def plot_images(image_paths, max_images=9, n_cols=3):
    """Show the images stored at ``image_paths`` in a grid of subplots.

    Paths that do not point to an existing file are skipped silently. At most
    ``max_images`` images are shown. The grid has ``n_cols`` columns and as
    many rows as needed, but never fewer than two, so up to six images are
    laid out exactly as in the previous fixed 2x3 grid.

    The earlier version always used a 2x3 grid while allowing up to nine
    images, so a seventh image made ``plt.subplot`` raise ``ValueError``.

    Args:
        image_paths: Iterable of file-system paths to image files.
        max_images: Maximum number of images to display (default 9).
        n_cols: Number of columns in the grid (default 3).

    Raises:
        ValueError: If ``max_images`` or ``n_cols`` is smaller than 1.
    """
    if max_images < 1 or n_cols < 1:
        raise ValueError("max_images and n_cols must both be at least 1")

    # Collect the paths to show first, so the grid can be sized to fit them.
    paths_to_show = []
    for img_path in image_paths:
        if os.path.isfile(img_path):
            paths_to_show.append(img_path)
            if len(paths_to_show) >= max_images:
                break

    if not paths_to_show:
        # Nothing to draw: avoid creating an empty figure.
        return

    # Ceiling division without importing math; at least two rows keeps the
    # original layout for small result sets.
    n_rows = max(2, -(-len(paths_to_show) // n_cols))

    plt.figure(figsize=(16, 9))
    for slot, img_path in enumerate(paths_to_show, start=1):
        image = Image.open(img_path)

        plt.subplot(n_rows, n_cols, slot)
        plt.imshow(image)
        # Hide the axis ticks; they carry no meaning for pictures.
        plt.xticks([])
        plt.yticks([])

### Example

In [None]:
# Example questions for the retriever. Exactly one is selected by the index
# after the list; -1 picks the last entry.
test_query = [
    "what is a large language model?",
    "who is BTS?",
    "what are Vincent van Gogh's famous paintings",
    "what is a transformer?",
    "what is the popular tourist attraction in San Francisco",
][-1]


In [None]:
from llama_index.response.notebook_utils import display_source_node
from llama_index.schema import ImageNode
test_query = "who is BTS?"


def show(test_query):
    """Run ``test_query`` against the global ``index`` and display the hits.

    A retriever is created with ``similarity_top_k=3`` and
    ``image_similarity_top_k=3``. Every non-image hit is rendered with
    ``display_source_node`` (200 characters of source text), followed by its
    ``file_path`` metadata and a separator line. The ``file_path`` values of
    image hits are collected, printed as a list and handed to
    ``plot_images``.
    """
    hits = index.as_retriever(
        similarity_top_k=3, image_similarity_top_k=3
    ).retrieve(test_query)

    image_files = []
    for hit in hits:
        node = hit.node
        if not isinstance(node, ImageNode):
            # Text hit: show an excerpt and where it came from.
            display_source_node(hit, source_length=200)
            print("*File_path:", node.metadata["file_path"])
            print("-------------------------------------------------------------------------------------------")
            continue
        # Image hit: remember the file so it can be plotted afterwards.
        image_files.append(node.metadata["file_path"])

    print(image_files)
    plot_images(image_files)


show(test_query)