# Essentials: Data Ingestion for Beginners

Reference tutorial: <https://qdrant.tech/documentation/data-ingestion-beginners/>

In [9]:
from fastembed import ImageEmbedding, TextEmbedding
from PIL import Image
from qdrant_client import QdrantClient, models

In [2]:
# Embedding models, one per modality. `image_model` is used for the
# "image_embedding" vectors and `text_model` for the "text_embedding" vectors
# in the cells below.
image_model = ImageEmbedding(model_name="Qdrant/clip-ViT-B-32-vision")
text_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [3]:
# Qdrant client opened with the ":memory:" location (no server address given).
client = QdrantClient(":memory:")

# Each point in the collection can carry two named vectors, both compared
# with cosine distance: a 384-dim text vector and a 512-dim image vector.
text_vector_params = models.VectorParams(
    size=384,
    distance=models.Distance.COSINE,
)
image_vector_params = models.VectorParams(
    size=512,
    distance=models.Distance.COSINE,
)

# Kept as the last expression so the cell still displays the call's result.
client.create_collection(
    collection_name="products-data",
    vectors_config={
        "text_embedding": text_vector_params,
        "image_embedding": image_vector_params,
    },
)

True

In [4]:
from langchain_community.document_loaders import DirectoryLoader

In [5]:
# Load every file matching "**/*.pdf" under the "data" directory.
# Each loaded document is later read through `.page_content` (its text) and
# `.metadata["source"]` (recorded in the payload as the file's source).
pdf_loader = DirectoryLoader("data", glob="**/*.pdf", show_progress=False)
pdfs = pdf_loader.load()

In [61]:
# Ingest the PDF documents: embed each document's text and store it under the
# "text_embedding" named vector, with the text and its source as payload.
#
# Point ids come from a running counter that is bumped before use, so the
# first PDF gets id 2. `unique_id` is left at the last id handed out so the
# image-ingestion cell can continue the sequence.
unique_id = 1
pdf_points = []

for doc in pdfs:
    unique_id += 1
    source = doc.metadata["source"]
    content = doc.page_content
    # embed() is consumed as an iterable; a single text gives a single result.
    embedding = list(text_model.embed(content))[0]
    pdf_points.append(
        models.PointStruct(
            id=unique_id,
            vector={"text_embedding": embedding},
            payload={"review": content, "source": source},
        )
    )

# One batched upload instead of one client call per document.
client.upload_points(collection_name="products-data", points=pdf_points)

In [7]:
# Load every file matching "**/*.png" or "**/*.jpg" under the "images"
# directory. The resulting documents are used below for their `.page_content`
# (embedded as text) and `.metadata["source"]` (the path reopened with PIL).
image_loader = DirectoryLoader(
    "images",
    glob=["**/*.png", "**/*.jpg"],
    show_progress=False,
)
images = image_loader.load()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [62]:
# Ingest the images: each point stores two named vectors, a text embedding of
# the document's `page_content` and an image embedding of the file itself,
# plus the text and source path as payload.
#
# `unique_id` continues the counter started in the PDF-ingestion cell, so that
# cell must have been run first.
image_points = []

for doc in images:
    unique_id += 1
    source = doc.metadata["source"]
    content = doc.page_content

    text_embedding = list(text_model.embed(content))[0]
    # The context manager closes the image file once it has been embedded.
    with Image.open(source) as image:
        image_embedding = list(image_model.embed(image))[0]

    image_points.append(
        models.PointStruct(
            id=unique_id,
            vector={
                # Use this image's own text embedding. The previous code
                # stored `embedding`, a stale variable left over from the
                # PDF loop, so every image got the last PDF's text vector.
                "text_embedding": text_embedding,
                "image_embedding": image_embedding,
            },
            payload={"review": content, "source": source},
        )
    )

# One batched upload instead of one client call per image.
client.upload_points(collection_name="products-data", points=image_points)

In [63]:
# Text search: embed the query string with the text model, then search the
# collection against the "text_embedding" named vector.
text_query_vector = list(text_model.query_embed("Singapore Johor"))[0]

hits = client.query_points(
    collection_name="products-data",
    using="text_embedding",
    query=text_query_vector,
)

In [64]:
# Payload "source" of the first point returned for the text query.
hits.points[0].payload["source"]

'data/Singapore winners.pdf'

In [65]:
# Image search: embed a query image with the image model, then search the
# collection against the "image_embedding" named vector.
# The context manager closes the image file once it has been embedded; the
# original inline Image.open(...) left the handle open.
with Image.open("images/wiki_computer_science.png") as query_image:
    image_query_vector = list(image_model.embed(query_image))[0]

hits = client.query_points(
    collection_name="products-data",
    query=image_query_vector,
    using="image_embedding",
)

In [66]:
# Payload "source" of the first point returned for the image query.
hits.points[0].payload["source"]

'images/wiki_computer_science.png'