## 1. SQL and vector search

In [42]:
from sqlalchemy.engine import URL
from pgvector.sqlalchemy import Vector
from typing import List, Optional
from sqlalchemy import Integer, String, select
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

In [None]:
db_url = URL.create(
    drivername="postgresql+psycopg",
    username="postgres",
    password="password",
    host="localhost",
    port=5557,
    database="similarity_search_service_db"
)

In [None]:
# Create the base class for the table definition
class Base(DeclarativeBase):
    __abstract__ = True


# Create the table definition
class Images(Base):
    __tablename__ = "images"
    VECTOR_LENGTH = 512
    
    # primary key
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # image path - we will use it to store the path to the image file, after similarity search we can use it to retrieve the image and display it
    image_path: Mapped[str] = mapped_column(String(256))
    # image embedding - we will store the image embedding in this column, the image embedding is a list of 512 floats this is the output of the sentence transformer model
    image_embedding: Mapped[List[float]] = mapped_column(Vector(VECTOR_LENGTH))

In [23]:
from sqlalchemy import create_engine

engine = create_engine(db_url)

In [24]:
Base.metadata.create_all(engine)

In [25]:
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, Session

# reusable function to insert data into the table
def insert_image(engine: sqlalchemy.Engine, image_path: str, image_embedding: list[float]):
    with Session(engine) as session:
        # create the image object
        image = Images(
            image_path=image_path,
            image_embedding=image_embedding
        )
        # add the image object to the session
        session.add(image)
        # commit the transaction
        session.commit()

# insert some data into the table
N = 100
for i in range(N):
    image_path = f"image_{i}.jpg"
    image_embedding = np.random.rand(512).tolist()
    insert_image(engine, image_path, image_embedding)

# select first image from the table
with Session(engine) as session:
    image = session.query(Images).first()


# calculate the cosine similarity between the first image and the K rest of the images, order the images by the similarity score
def find_k_images(engine, k: int, orginal_image: Images):
    with Session(engine) as session:
        result = session.execute(
            select(Images)
            .order_by(Images.image_embedding.cosine_distance(orginal_image.image_embedding))
            .limit(k), 
            execution_options={"prebuffer_rows": True}
        )
        return list(result.scalars().all())

# find the 10 most similar images to the first image
k = 10
similar_images = find_k_images(engine, k, image)

In [29]:
[img.image_path for img in similar_images]

['image_0.jpg',
 'image_63.jpg',
 'image_64.jpg',
 'image_18.jpg',
 'image_69.jpg',
 'image_60.jpg',
 'image_26.jpg',
 'image_40.jpg',
 'image_80.jpg',
 'image_51.jpg']

In [31]:
# find the images with the similarity score greater than 0.9
def find_images_with_similarity_score_greater_than(engine: sqlalchemy.Engine, similarity_score: float, orginal_image: Images) -> list[Images]:
    with Session(engine) as session:
        result = session.execute(
            select(Images)
            .filter(Images.image_embedding.cosine_similarity(orginal_image.image_embedding) > similarity_score), 
            execution_options={"prebuffer_rows": True}
        )
        return result

In [32]:
from datasets import load_dataset

dataset = load_dataset("FronkonGames/steam-games-dataset")

# get columns names and types
columns = dataset["train"].features
print(columns)

columns_to_keep = ["Name", "Windows", "Linux", "Mac", "About the game", "Supported languages", "Price"]

N = 40000
dataset = dataset["train"].select_columns(columns_to_keep).select(range(N))

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-e2ed184370a069(…):   0%|          | 0.00/123M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/83560 [00:00<?, ? examples/s]

{'AppID': Value('int64'), 'Name': Value('string'), 'Release date': Value('string'), 'Estimated owners': Value('string'), 'Peak CCU': Value('int64'), 'Required age': Value('int64'), 'Price': Value('float64'), 'DLC count': Value('int64'), 'About the game': Value('string'), 'Supported languages': Value('string'), 'Full audio languages': Value('string'), 'Reviews': Value('string'), 'Header image': Value('string'), 'Website': Value('string'), 'Support url': Value('string'), 'Support email': Value('string'), 'Windows': Value('bool'), 'Mac': Value('bool'), 'Linux': Value('bool'), 'Metacritic score': Value('int64'), 'Metacritic url': Value('string'), 'User score': Value('int64'), 'Positive': Value('int64'), 'Negative': Value('int64'), 'Score rank': Value('float64'), 'Achievements': Value('int64'), 'Recommendations': Value('int64'), 'Notes': Value('string'), 'Average playtime forever': Value('int64'), 'Average playtime two weeks': Value('int64'), 'Median playtime forever': Value('int64'), 'Medi

In [36]:
from sqlalchemy import Integer, Float, Boolean


class Games(Base):
    __tablename__ = "games"
    __table_args__ = {'extend_existing': True}
    
    # the vector size produced by the model taken from documentation https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2
    VECTOR_LENGTH = 512
        
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    name: Mapped[str] = mapped_column(String(256))
    description: Mapped[str] = mapped_column(String(4096))
    windows: Mapped[bool] = mapped_column(Boolean)
    linux: Mapped[bool] = mapped_column(Boolean)
    mac: Mapped[bool] = mapped_column(Boolean)
    price: Mapped[float] = mapped_column(Float)
    game_description_embedding: Mapped[List[float]] = mapped_column(Vector(VECTOR_LENGTH))

Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

In [37]:
from sentence_transformers import SentenceTransformer


checkpoint = "distiluse-base-multilingual-cased-v2"
model = SentenceTransformer(checkpoint)


def generate_embeddings(text: str) -> list[float]:
    return model.encode(text)

modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [48]:
from tqdm import tqdm


def insert_games(engine, dataset):
    with tqdm(total=len(dataset)) as pbar:
        for i, game in enumerate(dataset):
            game_description = game["About the game"] or ""
            game_embedding = generate_embeddings(game_description)
            name, windows, linux, mac, price = game["Name"], game["Windows"], game["Linux"], game["Mac"], game["Price"]
            if name and windows and linux and mac and price and game_description:
                game = Games(
                    name=game["Name"], 
                    description=game_description[0:4096],
                    windows=game["Windows"], 
                    linux=game["Linux"], 
                    mac=game["Mac"], 
                    price=game["Price"], 
                    game_description_embedding=game_embedding
                )
                with Session(engine) as session:
                    session.add(game)
                    session.commit()
            pbar.update(1)

insert_games(engine, dataset)

100%|██████████| 40000/40000 [1:14:12<00:00,  8.98it/s]


In [49]:
def find_game(
    engine: sqlalchemy.Engine, 
    game_description: str, 
    windows: Optional[bool] = None, 
    linux: Optional[bool] = None,
    mac: Optional[bool] = None,
    price: Optional[int] = None
):
    with Session(engine) as session:
        game_embedding = generate_embeddings(game_description)
        query = (
            select(Games)
            .order_by(Games.game_description_embedding.cosine_distance(game_embedding))
        )
        
        if price:
            query = query.filter(Games.price <= price)
        if windows:
            query = query.filter(Games.windows == True)
        if linux:
            query = query.filter(Games.linux == True)
        if mac:
            query = query.filter(Games.mac == True)
        
        result = session.execute(query, execution_options={"prebuffer_rows": True})
        game = result.scalars().first()
        
        return game

In [50]:
game = find_game(engine, "This is a game about a hero who saves the world", price=10)
print(f"Game: {game.name}")
print(f"Description: {game.description}")

game = find_game(engine, game_description="Home decorating", price=20)
print(f"Game: {game.name}")
print(f"Description: {game.description}")

Game: Ultimate Spider Hero
Description: Ultimate Spider Hero game was designed for real heroes! Your mission is to help poor residents of the Metropolis and to save them from the terrible monsters. Move forward to fight your enemies and try not to fall! Features: Simple and addictive gameplay Nice graphics Awesome Ultimate Spider Hero Countless Steam achievements for you to collect! Compatibility with multiple major platforms (Windows, Mac, Linux, SteamOS) Make your way through the endless labyrinths of long, confusing city streets together with your favorite hero from countless movies and cartoons! Although this may look simple enough, things are not as easy as they seem. You will have to learn how to cling into houses properly using your web, otherwise you will fall to your demise. If you manage to do so - you will become a real superhero, armed with elusiveness, agility and speed and the ability to tirelessly swing across the rooftops and between the huge skyscrapers this urban land

## 2. RAG

In [1]:
from pymilvus import MilvusClient

host = "localhost"
port = "19530"

milvus_client = MilvusClient(
    host=host,
    port=port
)

In [2]:
from pymilvus import FieldSchema, DataType, CollectionSchema

VECTOR_LENGTH = 768  # check the dimensionality for Silver Retriever Base (v1.1) model

id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, description="Primary id")
text = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=4096, description="Page text")
embedding_text = FieldSchema("embedding", dtype=DataType.FLOAT_VECTOR, dim=VECTOR_LENGTH, description="Embedded text")

fields = [id_field, text, embedding_text]

schema = CollectionSchema(fields=fields, auto_id=True, enable_dynamic_field=True, description="RAG Texts collection")

In [3]:
COLLECTION_NAME = "rag_texts_and_embeddings"

milvus_client.create_collection(
    collection_name=COLLECTION_NAME,
    schema=schema
)

index_params = milvus_client.prepare_index_params()

index_params.add_index(
    field_name="embedding", 
    index_type="HNSW",
    metric_type="L2",
    params={"M": 4, "efConstruction": 64}  # lower values for speed
) 

milvus_client.create_index(
    collection_name=COLLECTION_NAME,
    index_params=index_params
)

# checkout our collection
print(milvus_client.list_collections())

# describe our collection
print(milvus_client.describe_collection(COLLECTION_NAME))

['rag_texts_and_embeddings']
{'collection_name': 'rag_texts_and_embeddings', 'auto_id': True, 'num_shards': 1, 'description': 'RAG Texts collection', 'fields': [{'field_id': 100, 'name': 'id', 'description': 'Primary id', 'type': <DataType.INT64: 5>, 'params': {}, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'text', 'description': 'Page text', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 4096}}, {'field_id': 102, 'name': 'embedding', 'description': 'Embedded text', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'functions': [], 'aliases': [], 'collection_id': 462220191432704077, 'consistency_level': 2, 'properties': {}, 'num_partitions': 1, 'enable_dynamic_field': True, 'created_timestamp': 462220195133915140}


In [4]:
# define data source and destination
## the document origin destination from which document will be downloaded 
pdf_url = "https://www.iab.org.pl/wp-content/uploads/2024/04/Przewodnik-po-sztucznej-inteligencji-2024_IAB-Polska.pdf"

## local destination of the document
file_name = "Przewodnik-po-sztucznej-inteligencji-2024_IAB-Polska.pdf"

## local destination of the processed document 
file_json = "Przewodnik-po-sztucznej-inteligencji-2024_IAB-Polska.json"

## local destination of the embedded pages of the document
embeddings_json = "Przewodnik-po-sztucznej-inteligencji-2024_IAB-Polska-Embeddings.json"

## local destination of all above local required files
data_dir = "./milvus_db/data"

In [6]:
# download data
import os
import requests

def download_pdf_data(pdf_url: str, file_name: str) -> None:
    response = requests.get(pdf_url, stream=True)
    with open(os.path.join(data_dir, file_name), "wb") as file:
        for block in response.iter_content(chunk_size=1024):
            if block:
                file.write(block)

download_pdf_data(pdf_url, file_name)

In [7]:
# prepare data

import fitz
import json


def extract_pdf_text(file_name, file_json):
    document = fitz.open(os.path.join(data_dir, file_name))
    pages = []

    for page_num in range(len(document)):
        page = document.load_page(page_num)
        page_text = page.get_text()
        pages.append({"page_num": page_num, "text": page_text})

    with open(os.path.join(data_dir, file_json), "w") as file:
        json.dump(pages, file, indent=4, ensure_ascii=False)

extract_pdf_text(file_name, file_json)

In [8]:
# vectorize data

import torch
import numpy as np
from sentence_transformers import SentenceTransformer


def generate_embeddings(file_json, embeddings_json, model):
    pages = []
    with open(os.path.join(data_dir, file_json), "r") as file:
        data = json.load(file)

    for page in data:
        pages.append(page["text"])

    embeddings = model.encode(pages)

    embeddings_paginated = []
    for page_num in range(len(embeddings)):
        embeddings_paginated.append({"page_num": page_num, "embedding": embeddings[page_num].tolist()})

    with open(os.path.join(data_dir, embeddings_json), "w") as file:
        json.dump(embeddings_paginated, file, indent=4, ensure_ascii=False)

model_name = "ipipan/silver-retriever-base-v1.1"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)
generate_embeddings(file_json, embeddings_json, model)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/144 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
def insert_embeddings(file_json, embeddings_json, client=milvus_client):
    rows = []
    with open(os.path.join(data_dir, file_json), "r") as t_f, open(os.path.join(data_dir, embeddings_json), "r") as e_f:
        text_data, embedding_data = json.load(t_f), json.load(e_f)
        text_data =  list(map(lambda d: d["text"], text_data))
        embedding_data = list(map(lambda d: d["embedding"], embedding_data))
        
        for page, (text, embedding) in enumerate(zip(text_data, embedding_data)):
            rows.append({"text":text, "embedding": embedding})

    client.insert(collection_name="rag_texts_and_embeddings", data=rows)


insert_embeddings(file_json, embeddings_json)

# load inserted data into memory
milvus_client.load_collection("rag_texts_and_embeddings")

In [30]:
# search
def search(model, query, client=milvus_client):
    embedded_query = model.encode(query).tolist()
    result = client.search(
        collection_name="rag_texts_and_embeddings", 
        data=[embedded_query], 
        limit=1,
        search_params={"metric_type": "L2"},
        output_fields=["text"]
    )
    return result


result = search(model, query="Czym jest sztuczna inteligencja")

In [23]:
result[0]

[{'id': 462220191432705404, 'distance': 29.125164031982422, 'entity': {'text': 'Historia powstania\nsztucznej inteligencji\n7\nW języku potocznym „sztuczny" oznacza to, co\njest \nwytworem \nmającym \nnaśladować \ncoś\nnaturalnego. W takim znaczeniu używamy\nterminu ,,sztuczny\'\', gdy mówimy o sztucznym\nlodowisku lub oku. Sztuczna inteligencja byłaby\nczymś (programem, maszyną) symulującym\ninteligencję naturalną, ludzką.\nSztuczna inteligencja (AI) to obszar informatyki,\nktóry skupia się na tworzeniu programów\nkomputerowych zdolnych do wykonywania\nzadań, które wymagają ludzkiej inteligencji. \nTe zadania obejmują rozpoznawanie wzorców,\nrozumienie języka naturalnego, podejmowanie\ndecyzji, uczenie się, planowanie i wiele innych.\nGłównym celem AI jest stworzenie systemów,\nktóre są zdolne do myślenia i podejmowania\ndecyzji na sposób przypominający ludzki.\nHistoria sztucznej inteligencji sięga lat 50. \nXX wieku, kiedy to powstały pierwsze koncepcje\ni modele tego, co mogłoby st

In [None]:
import os
from google import genai

GEMINI_KEY = os.getenv("GEMINI_API_KEY")
gemini_client = genai.Client(api_key=GEMINI_KEY)

MODEL = "gemini-2.0-flash"

def generate_response(prompt: str):
    try:
        # Send request to Gemini 2.0 Flash API and get the response
        response = gemini_client.models.generate_content(
            model=MODEL,
            contents=prompt,
        )
        return response.text 
    except Exception as e:
        print(f"Error generating response: {e}")
        return None

In [33]:
def build_prompt(context: str, query: str) -> str:
    prompt = f"""
Odpowiedz na pytanie na podstawie kontekstu.

Kontekst:
{context}

Pytanie:
{query}
"""
    return prompt
    

def rag(model, query: str) -> str:
    # run search in vector DB
    search_results = search(model, query)
    context = search_results[0][0].entity.get("text")
    
    # build prompt for Gemini
    prompt = build_prompt(context, query)
    
    # get LLM answer
    response = generate_response(prompt)
    return response

### 2.1. Sprawdzenie

#### 2.1.1. Sprawdzenie czy odczytuje dokument

In [36]:
rag(model, "Czym jest sztuczna inteligencja?")

'Sztuczna inteligencja (AI) to obszar informatyki, który skupia się na tworzeniu programów komputerowych zdolnych do wykonywania zadań, które wymagają ludzkiej inteligencji, takich jak rozpoznawanie wzorców, rozumienie języka naturalnego, podejmowanie decyzji, uczenie się i planowanie. Celem jest stworzenie systemów, które potrafią myśleć i podejmować decyzje na sposób przypominający ludzki. W potocznym znaczeniu, sztuczna inteligencja to coś (program, maszyna) symulujące inteligencję naturalną, ludzką.\n'

#### 2.1.2. Sprawdzenie czy nie halucynuje

In [38]:
rag(model, "Wymień prezydentów USA po 1945 roku.")

'Na podstawie dostarczonego kontekstu, nie można wymienić prezydentów USA po 1945 roku. Tekst dotyczy sztucznej inteligencji (AI) i różnych perspektyw na jej rozwój, a nie historii Stanów Zjednoczonych.\n'