In [1]:
import asyncio
import os
import requests
import json
from dotenv import load_dotenv
from time import sleep
from typing import Any, Awaitable, Dict, List
from openai import AsyncOpenAI
from qdrant_client import AsyncQdrantClient, models

In [2]:
# Load environment variables via python-dotenv before the keys are read below.
load_dotenv()
# Required secrets: a missing variable raises KeyError right here instead of failing later.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
AIDEVS_API_KEY = os.environ["AIDEVS_API_KEY"]
# Name of the Qdrant collection that holds the embedded link archive.
COLLECTION_NAME = "links_archive"
# Vector size configured for the collection; it has to equal the length of the
# embeddings returned by the model used in this notebook (text-embedding-ada-002).
VECTORS_DIM = 1536

In [3]:
# Start the services from the local compose file in detached mode
# (the captured output shows a `qdrant` container being created and started).
!docker-compose up -d

[1A[1B[0G[?25l[+] Building 0.0s (0/0)                                    docker:desktop-linux
[?25h[1A[1B[0G[?25l[+] Running 2/0
 [32m✔[0m Network c03l04_default  [32mCreated[0m                                         [34m0.0s [0m
 [32m✔[0m Container qdrant        [32mCreated[0m                                         [34m0.0s [0m
[?25h[1A[1A[1A[0G[?25l[34m[+] Running 2/2[0m
 [32m✔[0m Network c03l04_default  [32mCreated[0m                                         [34m0.0s [0m
 [32m✔[0m Container qdrant        [32mCreated[0m                                         [34m0.0s [0m
[?25h[1A[1A[1A[0G[?25l[34m[+] Running 2/2[0m
 [32m✔[0m Network c03l04_default  [32mCreated[0m                                         [34m0.0s [0m
 [32m✔[0m Container qdrant        [32mStarted[0m                                         [34m0.0s [0m
[?25h

In [4]:
# Shared async clients reused by the cells below.
openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
# The port is numeric; pass it as an int, which is what the client signature expects.
qdrant_client = AsyncQdrantClient(host="localhost", port=6333)

In [5]:
# Both archives are JSON documents; their entries are merged into a single list.
ARCHIVE_URLS = (
    "https://unknow.news/archiwum.json",
    "https://unknow.news/archiwum_aidevs.json",
)

links_archive = []
for archive_url in ARCHIVE_URLS:
    # Retrieving a resource is a GET; the timeout keeps the cell from hanging forever.
    response = requests.get(archive_url, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    links_archive += response.json()

print(links_archive[0], end="\n\n")
print(len(links_archive))

{'title': 'Demaskowanie kłamstw Devina - jak to było ze zleceniami z Upwork?', 'url': 'https://www.youtube.com/watch?v=tNmgmwEtoWE', 'info': 'INFO: Devin, zwany "pierwszym, napędzanym przez AI, prawdziwym programistą", nie działa do końca tak, jak to przedstawiono. Jego twórcy opublikowali film, w którym automat wykonuje zlecenia opublikowane na platformie Upwork i dostaje za nie pieniądze od zleceniodawców. Autor filmu, do którego linkują, rozłożył to nagranie reklamowe na części i przeanalizował je, momentami nawet klatka po klatce. Wyłowił, co Devin miał zrobić, a co naprawdę zrobił, i omówił, jak kiepsko mu to wyszło. Wygląda na to, że firma nie prezentuje do końca uczciwie możliwości swojego oprogramowania, wprowadzając użytkowników (i potencjalnych inwestorów) w błąd.', 'date': '2024-04-19'}

7075


In [6]:
async def aprep_vector_db(aclient: AsyncQdrantClient) -> None:
    """Make sure the ``COLLECTION_NAME`` collection exists in Qdrant.

    The collection is created only when it is missing, with vectors of size
    ``VECTORS_DIM`` compared by cosine distance. An existing collection is
    left untouched.

    Args:
        aclient: Async Qdrant client used to check for and create the collection.
    """
    if await aclient.collection_exists(COLLECTION_NAME):
        return

    await aclient.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=VECTORS_DIM,
            distance=models.Distance.COSINE,
        ),
    )

In [7]:
async def aprep_vector_point(aclient: AsyncOpenAI, idx: int, doc: Dict) -> models.PointStruct:
    """Embed one archive entry and wrap it in a Qdrant point.

    Args:
        aclient: Async OpenAI client used to request the embedding.
        idx: Identifier assigned to the resulting point.
        doc: Archive entry; its ``"info"`` text is embedded and the whole dict
            is stored as the point payload.

    Returns:
        A ``PointStruct`` with id ``idx``, the embedding vector and ``doc`` as payload.

    Raises:
        KeyError: If ``doc`` has no ``"info"`` key.
    """
    # Entries carry a literal "INFO: " label in front of the description; strip it
    # only when it is really there instead of blindly cutting six characters.
    text = doc["info"].removeprefix("INFO: ")
    response = await aclient.embeddings.create(
        input=text,
        model="text-embedding-ada-002"
    )
    return models.PointStruct(
        id=idx, vector=response.data[0].embedding, payload=doc
    )

In [8]:
async def main(step: int = 500, pause_seconds: float = 10) -> bool:
    """Embed every entry of ``links_archive`` and upload the points to Qdrant.

    The archive is processed in chunks of ``step`` documents with a pause
    between chunks, because of the rate limits for text-embedding-ada-002.

    Args:
        step: Number of documents embedded concurrently in one chunk.
        pause_seconds: Delay between two consecutive chunks.

    Returns:
        ``True`` once all points have been handed to ``upload_points``.
    """
    await aprep_vector_db(qdrant_client)

    vc_points: List[models.PointStruct] = []
    total = len(links_archive)
    for start in range(0, total, step):
        end = min(start + step, total)
        if end - start < step:
            print("Getting embeddings for last docs")
        else:
            print(f"Getting embeddings for docs from index {start} to {end}")

        # Number the points by their position in the whole archive. Restarting the
        # count for every chunk would reuse ids 0..step-1 and make later chunks
        # overwrite earlier ones on upload.
        vc_point_tasks = [
            aprep_vector_point(openai_client, idx, doc)
            for idx, doc in enumerate(links_archive[start:end], start=start)
        ]
        vc_points += await asyncio.gather(*vc_point_tasks)

        if end < total:
            # Non-blocking pause: time.sleep() here would freeze the event loop.
            await asyncio.sleep(pause_seconds)

    await qdrant_client.upload_points(COLLECTION_NAME, points=vc_points)
    return True

In [9]:
# Schedule main() on the current event loop and print its result once it completes.
# NOTE(review): run_coroutine_threadsafe() hands back a Future right away, so this
# cell does not wait for main(); make sure "Main loop returned" has been printed
# before running the cells that query the collection.
def _report_main_result(future):
    print(f"Main loop returned: {future.result()}")


loop = asyncio.get_event_loop()
coroutine = asyncio.run_coroutine_threadsafe(main(), loop)
coroutine.add_done_callback(_report_main_result)

Getting embeddings for docs from index 0 to 500
Getting embeddings for docs from index 500 to 1000
Getting embeddings for docs from index 1000 to 1500
Getting embeddings for docs from index 1500 to 2000
Getting embeddings for docs from index 2000 to 2500
Getting embeddings for docs from index 2500 to 3000
Getting embeddings for docs from index 3000 to 3500
Getting embeddings for docs from index 3500 to 4000
Getting embeddings for docs from index 4000 to 4500
Getting embeddings for docs from index 4500 to 5000
Getting embeddings for docs from index 5000 to 5500
Getting embeddings for docs from index 5500 to 6000
Getting embeddings for docs from index 6000 to 6500
Getting embeddings for docs from index 6500 to 7000
Getting embeddings for last docs
Main loop returned: True


In [10]:
#### THE CELLS BELOW CAN ONLY BE RUN ONCE THE VECTOR DB CONTAINS DATA ####

In [11]:
async def test(
    query: str = "Co różni pseudonimizację od anonimizowania danych?",
    limit: int = 3,
) -> None:
    """Smoke-test the index: embed ``query`` and print the closest archive entries.

    Args:
        query: Text to search for; defaults to the sample question.
        limit: Maximum number of hits requested from Qdrant.
    """
    response = await openai_client.embeddings.create(
        input=query,
        model="text-embedding-ada-002"
    )
    test_query = response.data[0].embedding

    hits = await qdrant_client.search(collection_name=COLLECTION_NAME, query_vector=test_query, limit=limit)
    for hit in hits:
        # Inner quotes differ from the outer ones so the f-string also parses before Python 3.12.
        print(f"title: {hit.payload['title']}, url: {hit.payload['url']}, score: {hit.score}")

In [12]:
# Schedule test() on the current event loop; the callback prints its return value
# once the search has finished.
def _report_test_result(future):
    print(f"Test loop returned: {future.result()}")


loop = asyncio.get_event_loop()
coroutine = asyncio.run_coroutine_threadsafe(test(), loop)
coroutine.add_done_callback(_report_test_result)

title: Czym się różni pseudonimizacja od anonimizacji? (film, 46 minut), url: https://www.internet-czas-dzialac.pl/pseudonimizacja-a-anonimizacja/, score: 0.89958835
title: Jak śledzą nas strony internetowe partii politycznych? [Wybory 2023], url: https://www.internet-czas-dzialac.pl/wybory-2023-jak-partie-polityczne-szanuja-twoja-prywatnosc-na-swoich-stronach-internetowych/, score: 0.85214186
title: Społeczne oziębienie - efekt uboczny Big Data, url: https://www.socialcooling.com/, score: 0.84547544
Test loop returned: None


In [13]:
# Authorise for the "search" task; the response carries the token used by the
# task and answer endpoints.
response = requests.post(
    "https://tasks.aidevs.pl/token/search",
    json=dict(apikey=AIDEVS_API_KEY),
    timeout=30,
)
response_json = response.json()
if "token" not in response_json:
    # Show what the server actually said instead of failing with a bare KeyError.
    raise RuntimeError(f"Token request failed: {response_json}")
token = response_json["token"]

response = requests.get(f"https://tasks.aidevs.pl/task/{token}", timeout=30)
response_json = response.json()

print(response_json)
question = response_json["question"]
# Filled by task() with candidate URLs; start with an empty list so the type stays consistent.
answer = []

{'code': 0, 'msg': 'Index all data from provided URL into vecto store and provide answer to my question - https://unknow.news/archiwum_aidevs.json', 'question': 'Co różni pseudonimizację od anonimizowania danych?'}


In [14]:
async def task() -> List[str]:
    """Find the archive entries closest to the module-level ``question``.

    Side effect: rebinds the module-level ``answer`` to the list of URLs of the
    hits, in the order returned by Qdrant.

    Returns:
        One ``"title: ..., url: ..."`` string per hit.
    """
    # Only `answer` is rebound here; `question` is merely read, so it needs no declaration.
    global answer

    response = await openai_client.embeddings.create(
        input=question,
        model="text-embedding-ada-002"
    )
    question_query = response.data[0].embedding

    hits = await qdrant_client.search(collection_name=COLLECTION_NAME, query_vector=question_query, limit=3)

    answer = [hit.payload["url"] for hit in hits]
    # Inner quotes differ from the outer ones so the f-string also parses before Python 3.12.
    return [f"title: {hit.payload['title']}, url: {hit.payload['url']}" for hit in hits]

In [15]:
# Schedule task() on the current event loop; the callback prints the formatted hits.
# NOTE(review): the Future is returned immediately, so `answer` is only filled in
# once "Task loop returned" has been printed.
def _report_task_result(future):
    print(f"Task loop returned: {future.result()}")


loop = asyncio.get_event_loop()
coroutine = asyncio.run_coroutine_threadsafe(task(), loop)
coroutine.add_done_callback(_report_task_result)

Task loop returned: ['title: Czym się różni pseudonimizacja od anonimizacji? (film, 46 minut), url: https://www.internet-czas-dzialac.pl/pseudonimizacja-a-anonimizacja/', 'title: Jak śledzą nas strony internetowe partii politycznych? [Wybory 2023], url: https://www.internet-czas-dzialac.pl/wybory-2023-jak-partie-polityczne-szanuja-twoja-prywatnosc-na-swoich-stronach-internetowych/', 'title: Społeczne oziębienie - efekt uboczny Big Data, url: https://www.socialcooling.com/']


In [16]:
# Submit the candidate URLs in the order they were stored and stop at the first
# one the API accepts (code 0).
for possible_answer in answer:
    response = requests.post(
        f"https://tasks.aidevs.pl/answer/{token}",
        json=dict(answer=possible_answer),
        timeout=30,
    )
    response_json = response.json()
    if response_json["code"] == 0:
        print(response_json)
        break
else:
    # Reached when the loop never hit `break`, which includes an empty `answer`.
    print("No candidate answer was accepted")

{'code': 0, 'msg': 'OK', 'note': 'CORRECT'}


In [17]:
# Tear the stack down again; -v additionally removes the volumes, so any data
# kept in them is discarded.
!docker-compose down -v

[1A[1B[0G[?25l[+] Running 0/0
 ⠋ Container qdrant  Stopping                                              [34m0.1s [0m
[?25h[1A[1A[0G[?25l[34m[+] Running 1/1[0m
 [32m✔[0m Container qdrant        [32mRemoved[0m                                         [34m0.1s [0m
 ⠋ Network c03l04_default  Removing                                        [34m0.0s [0m
[?25h[1A[1A[1A[0G[?25l[34m[+] Running 2/2[0m
 [32m✔[0m Container qdrant        [32mRemoved[0m                                         [34m0.1s [0m
 [32m✔[0m Network c03l04_default  [32mRemoved[0m                                         [34m0.1s [0m
[?25h