In [1]:
# download ganjoor db
import os
import zipfile
import requests

# Fetch the Ganjoor SQLite database once; later runs reuse the local file.
# `isfile` (not `exists`) so that a stray directory with the same name is not
# mistaken for an already-downloaded database.
if not os.path.isfile('ganjoor.s3db'):
    url = 'https://github.com/ganjoor/desktop/releases/download/v2.81/ganjoor.s3db.zip'
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an error page as the archive.
    response.raise_for_status()
    with open('ganjoor.zip', 'wb') as f:
        f.write(response.content)
    with zipfile.ZipFile('ganjoor.zip', 'r') as zip_ref:
        # extractall('ganjoor.s3db') would create a *directory* with that name,
        # which sqlite3.connect('ganjoor.s3db') cannot open. Instead, locate
        # the database member and write it out as the file 'ganjoor.s3db'.
        db_members = [name for name in zip_ref.namelist() if name.endswith('.s3db')]
        if not db_members:
            raise FileNotFoundError('no .s3db file found inside ganjoor.zip')
        # Write to a temporary name and rename at the end so an interrupted
        # extraction never leaves a truncated 'ganjoor.s3db' behind.
        with zip_ref.open(db_members[0]) as src, open('ganjoor.s3db.part', 'wb') as dst:
            for chunk in iter(lambda: src.read(1 << 20), b''):
                dst.write(chunk)
    os.replace('ganjoor.s3db.part', 'ganjoor.s3db')

In [1]:
# load poems
import sqlite3

# One row per poem: all verse texts concatenated, plus the category text and
# the poet name.
# NOTE(review): group_concat is used without an explicit ordering, so the
# order of verses inside each concatenated string is whatever order SQLite
# happens to scan them in — confirm whether `verse` has an ordering column
# that should be applied.
query = """SELECT  group_concat(verse.text, '           ') AS verses_concatenated, cat.text, poet.name
FROM verse
JOIN poem ON verse.poem_id = poem.id
JOIN cat ON poem.cat_id = cat.id
JOIN poet ON cat.poet_id = poet.id
GROUP BY poem.id;
"""

connection = sqlite3.connect('ganjoor.s3db')
try:
    cursor = connection.cursor()
    cursor.execute(query)
    results = cursor.fetchall()
finally:
    # Release the database handle even when the query fails.
    connection.close()

print(len(results))



63785


In [4]:
# embed poems
from sentence_transformers import SentenceTransformer

# Number of poems to embed. Kept small for quick experiments; set to None to
# embed every row in `results`.
MAX_POEMS_TO_EMBED = 10

# Slice *before* joining so only the selected rows are turned into strings,
# rather than building a string for every poem and discarding most of them.
# Each text is the row's fields (verses, category, poet) joined by four spaces.
texts_to_embed = ["    ".join(result) for result in results[:MAX_POEMS_TO_EMBED]]

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")  # device="cuda"

# One embedding vector per entry of `texts_to_embed`, in the same order.
embeddings = model.encode(texts_to_embed)


In [26]:
# Connect to Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance

client = QdrantClient(host="localhost", port=6333)

# Vector layout for the 'ganjoor' collection: 768-dimensional vectors compared
# by cosine distance.
# NOTE(review): 768 is presumably the output width of the embedding model used
# in this notebook — confirm if the model is changed.
ganjoor_vectors = VectorParams(size=768, distance=Distance.COSINE)

# Create the collection only on the first run so re-running the cell is safe.
if not client.collection_exists('ganjoor'):
    client.create_collection(collection_name='ganjoor', vectors_config=ganjoor_vectors)

In [27]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct

# How many points to send to Qdrant per request. Batching avoids one network
# round-trip per poem, which dominates the runtime once the corpus is large.
UPSERT_BATCH_SIZE = 256

for start in range(0, len(texts_to_embed), UPSERT_BATCH_SIZE):
    stop = start + UPSERT_BATCH_SIZE
    # Point ids are the positions in `texts_to_embed`, so re-running the cell
    # overwrites the same points instead of creating duplicates.
    points = [
        PointStruct(
            id=i,
            vector=embedding.astype('float32').tolist(),
            payload={
                "sentence": sentence
            },
        )
        for i, (sentence, embedding) in enumerate(
            zip(texts_to_embed[start:stop], embeddings[start:stop]), start=start
        )
    ]
    # Skip empty batches (e.g. if there are fewer embeddings than texts).
    if points:
        # Upload this batch of points to the collection
        client.upsert(collection_name="ganjoor", points=points)



In [35]:
# query = "حافظ رفت میکده یه نفر بهش گفت که برو خودت رو بشور بعد بیا اینجا که آلوده نشه"
# query = "مست رو سرزنش نکن"


def search(query, limit=1):
    """Return the stored poem text(s) most similar to ``query``.

    The query is embedded with the notebook-level ``model`` and looked up in
    the "ganjoor" collection through the notebook-level ``client``.

    Args:
        query: Free-text query string.
        limit: Maximum number of hits to return.

    Returns:
        The ``sentence`` payloads of the hits, best match first, separated by
        a blank line. With the default ``limit=1`` this is just the single
        best-matching text. Returns an empty string when nothing matches
        (for example when the collection is empty).
    """
    query_vector = model.encode(query)

    # Search in the Qdrant collection
    response = client.search(
        collection_name="ganjoor",
        query_vector=query_vector,
        limit=limit
    )

    # Collect every returned hit rather than indexing response[0], which
    # raised IndexError on an empty result and ignored `limit`.
    sentences = [
        hit.payload['sentence']
        for hit in response
        if hit.payload and 'sentence' in hit.payload
    ]
    return "\n\n".join(sentences)


In [36]:
import gradio as gr


# Minimal Gradio front end for `search`: one text input, one text output.
demo = gr.Interface(fn=search, inputs=["text"], outputs=["text"])

# Start the local web server that serves the interface.
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


