In [None]:
from config import Config
from pipeline import YargitayPipeline
import pandas as pd
import json
from processor import YargitaySemanticProcessor

def main():
    """Run the interactive console menu for the Yargıtay decision query system.

    A ``Config`` and a ``YargitayPipeline`` are created once, then a text menu
    is shown in a loop until the user selects the exit option. Every option
    delegates to the pipeline object:

    * ``1`` -> ``pipeline.full_pipeline(csv_path)`` (the path is prompted for,
      an empty answer falls back to ``config.CSV_FILE``)
    * ``2`` -> ``pipeline.interactive_search()``
    * ``3`` -> prints ``pipeline.processor.get_collection_info()`` as JSON
    * ``4`` -> ``pipeline.semantic()``
    * ``5`` -> leaves the loop

    The printed menu and the prompt list all five options so that what the
    user sees matches what each key actually does.
    """
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="try",
        EMBEDDING_DIM=512,
        BATCH_SIZE=100
    )
    pipeline = YargitayPipeline(config)

    while True:
        print("\n" + "="*60)
        print("🏛️ YARGITAY KARARLARI SORGU SİSTEMİ")
        print("="*60)
        print("1) Full pipeline çalıştır (CSV -> chunks -> embed -> qdrant)")
        print("2) İnteraktif arama")
        print("3) Koleksiyon bilgilerini göster")
        # Option 4 used to be dispatched without being listed, while the menu
        # advertised "4" as exit; list it explicitly and move exit to "5".
        print("4) Semantik işlem (pipeline.semantic)")
        print("5) Çıkış")
        choice = input("Seçiminiz (1-5): ").strip()
        if choice=="1":
            # Empty input keeps the configured default CSV path.
            csv_path = input(f"CSV yolu (enter ile default: {config.CSV_FILE}): ").strip() or config.CSV_FILE
            pipeline.full_pipeline(csv_path)
        elif choice=="2":
            pipeline.interactive_search()
        elif choice=="3":
            info = pipeline.processor.get_collection_info()
            # ensure_ascii=False keeps Turkish characters readable in the dump.
            print(json.dumps(info, indent=2, ensure_ascii=False))
        elif choice=="4":
            pipeline.semantic()
        elif choice=="5":
            print("👋 Görüşürüz")
            break
        else:
            print("❌ Geçersiz seçim")

if __name__ == "__main__":
    main()


In [29]:
from config import Config
from qdrant_client import QdrantClient, models
from FlagEmbedding import BGEM3FlagModel
from qdrant_client. models import Prefetch, FusionQuery, Fusion, NamedVector, NamedSparseVector, SparseVector
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi

In [30]:
# Notebook-wide settings, collected in one mapping and handed to Config as
# keyword arguments. Values mirror the ones used by main() in the first cell.
config_kwargs = {
    "CSV_FILE": "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
    "TOKEN_SIZE": 512,
    "QDRANT_URL": "http://localhost:6333",
    "COLLECTION_NAME": "try",
    "EMBEDDING_DIM": 512,
    "BATCH_SIZE": 100,
}
config = Config(**config_kwargs)

In [31]:
# Client for the Qdrant instance named in the config.
qdrant_client = QdrantClient(url=config.QDRANT_URL)

# BGE-M3 embedding model; model name, fp16 flag and device all come from config.
bge_model = BGEM3FlagModel(
    config.BGE_MODEL_NAME,
    use_fp16=config.USE_FP16,
    device=config.DEVICE,
)

Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 143640.55it/s]


In [32]:
# Sample query used by every search experiment below.
query = "ihtiyati tedbir tazminat nedir"

# Encode a one-element batch, requesting both the dense vector and the
# sparse lexical weights; the returned dict is displayed as the cell output.
embedding = bge_model.encode([query], return_dense=True, return_sparse=True)
embedding

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'dense_vecs': array([[-0.03183 ,  0.004234, -0.001104, ..., -0.02583 , -0.0553  ,
         -0.000913]], shape=(1, 1024), dtype=float16),
 'lexical_weights': [defaultdict(int,
              {'9633': np.float16(0.1284),
               '36164': np.float16(0.2197),
               '118': np.float16(0.1167),
               '200990': np.float16(0.304),
               '308': np.float16(0.07996),
               '169': np.float16(0.1237),
               '10593': np.float16(0.1879),
               '18': np.float16(0.10645),
               '77116': np.float16(0.1637)})],
 'colbert_vecs': None}

In [33]:
# Dense vectors for the whole batch; fall back to a single all-zero vector
# when the model output carries no "dense_vecs" entry.
dense_batch = embedding.get("dense_vecs", [[0.0] * config.EMBEDDING_DIM])

# Only one query was encoded, so take row 0 and keep the leading
# EMBEDDING_DIM components.
# NOTE(review): the recorded output shows a 1024-d vector while EMBEDDING_DIM
# is 512 — confirm the collection was indexed with the same truncation.
q_dense = dense_batch[0][: config.EMBEDDING_DIM]
q_dense

array([-3.1830e-02,  4.2343e-03, -1.1044e-03, -3.5248e-02, -3.0899e-02,
       -2.3285e-02,  6.1737e-02,  4.7607e-03, -2.0790e-03, -3.6285e-02,
       -3.6133e-02,  9.4757e-03, -8.3252e-02,  1.9897e-02,  1.8005e-02,
        9.8419e-03,  3.5156e-02,  1.0460e-02,  3.9673e-02, -4.5929e-02,
       -3.6652e-02, -3.1113e-02, -2.8667e-03,  1.9436e-03, -2.6260e-02,
        2.8503e-02,  2.9144e-02,  2.3651e-02,  3.7140e-02, -3.6163e-02,
        3.6278e-03, -6.0272e-03, -3.2401e-04,  1.2779e-03, -1.2573e-02,
        4.2023e-02, -1.9989e-02, -6.6956e-02, -2.3132e-02,  2.4261e-02,
        4.6310e-03,  8.2474e-03,  7.3738e-03, -1.7319e-02,  2.2812e-02,
       -4.0405e-02,  6.7635e-03, -2.5833e-02, -1.2917e-02, -4.8218e-02,
       -1.0612e-02, -6.2073e-02,  6.2347e-02, -4.8187e-02,  2.5940e-02,
        4.1229e-02,  7.4463e-03,  3.0121e-02, -5.2643e-02, -9.0179e-03,
       -3.8605e-02, -3.2196e-02, -1.1435e-03, -1.0231e-02,  1.8356e-02,
        4.0283e-02, -3.4928e-04,  1.5121e-02, -6.4850e-03, -1.66

In [None]:

# BM25 sanity check on a one-document toy corpus.
corpus = [
    "bugün hava çok  güzel ve güneşli "
]

# BM25Okapi works on token lists, not raw strings: a raw string would be
# iterated character by character. str.split() with no argument also
# collapses the repeated and trailing spaces in the toy document.
tokenized_corpus = [doc.split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

# The query must be tokenized the same way; passing [query] would treat the
# whole sentence as a single token that can never match.
qry = query.split()
scores = bm25.get_scores(qry)

# One score per corpus document. The toy document shares no token with the
# query, so a zero score is still the expected result here.
scores

[0.]


<rank_bm25.BM25Okapi at 0x76e3fed18a60>

In [None]:
# Toy corpus for the TF-IDF experiment.
corpus = [
    "bugün hava çok o güzel ve güneşli jslkgvj dkvdfnjfnv aaaaa"
]

# Fit the vectorizer on the corpus and get one sparse row per document.
vectorizer = TfidfVectorizer()
X_sparse = vectorizer.fit_transform(corpus)

# Turn every row into an {"indices", "values"} mapping: the COO column ids
# are the vocabulary indices and the COO data are the TF-IDF weights.
sparse_vectors = [
    {"indices": coo.col.tolist(), "values": coo.data.tolist()}
    for coo in (doc_row.tocoo() for doc_row in X_sparse)
]

sparse_vectors

In [None]:
# Project the query into the vocabulary the vectorizer was already fitted on.
# Calling fit_transform here would refit on the query alone, replacing the
# vocabulary and yielding indices unrelated to the corpus vectors.
# NOTE(review): the sparse vectors stored in the collection must come from the
# same vocabulary for these indices to be meaningful — confirm how "sparse_vec"
# was indexed (e.g. TF-IDF vs. the BGE-M3 lexical weights returned by encode).
x_sparse = vectorizer.transform([query])  # query text only
row = x_sparse[0].tocoo()

# Named sparse query vector: COO columns are vocabulary indices, COO data the
# TF-IDF weights. Both lists are empty when no query term is in the vocabulary.
sparse_vector = NamedSparseVector(
    name="sparse_vec",
    vector=SparseVector(
        indices=row.col.tolist(),
        values=row.data.tolist()
    )
)

In [None]:
# Dense query vector addressed to the collection's "dense_vec" vector name.
# q_dense is a NumPy float16 array (see the encode output above); the client
# model expects a plain list of Python floats, so convert element by element.
query_dense = NamedVector(name="dense_vec", vector=list(map(float, q_dense)))

In [None]:
# Hybrid search: one sparse and one dense candidate list, fused with
# Reciprocal Rank Fusion.
# In the Query API the target vector is selected by `using`, and `query` must
# be the raw vector (a SparseVector or a list of floats) — not the
# NamedSparseVector / NamedVector wrappers, so their inner `.vector` is passed.
search_result = qdrant_client.query_points(
    collection_name=config.COLLECTION_NAME,
    prefetch=[
        models.Prefetch(
            query=sparse_vector.vector,
            using="sparse_vec",
            limit=5,
        ),
        models.Prefetch(
            query=query_dense.vector,
            using="dense_vec",
            limit=5,
        ),
    ],
    query=FusionQuery(fusion=Fusion.RRF),
)

In [None]:
# Same hybrid search, now with an explicit result limit, a score threshold and
# payloads. The prefetch list is built here because no earlier cell defines a
# `prefetch` variable, which made this cell fail on a fresh kernel.
prefetch = [
    # Raw vectors are passed; `using` selects the named vector in the collection.
    models.Prefetch(query=sparse_vector.vector, using="sparse_vec", limit=5),
    models.Prefetch(query=query_dense.vector, using="dense_vec", limit=5),
]

# NOTE(review): with RRF the threshold applies to the fused, rank-based score,
# not to a cosine similarity — confirm that 0.6 is the intended cut-off.
search_result = qdrant_client.query_points(
    collection_name=config.COLLECTION_NAME,
    prefetch=prefetch,
    query=FusionQuery(fusion=Fusion.RRF),  # RRF fusion
    limit=5,
    score_threshold=0.6,
    with_payload=True,
)