### full kod

In [1]:
# main.py
# SemChunk + BGE-M3 + Qdrant Entegrasyon (Dense + Sparse, 512 dim slice, L2 normalize, hibrit search)

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, HnswConfigDiff
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict
import os
from qdrant_client import models
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch
from typing import List, Dict
from qdrant_client.models import NamedVector, NamedSparseVector, SparseVectorParams, SparseVector
from sklearn.feature_extraction.text import TfidfVectorizer
from qdrant_client.http.models import NamedVector, NamedSparseVector, SparseVector, SearchRequest


print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

def l2_normalize_tensor(t: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    if t.dim() == 1:
        norm = torch.norm(t).clamp(min=eps)
        return t / norm
    norm = torch.norm(t, dim=1, keepdim=True).clamp(min=eps)
    return t / norm

@dataclass
class Config:
    BGE_MODEL_NAME: str = "BAAI/bge-m3"
    USE_FP16: bool = True
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    TOKEN_SIZE: int = 512
    ENCODING_NAME: str = "cl100k_base"
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "yargitay_bge_m3_chunks"
    EMBEDDING_DIM: int = 512
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    BATCH_SIZE: int = 100
    DB_BATCH: int = 256
class YargitaySemanticProcessor:
    def __init__(self, config: Config):
        self.config = config

        # Encoding & chunker
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # Model
        print(f"🔮 BGE-M3 yükleniyor: {config.BGE_MODEL_NAME} (device={config.DEVICE})")
        self.bge_model = BGEM3FlagModel(config.BGE_MODEL_NAME, use_fp16=config.USE_FP16, device=config.DEVICE)

        # Qdrant
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"✅ Hazır - Cihaz: {device_name}")

    # Test connection & print dense+sparse
    def test_bge_connection(self):
        try:
            test_text = ["Yargıtay 6. Hukuk Dairesi'nin ihtiyati tedbir kararı"]
            emb_res = self.bge_model.encode(test_text)
            dense = emb_res['dense_vecs'][0] if isinstance(emb_res, dict) and 'dense_vecs' in emb_res else emb_res[0]
            sparse_available = 'colbert_vecs' in emb_res
            print(f"✅ Dense embedding boyutu: {len(dense)}")
            print(f"🔍 Sparse embedding mevcut: {sparse_available}")
            return len(dense)
        except Exception as e:
            print(f"❌ BGE-M3 bağlantı hatası: {e}")
            return None

    def create_qdrant_collection(self, recreate: bool = False):
        collection_name = self.config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except Exception:
                pass

        try:
            existing = [c.name for c in self.qdrant_client.get_collections().collections]
            if collection_name not in existing:
                # Dense + Sparse (sparse için yine 512 dim)
                vectors_config = {
                    "dense_vec": models.VectorParams(size=self.config.EMBEDDING_DIM, distance=models.Distance.COSINE),
                }
                sparse_config = {
                    "sparse_vec": models.SparseVectorParams(
                        index=models.SparseIndexParams(on_disk=False))
                }
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=vectors_config,
                    sparse_vectors_config = sparse_config
                )
                print(f"✅ Koleksiyon oluşturuldu: {collection_name} (Dense+Sparse)")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {collection_name}")
        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        if not text or not text.strip():
            return []
        try:
            chunks = self.chunker(text)
            result = []
            for i, c in enumerate(chunks):
                if c.strip():
                    cd = {
                        'chunk_id': i,
                        'text': c.strip(),
                        'token_count': len(self.encoding.encode(c)),
                        'char_count': len(c)
                    }
                    if metadata:
                        cd.update(metadata)
                    result.append(cd)
            return result
        except Exception as e:
            print(f"❌ Chunking hatası: {e}")
            return []

    def create_embeddings_bge(self, texts: List[str], batch_size: int = None):
        batch_size = batch_size or self.config.BATCH_SIZE
        all_embeddings_dense, all_embeddings_sparse = [], []
        total = len(texts)
        print(f"🔮 {total} metin işleniyor (batch_size={batch_size})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                # Model dense embedding üret
                emb_res = self.bge_model.encode(
                    batch_texts,
                    return_dense=True,
                    return_sparse=True
                )
                dense = emb_res.get("dense_vecs", [[0.0]*self.config.EMBEDDING_DIM for _ in batch_texts])

                # Dense içinde None veya kısa vektör varsa düzelt
                dense_clean = []
                for vec in dense:
                    if vec is None:
                        dense_clean.append([0.0]*self.config.EMBEDDING_DIM)
                    elif len(vec) < self.config.EMBEDDING_DIM:
                        dense_clean.append(vec + [0.0]*(self.config.EMBEDDING_DIM - len(vec)))
                    else:
                        dense_clean.append(vec[:self.config.EMBEDDING_DIM])

                # TF-IDF ile sparse embedding üret
                from sklearn.feature_extraction.text import TfidfVectorizer
                vectorizer = TfidfVectorizer(max_features=5000)
                X_sparse = vectorizer.fit_transform(batch_texts)
                sparse_vectors = []
                for row in X_sparse:
                    row_coo = row.tocoo()
                    sparse_vectors.append({"indices": row_coo.col.tolist(), "values": row_coo.data.tolist()})

                # Listeye ekle
                all_embeddings_dense.extend(dense_clean)
                all_embeddings_sparse.extend(sparse_vectors)

                print(f"  📊 Batch işlendi: {i + len(batch_texts)}/{total}")

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"❌ Embedding hatası (batch {i//batch_size+1}): {e}")
                all_embeddings_dense.extend([[0.0]*self.config.EMBEDDING_DIM for _ in batch_texts])
                all_embeddings_sparse.extend([{"indices": [], "values": []} for _ in batch_texts])

        return all_embeddings_dense, all_embeddings_sparse



    def process_csv_file(self, csv_path: str) -> List[Dict]:
        print(f"📄 CSV okunuyor: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"📊 {len(df)} satır yüklendi")
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return []

        text_column = next((c for c in ['rawText', 'chunk_text', 'text', 'content', 'metin'] if c in df.columns), None)
        if not text_column:
            print("❌ Ana metin sütunu bulunamadı")
            return []

        all_chunks = []
        for idx, row in df.iterrows():
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            meta = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', ''),
                'document_id': row.get('_id', ''),
            }
            chunks = self.semantic_chunk_text(str(text), meta)
            all_chunks.extend(chunks)
            if (idx+1)%5==0:
                print(f"  ✅ İşlenen satır: {idx+1}/{len(df)} (Toplam chunk: {len(all_chunks)})")

        print(f"🧩 Toplam {len(all_chunks)} chunk oluşturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return

        print(f"🚀 {len(chunks)} chunk Qdrant'a yükleniyor...")
        texts = [c['text'] for c in chunks]
        embeddings_dense, embeddings_sparse = self.create_embeddings_bge(texts)

        points = []
        
        for c, d, s in zip(chunks, embeddings_dense, embeddings_sparse):
            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector={
                    "dense_vec": d,
                    "sparse_vec": SparseVector(
                        indices=s["indices"],
                        values=s["values"]
                    )
                },
                payload=c,
            ))


        batch = self.config.DB_BATCH
        for i in range(0, len(points), batch):
            try:
                self.qdrant_client.upsert(collection_name=self.config.COLLECTION_NAME, points=points[i:i+batch])
                print(f"  ✅ Batch yüklendi: {min(i+batch,len(points))}/{len(points)}")
            except Exception as e:
                print(f"❌ Batch yükleme hatası: {e}")

        print("🎉 Yükleme tamamlandı!")

    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """
        Dense-only semantic search
        """
        try:
            emb_res = self.bge_model.encode([query])
            q_dense = emb_res['dense_vecs'] if isinstance(emb_res, dict) and 'dense_vecs' in emb_res else emb_res

            # Tensor -> first 512 dims -> list
            q_t = torch.tensor(q_dense, dtype=torch.float32, device=self.config.DEVICE)
            q_sliced = q_t[0, :self.config.EMBEDDING_DIM]
            query_v = NamedVector(
                name="dense_vec",
                vector=q_sliced.cpu().tolist()
            )

            qr = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_v,
                limit=limit,
                with_payload=True,
                #vector_name="dense_vec",
                score_threshold=score_threshold
            )

            results = [{"score": p.score, "payload": p.payload} for p in qr]
            print(f"📊 {len(results)} sonuç bulundu (Dense only)")
            return results

        except Exception as e:
            print(f"❌ Semantic search hatası: {e}")
            return []

   # from qdrant_client.models import NamedVector, NamedSparseVector, SparseVector, SearchRequest

    def search_hybrid(self, query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """
        Dense + Sparse (TF-IDF) hybrid search
        """
        try:
            # --- Dense tarafı (BGE embeddings) ---
            emb_res = self.bge_model.encode(
                [query],
                return_dense=True,
                return_sparse=True
            )

            # Dense vektör
            q_dense = emb_res.get("dense_vecs", [[0.0]*self.config.EMBEDDING_DIM])[0]
            q_dense = q_dense[:self.config.EMBEDDING_DIM]  # boyut kırpma
            query_dense = NamedVector(
                name="dense_vec",
                vector=q_dense
            )

            # Sparse vektör (senin TF-IDF çıktın)
            query_sparse = None
            sparse_raw = emb_res.get("sparse_vecs", [None])[0]
            if sparse_raw and "indices" in sparse_raw and "values" in sparse_raw:
                query_sparse = NamedSparseVector(
                    name="sparse_vec",
                    vector=SparseVector(
                        indices=sparse_raw["indices"],
                        values=sparse_raw["values"]
                    )
                )

            # --- Search requests ---
            requests = [SearchRequest(vector=query_dense, limit=limit, with_payload=True, score_threshold=score_threshold)]
            if query_sparse:
                requests.append(SearchRequest(vector=query_sparse, limit=limit, score_threshold=score_threshold))

            qr = self.qdrant_client.search_batch(
                collection_name=self.config.COLLECTION_NAME,
                requests=requests,
            )

            # --- Sonuçları topla ---
            results = []
            for request_result in qr:  # her request_result: List[ScoredPoint]
                for scored_point in request_result:
                    results.append({
                        "score": scored_point.score,
                        "payload": scored_point.payload
                    })

            print(f"📊 {len(results)} sonuç bulundu (Dense + TF-IDF Sparse)")
            return results

        except Exception as e:
            print(f"❌ Hybrid search hatası: {e}")
            return []

    def advanced_search_with_filters(self, query: str, filters: Dict = None, limit: int = 10, score_threshold: float = None):
        try:
            
            emb_res = self.bge_model.encode([query])
            dense_q = emb_res.get('dense_vecs', emb_res)
            sparse_q = emb_res.get('colbert_vecs', [np.zeros(self.config.EMBEDDING_DIM)])

            dense_t = torch.tensor(dense_q, dtype=torch.float32, device=self.config.DEVICE)
            sparse_t = torch.tensor(sparse_q, dtype=torch.float32, device=self.config.DEVICE)

            with torch.no_grad():
                dense_norm = l2_normalize_tensor(dense_t[:, :self.config.EMBEDDING_DIM])
                sparse_norm = l2_normalize_tensor(sparse_t[:, :self.config.EMBEDDING_DIM])

            query_vector = dense_norm[0].cpu().tolist()
            query_sparse = sparse_norm[0].cpu().tolist()

            query_filter = None
            if filters:
                from qdrant_client.models import Filter, FieldCondition, MatchValue
                conditions = [FieldCondition(key=k, match=MatchValue(value=v)) for k,v in filters.items()]
                query_filter = Filter(must=conditions)

            qr = self.qdrant_client.search(  #hibrit armada search_batch olucak bura
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vector,
                query_filter=query_filter,
                limit=limit,
                with_payload=True,
                params={"hnsw_ef": 128},
                vector_name="dense_vec",
                query_vector_sparse=query_sparse,
                vector_name_sparse="sparse_vec",
                score_threshold=score_threshold
            )

            results = [{'score': p.score, 'payload': p.payload} for p in qr]
            print(f"📊 {len(results)} filtreli sonuç bulundu")
            return results
        except Exception as e:
            print(f"❌ Filtreli arama hatası: {e}")
            return []

    def get_collection_info(self):
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                "vectors_count": info.vectors_count,
                "status": info.status,
                "embedding_model": "BGE-M3",
                "embedding_dim": self.config.EMBEDDING_DIM
            }
        except Exception as e:
            return {"error": str(e)}

class YargitayPipeline:
    def __init__(self, config: Config):
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        csv_path = csv_path or self.config.CSV_FILE
        print("🚀 Full pipeline başlıyor")
        emb_dim = self.processor.test_bge_connection()
        if not emb_dim:
            return False
        self.processor.create_qdrant_collection(recreate=True)
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("❌ Chunk bulunamadı")
            return False
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        print("\n📊 Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    def interactive_search(self):
        print("\n🔎 İnteraktif arama başlatıldı")
        while True:
            print("\n1) Basit arama\n2) Filtreli arama\n3) Ana menü")
            ch = input("Seçiminiz (1-3): ").strip()
            if ch=="3":
                break
            if ch not in {"1","2"}:
                print("❌ Geçersiz seçim")
                continue
            q = input("🔍 Arama metni (çıkmak için 'q'): ").strip()
            if q.lower() in {'q','quit','exit'}:
                break
            if not q:
                continue
            try:
                limit = int(input("Kaç sonuç? (default 5): ") or 5)
            except:
                limit = 5

            if ch=="1":
                results = self.processor.search_semantic(q, limit=limit)
            else:
                daire = input("Daire filtresi (örn: '6.HukukDairesi', boş = none): ").strip()
                filters = {'daire': daire} if daire else None
                results = self.processor.search_hybrid(q,  limit=limit)
                print(f"SONUÇLAR:{results}")
            if not results:
                print("❌ Sonuç bulunamadı")
                continue

            print(f"\n📋 {len(results)} sonuç:")
            for i, r in enumerate(results, 1):
                p = r.get("payload") or {}   # None ise boş dict döner
                score = r.get("score", 0.0)
                print(f"\n{i}. Skor: {score:.4f}")
                print(f"   Daire: {p.get('daire','N/A')} | Tarih: {p.get('tarih','N/A')}")
                text_preview = (p.get('text','')[:300] + '...') if len(p.get('text','')) > 300 else p.get('text','')
                print(f"   Metin: {text_preview}")
                print("-"*60)


def main():
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="bge_hybrid_chunks",
        EMBEDDING_DIM=512,
        BATCH_SIZE=100
    )

    pipeline = YargitayPipeline(config)

    while True:
        print("\n" + "="*60)
        print("🏛️ YARGITAY BGE-M3 SEMANTİK SİSTEM (Dense+Sparse)")
        print("="*60)
        print("1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)")
        print("2) İnteraktif arama")
        print("3) Koleksiyon bilgilerini göster")
        print("4) Çıkış")
        choice = input("Seçiminiz (1-4): ").strip()
        if choice=="1":
            csv_path = input(f"CSV yolu (enter ile default: {config.CSV_FILE}): ").strip() or config.CSV_FILE
            ok = pipeline.full_pipeline(csv_path)
            print("✅ Tamamlandı" if ok else "❌ Hata çıktı")
        elif choice=="2":
            pipeline.interactive_search()
        elif choice=="3":
            info = pipeline.processor.get_collection_info()
            print(json.dumps(info, indent=2, ensure_ascii=False))
        elif choice=="4":
            print("👋 Görüşürüz")
            break
        else:
            print("❌ Geçersiz seçim")

if __name__=="__main__":
    
    main()

# -

  from .autonotebook import tqdm as notebook_tqdm


True
🔮 BGE-M3 yükleniyor: BAAI/bge-m3 (device=cuda)


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 277156.65it/s]


✅ Hazır - Cihaz: NVIDIA RTX A6000

🏛️ YARGITAY BGE-M3 SEMANTİK SİSTEM (Dense+Sparse)
1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)
2) İnteraktif arama
3) Koleksiyon bilgilerini göster
4) Çıkış
🚀 Full pipeline başlıyor


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Dense embedding boyutu: 1024
🔍 Sparse embedding mevcut: True
🗑️ Eski koleksiyon silindi: bge_hybrid_chunks
✅ Koleksiyon oluşturuldu: bge_hybrid_chunks (Dense+Sparse)
📄 CSV okunuyor: /home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv
📊 10 satır yüklendi
  ✅ İşlenen satır: 5/10 (Toplam chunk: 44)
  ✅ İşlenen satır: 10/10 (Toplam chunk: 59)
🧩 Toplam 59 chunk oluşturuldu
🚀 59 chunk Qdrant'a yükleniyor...
🔮 59 metin işleniyor (batch_size=100)...
  📊 Batch işlendi: 59/59
  ✅ Batch yüklendi: 59/59
🎉 Yükleme tamamlandı!

📊 Koleksiyon Bilgileri:
{
  "collection_name": "bge_hybrid_chunks",
  "points_count": 59,
  "vectors_count": null,
  "status": "green",
  "embedding_model": "BGE-M3",
  "embedding_dim": 512
}
✅ Tamamlandı

🏛️ YARGITAY BGE-M3 SEMANTİK SİSTEM (Dense+Sparse)
1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)
2) İnteraktif arama
3) Koleksiyon bilgilerini göster
4) Çıkış

🔎 İnteraktif arama başlatıldı

1) Basit arama
2) Filtreli arama
3) Ana menü
📊 5 sonuç bulundu (Den

  qr = self.qdrant_client.search(


📊 5 sonuç bulundu (Dense + TF-IDF Sparse)
SONUÇLAR:[{'score': 0.673231, 'payload': {'chunk_id': 1, 'text': 'maddesi, ihtiyati tedbir kararının haksız olduğunun belirlenmesi halinde tedbir kararı yüzünden uğranılan zararın tazminini düzenlediğini, ihtiyati tedbir kararını icra ettiren tarafın yasal sürede dava açmaması halinde ihtiyati tedbirin haksız konulduğunun kabulü gerektiği, kaldı ki süresinde dava açsa da durumun değişmeyeceğini belirterek müvekkillerinin inşaatının geç bitirilmesinden kaynaklı 10.000,00 TL maddi tazminatın tahsiline karar verilmesini talep etmiştir. 2.Davacı vekili duruşmadaki beyanında; tedbir sebebiyle bağımsız bölümlerinin geç teslim edileceğini, bundan kaynaklı doğacak zararları talep ettiklerini beyan etmiştir. II. CEVAP Davalı vekili cevap dilekçesinde özetle; ekonomik nedenlerle 10 gün içerisinde dava açamadıklarını, 19 gün sonra açtıkları davanın ... 4. Asliye Hukuk Mahkemesinin 2010/27 Esas sayılı sırasında kayıtlı olduğunu belirterek davanın reddini t

  qr = self.qdrant_client.search_batch(



🏛️ YARGITAY BGE-M3 SEMANTİK SİSTEM (Dense+Sparse)
1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)
2) İnteraktif arama
3) Koleksiyon bilgilerini göster
4) Çıkış
👋 Görüşürüz


### ayrı

In [None]:
# main.py
# SemChunk + BGE-M3 + Qdrant Entegrasyon (Dense + Sparse, 512 dim slice, L2 normalize, hibrit search)

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, HnswConfigDiff
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch
from qdrant_client.models import NamedVector, NamedSparseVector, SparseVectorParams, SparseVector
print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# -

In [None]:

def l2_normalize_tensor(t: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    if t.dim() == 1:
        norm = torch.norm(t).clamp(min=eps)
        return t / norm
    norm = torch.norm(t, dim=1, keepdim=True).clamp(min=eps)
    return t / norm

@dataclass
class Config:
    BGE_MODEL_NAME: str = "BAAI/bge-m3"
    USE_FP16: bool = True
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    TOKEN_SIZE: int = 512
    ENCODING_NAME: str = "cl100k_base"
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "yargitay_bge_m3_chunks"
    EMBEDDING_DIM: int = 512
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    BATCH_SIZE: int = 100
    DB_BATCH: int = 256


In [None]:
class YargitaySemanticProcessor:
    def __init__(self, config: Config):
        self.config = config

        # Encoding & chunker
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # Model
        print(f"🔮 BGE-M3 yükleniyor: {config.BGE_MODEL_NAME} (device={config.DEVICE})")
        self.bge_model = BGEM3FlagModel(config.BGE_MODEL_NAME, use_fp16=config.USE_FP16, device=config.DEVICE)

        # Qdrant
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"✅ Hazır - Cihaz: {device_name}")

    # Test connection & print dense+sparse
    def test_bge_connection(self):
        try:
            test_text = ["Yargıtay 6. Hukuk Dairesi'nin ihtiyati tedbir kararı"]
            emb_res = self.bge_model.encode(test_text)
            dense = emb_res['dense_vecs'][0] if isinstance(emb_res, dict) and 'dense_vecs' in emb_res else emb_res[0]
            sparse_available = 'colbert_vecs' in emb_res
            print(f"✅ Dense embedding boyutu: {len(dense)}")
            print(f"🔍 Sparse embedding mevcut: {sparse_available}")
            return len(dense)
        except Exception as e:
            print(f"❌ BGE-M3 bağlantı hatası: {e}")
            return None

    def create_qdrant_collection(self, recreate: bool = False):
        collection_name = self.config.COLLECTION_NAME

        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except Exception:
                pass

        existing = [c.name for c in self.qdrant_client.get_collections().collections]
        if collection_name not in existing:
            try:
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config={
                        "dense_vec": VectorParams(
                            size=self.config.EMBEDDING_DIM,
                            distance=Distance.COSINE
                        )
                    },
                    sparse_vectors_config={
                        "sparse_vec": SparseVectorParams(
                            index={"on_disk": False}  # Hibrid search için gerekli
                        )
                    }
                )
                print(f"✅ Koleksiyon oluşturuldu: {collection_name} (Dense+Sparse)")
            except Exception as e:
                print(f"❌ Koleksiyon oluşturma hatası: {e}")
                raise
        else:
            print(f"ℹ️ Koleksiyon zaten var: {collection_name}")


    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        if not text or not text.strip():
            return []
        try:
            chunks = self.chunker(text)
            result = []
            for i, c in enumerate(chunks):
                if c.strip():
                    cd = {
                        'chunk_id': i,
                        'text': c.strip(),
                        'token_count': len(self.encoding.encode(c)),
                        'char_count': len(c)
                    }
                    if metadata:
                        cd.update(metadata)
                    result.append(cd)
            return result
        except Exception as e:
            print(f"❌ Chunking hatası: {e}")
            return []

    def create_embeddings_bge(self, texts: List[str], batch_size: int = None):
        batch_size = batch_size or self.config.BATCH_SIZE
        all_embeddings_dense, all_embeddings_sparse = [], []
        total = len(texts)
        print(f"🔮 {total} metin işleniyor (batch_size={batch_size})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                emb_res = self.bge_model.encode(
                    batch_texts,
                    return_dense=True,
                    return_sparse=True
                )

                dense = emb_res["dense_vecs"]
                sparse = emb_res["sparse_vecs"]   # burada dict listesi geliyor: [{"indices": [...], "values": [...]}, ...]

                # Dense için normalize
                dense_t = torch.tensor(dense, dtype=torch.float32, device=self.config.DEVICE)
                with torch.no_grad():
                    dense_slice = dense_t[:, :self.config.EMBEDDING_DIM]
                    dense_norm = l2_normalize_tensor(dense_slice)

                all_embeddings_dense.extend([v.cpu().tolist() for v in dense_norm])
                all_embeddings_sparse.extend(sparse)

                print(f"  📊 Batch işlendi: {i + len(batch_texts)}/{total}")

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"❌ Embedding hatası (batch {i//batch_size+1}): {e}")
                all_embeddings_dense.extend([[0.0]*self.config.EMBEDDING_DIM for _ in batch_texts])
                all_embeddings_sparse.extend([{"indices": [], "values": []} for _ in batch_texts])

        return all_embeddings_dense, all_embeddings_sparse


    def process_csv_file(self, csv_path: str) -> List[Dict]:
        print(f"📄 CSV okunuyor: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"📊 {len(df)} satır yüklendi")
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return []

        text_column = next((c for c in ['rawText', 'chunk_text', 'text', 'content', 'metin'] if c in df.columns), None)
        if not text_column:
            print("❌ Ana metin sütunu bulunamadı")
            return []

        all_chunks = []
        for idx, row in df.iterrows():
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            meta = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', ''),
                'document_id': row.get('_id', ''),
            }
            chunks = self.semantic_chunk_text(str(text), meta)
            all_chunks.extend(chunks)
            if (idx+1)%5==0:
                print(f"  ✅ İşlenen satır: {idx+1}/{len(df)} (Toplam chunk: {len(all_chunks)})")

        print(f"🧩 Toplam {len(all_chunks)} chunk oluşturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return

        print(f"🚀 {len(chunks)} chunk Qdrant'a yükleniyor...")
        texts = [c['text'] for c in chunks]
        embeddings_dense, embeddings_sparse = self.create_embeddings_bge(texts)

        points = []
        
        for c, d, s in zip(chunks, embeddings_dense, embeddings_sparse):
            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector={
                    "dense_vec": d,
                    "sparse_vec": SparseVector(
                        indices=s["indices"],
                        values=s["values"]
                    )
                },
                payload=c,
            ))


        batch = self.config.DB_BATCH
        for i in range(0, len(points), batch):
            try:
                self.qdrant_client.upsert(collection_name=self.config.COLLECTION_NAME, points=points[i:i+batch])
                print(f"  ✅ Batch yüklendi: {min(i+batch,len(points))}/{len(points)}")
            except Exception as e:
                print(f"❌ Batch yükleme hatası: {e}")

        print("🎉 Yükleme tamamlandı!")

    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = None):
        try:
            emb_res = self.bge_model.encode([query])

            # Dense
            dense_q = emb_res.get('dense_vecs', emb_res)
            dense_t = torch.tensor(dense_q, dtype=torch.float32, device=self.config.DEVICE)
            dense_norm = l2_normalize_tensor(dense_t[:, :self.config.EMBEDDING_DIM])
            query_vector_dense = NamedVector(
                name="dense_vec",
                vector=dense_norm[0].cpu().tolist()
            )

            # Sparse
            colbert_vec = emb_res.get("colbert_vecs", [None])[0]
            if colbert_vec is None:
                query_vector_sparse = None
            else:
                indices = list(colbert_vec.keys())
                values = list(colbert_vec.values())
                query_vector_sparse = NamedSparseVector(
                    name="sparse_vec",
                    vector=SparseVector(indices=indices, values=values)
                )

            # Tek sorguda hem dense hem sparse
            query_vectors = [query_vector_dense]
            if query_vector_sparse:
                query_vectors.append(query_vector_sparse)

            qr = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vectors,
                query_filter=None,
                limit=limit,
                with_payload=True,
                with_vectors=False,
                score_threshold=score_threshold
            )

            results = [{'score': p.score, 'payload': p.payload} for p in qr]
            print(f"📊 {len(results)} sonuç bulundu")
            return results
        except Exception as e:
            print(f"❌ Arama hatası: {e}")
            return []


    def advanced_search_with_filters(self, query: str, filters: Dict = None, limit: int = 10, score_threshold: float = None):
        try:
            emb_res = self.bge_model.encode([query])
            dense_q = emb_res.get('dense_vecs', emb_res)
            sparse_q = emb_res.get('colbert_vecs', [np.zeros(self.config.EMBEDDING_DIM)])

            dense_t = torch.tensor(dense_q, dtype=torch.float32, device=self.config.DEVICE)
            sparse_t = torch.tensor(sparse_q, dtype=torch.float32, device=self.config.DEVICE)

            with torch.no_grad():
                dense_norm = l2_normalize_tensor(dense_t[:, :self.config.EMBEDDING_DIM])
                sparse_norm = l2_normalize_tensor(sparse_t[:, :self.config.EMBEDDING_DIM])

            query_vector = dense_norm[0].cpu().tolist()
            query_sparse = sparse_norm[0].cpu().tolist()

            query_filter = None
            if filters:
                from qdrant_client.models import Filter, FieldCondition, MatchValue
                conditions = [FieldCondition(key=k, match=MatchValue(value=v)) for k,v in filters.items()]
                query_filter = Filter(must=conditions)

            qr = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vector,
                query_filter=query_filter,
                limit=limit,
                with_payload=True,
                params={"hnsw_ef": 128},
                vector_name="dense_vec",
                query_vector_sparse=query_sparse,
                vector_name_sparse="sparse_vec",
                score_threshold=score_threshold
            )

            results = [{'score': p.score, 'payload': p.payload} for p in qr]
            print(f"📊 {len(results)} filtreli sonuç bulundu")
            return results
        except Exception as e:
            print(f"❌ Filtreli arama hatası: {e}")
            return []

    def get_collection_info(self):
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                "vectors_count": info.vectors_count,
                "status": info.status,
                "embedding_model": "BGE-M3",
                "embedding_dim": self.config.EMBEDDING_DIM
            }
        except Exception as e:
            return {"error": str(e)}



In [None]:

class YargitayPipeline:
    def __init__(self, config: Config):
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        csv_path = csv_path or self.config.CSV_FILE
        print("🚀 Full pipeline başlıyor")
        emb_dim = self.processor.test_bge_connection()
        if not emb_dim:
            return False
        self.processor.create_qdrant_collection(recreate=True)
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("❌ Chunk bulunamadı")
            return False
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        print("\n📊 Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    def interactive_search(self):
        print("\n🔎 İnteraktif arama başlatıldı")
        while True:
            print("\n1) Basit arama\n2) Filtreli arama\n3) Ana menü")
            ch = input("Seçiminiz (1-3): ").strip()
            if ch=="3":
                break
            if ch not in {"1","2"}:
                print("❌ Geçersiz seçim")
                continue
            q = input("🔍 Arama metni (çıkmak için 'q'): ").strip()
            if q.lower() in {'q','quit','exit'}:
                break
            if not q:
                continue
            try:
                limit = int(input("Kaç sonuç? (default 5): ") or 5)
            except:
                limit = 5

            if ch=="1":
                results = self.processor.search_semantic(q, limit=limit)
            else:
                daire = input("Daire filtresi (örn: '6.HukukDairesi', boş = none): ").strip()
                filters = {'daire': daire} if daire else None
                results = self.processor.advanced_search_with_filters(q, filters=filters, limit=limit)

            if not results:
                print("❌ Sonuç bulunamadı")
                continue

            print(f"\n📋 {len(results)} sonuç:")
            for i,r in enumerate(results,1):
                p=r['payload']
                print(f"\n{i}. Skor: {r['score']:.4f}")
                print(f"   Esas No: {p.get('esas_no','N/A')} | Karar No: {p.get('karar_no','N/A')}")
                print(f"   Daire: {p.get('daire','N/A')} | Tarih: {p.get('tarih','N/A')}")
                text_preview = (p.get('text','')[:300] + '...') if len(p.get('text',''))>300 else p.get('text','')
                print(f"   Metin: {text_preview}")
                print("-"*60)



In [None]:
def main():
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="bge_hybrid_chunks",
        EMBEDDING_DIM=512,
        BATCH_SIZE=100
    )

    pipeline = YargitayPipeline(config)

    while True:
        print("\n" + "="*60)
        print("🏛️ YARGITAY BGE-M3 SEMANTİK SİSTEM (Dense+Sparse)")
        print("="*60)
        print("1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)")
        print("2) İnteraktif arama")
        print("3) Koleksiyon bilgilerini göster")
        print("4) Çıkış")
        choice = input("Seçiminiz (1-4): ").strip()
        if choice=="1":
            csv_path = input(f"CSV yolu (enter ile default: {config.CSV_FILE}): ").strip() or config.CSV_FILE
            ok = pipeline.full_pipeline(csv_path)
            print("✅ Tamamlandı" if ok else "❌ Hata çıktı")
        elif choice=="2":
            pipeline.interactive_search()
        elif choice=="3":
            info = pipeline.processor.get_collection_info()
            print(json.dumps(info, indent=2, ensure_ascii=False))
        elif choice=="4":
            print("👋 Görüşürüz")
            break
        else:
            print("❌ Geçersiz seçim")

if __name__=="__main__":
    
    main()


### BOZUK

In [None]:
# main.py
# BGE-M3 + Qdrant Hybrid Search (Dense + Sparse Vectors)

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import (
    VectorParams, Distance, PointStruct, 
    SparseVectorParams, SparseIndexParams,
    NamedVector,
    Filter, FieldCondition, MatchValue
)
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict, Any, Tuple
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch

print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# -------------------------
# Helper: normalize tensor rows (L2)
# -------------------------
def l2_normalize_tensor(t: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    """L2 normalize tensor for cosine similarity"""
    if t.dim() == 1:
        norm = torch.norm(t).clamp(min=eps)
        return t / norm
    norm = torch.norm(t, dim=1, keepdim=True).clamp(min=eps)
    return t / norm

def convert_sparse_to_qdrant_format(sparse_vecs: List[Dict]) -> List[Dict]:
    """Convert BGE-M3 sparse format to Qdrant sparse format"""
    result = []
    for sparse_vec in sparse_vecs:
        if isinstance(sparse_vec, dict):
            indices = list(sparse_vec.keys())
            values = list(sparse_vec.values())
        else:
            # If it's already in indices/values format
            indices = sparse_vec.get('indices', [])
            values = sparse_vec.get('values', [])
        
        result.append({
            'indices': [int(idx) for idx in indices],
            'values': [float(val) for val in values]
        })
    return result

@dataclass
class Config:
    BGE_MODEL_NAME: str = "BAAI/bge-m3"
    USE_FP16: bool = True
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    TOKEN_SIZE: int = 512
    ENCODING_NAME: str = "cl100k_base"
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "yargitay_hybrid_search"
    DENSE_DIM: int = 512  # BGE-M3 dense vector dimension
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    BATCH_SIZE: int = 32
    DB_BATCH: int = 64
    # Hybrid search weights
    DENSE_WEIGHT: float = 0.7
    SPARSE_WEIGHT: float = 0.3

class YargitayHybridProcessor:
    def __init__(self, config: Config):
        self.config = config

        # Encoding & chunker
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # BGE-M3 Model
        print(f"🔮 BGE-M3 yükleniyor: {config.BGE_MODEL_NAME} (device={config.DEVICE})")
        self.bge_model = BGEM3FlagModel(
            config.BGE_MODEL_NAME, 
            use_fp16=config.USE_FP16, 
            device=config.DEVICE
        )

        # Qdrant
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"✅ Hazır - Cihaz: {device_name}")

    def test_bge_connection(self) -> bool:
        """Test BGE-M3 connection and show embedding dimensions"""
        try:
            test_text = ["Yargıtay 6. Hukuk Dairesi'nin ihtiyati tedbir kararı"]
            embeddings = self.bge_model.encode(
                test_text,
                return_dense=True,
                return_sparse=True,
                return_colbert_vecs=False
            )
            
            dense = embeddings['dense_vecs']
            sparse = embeddings['lexical_weights']
            
            print(f"✅ BGE-M3 test başarılı")
            print(f"📊 Dense embedding boyutu: {len(dense[0])}")
            print(f"📊 Sparse embedding token sayısı: {len(sparse[0])}")
            print(f"🔍 Dense sample: {dense[0][:5]}...")
            print(f"🔍 Sparse sample keys: {list(sparse[0].keys())[:5]}...")
            
            return True
        except Exception as e:
            print(f"❌ BGE-M3 bağlantı hatası: {e}")
            return False

    def create_qdrant_collection(self, recreate: bool = False):
        """Create Qdrant collection with hybrid vector support"""
        collection_name = self.config.COLLECTION_NAME
        
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except Exception:
                pass

        try:
            existing = [c.name for c in self.qdrant_client.get_collections().collections]
            if collection_name not in existing:
                # Create collection with named vectors (dense + sparse)
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config={
                        "dense": VectorParams(
                            size=self.config.DENSE_DIM,
                            distance=Distance.COSINE
                        )
                    },
                    sparse_vectors_config={
                        "sparse": SparseVectorParams(
                            index=SparseIndexParams()
                        )
                    }
                )
                print(f"✅ Hybrid koleksiyon oluşturuldu: {collection_name}")
                print(f"   Dense boyut: {self.config.DENSE_DIM}")
                print(f"   Sparse: Aktif")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {collection_name}")
        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Chunk text using semantic chunking"""
        if not text or not text.strip():
            return []
        try:
            chunks = self.chunker(text)
            result = []
            for i, c in enumerate(chunks):
                if c.strip():
                    cd = {
                        'chunk_id': i,
                        'text': c.strip(),
                        'token_count': len(self.encoding.encode(c)),
                        'char_count': len(c)
                    }
                    if metadata:
                        cd.update(metadata)
                    result.append(cd)
            return result
        except Exception as e:
            print(f"❌ Chunking hatası: {e}")
            return []

    def create_hybrid_embeddings(self, texts: List[str], batch_size: int = None) -> Tuple[List[List[float]], List[Dict]]:
        """Create both dense and sparse embeddings using BGE-M3"""
        batch_size = batch_size or self.config.BATCH_SIZE
        all_dense: List[List[float]] = []
        all_sparse: List[Dict] = []
        total = len(texts)
        
        print(f"🔮 BGE-M3 ile hybrid embedding oluşturuluyor: {total} metin (batch_size={batch_size})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                # Get both dense and sparse embeddings
                embeddings = self.bge_model.encode(
                    batch_texts,
                    return_dense=True,
                    return_sparse=True,
                    return_colbert_vecs=False
                )
                
                dense_vecs = embeddings['dense_vecs']
                sparse_vecs = embeddings['lexical_weights']
                
                # Process dense vectors (normalize)
                if not isinstance(dense_vecs, torch.Tensor):
                    dense_t = torch.tensor(dense_vecs, dtype=torch.float32, device=self.config.DEVICE)
                else:
                    dense_t = dense_vecs.to(self.config.DEVICE)
                
                with torch.no_grad():
                    normed_dense = l2_normalize_tensor(dense_t)
                
                # Convert to lists
                batch_dense = [v.cpu().tolist() for v in normed_dense]
                all_dense.extend(batch_dense)
                
                # Process sparse vectors
                batch_sparse = convert_sparse_to_qdrant_format(sparse_vecs)
                all_sparse.extend(batch_sparse)

                print(f"  📊 Batch işlendi: {i + len(batch_texts)}/{total}")

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except Exception as e:
                print(f"❌ Hybrid embedding hatası (batch {i//batch_size + 1}): {e}")
                # Fallback zero vectors
                fallback_dense = [[0.0] * self.config.DENSE_DIM for _ in batch_texts]
                fallback_sparse = [{'indices': [], 'values': []} for _ in batch_texts]
                all_dense.extend(fallback_dense)
                all_sparse.extend(fallback_sparse)

        print(f"✅ Hybrid embeddings oluşturuldu: {len(all_dense)} dense, {len(all_sparse)} sparse")
        
        # Debug: verify dense vector dimensions
        if all_dense:
            sample_dense_dim = len(all_dense[0])
            print(f"🔍 Dense vector boyutu kontrolü: {sample_dense_dim} (hedef: {self.config.DENSE_DIM})")
            if sample_dense_dim != self.config.DENSE_DIM:
                print(f"❌ Boyut uyumsuzluğu tespit edildi!")
        
        return all_dense, all_sparse

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """Process CSV file and create chunks"""
        print(f"📄 CSV okunuyor: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"📊 {len(df)} satır yüklendi")
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return []

        text_column = next((c for c in ['rawText', 'chunk_text', 'text', 'content', 'metin'] if c in df.columns), None)
        if not text_column:
            print("❌ Ana metin sütunu bulunamadı")
            return []

        all_chunks = []
        for idx, row in df.iterrows():
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            
            meta = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', ''),
                'document_id': row.get('_id', ''),
            }
            
            chunks = self.semantic_chunk_text(str(text), meta)
            all_chunks.extend(chunks)
            
            if (idx + 1) % 5 == 0:
                print(f"  ✅ İşlenen satır: {idx + 1}/{len(df)} (Toplam chunk: {len(all_chunks)})")

        print(f"🧩 Toplam {len(all_chunks)} chunk oluşturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        """Upload chunks with hybrid embeddings to Qdrant"""
        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return

        print(f"🚀 {len(chunks)} chunk hybrid embedding ile Qdrant'a yükleniyor...")
        texts = [c['text'] for c in chunks]
        dense_embeddings, sparse_embeddings = self.create_hybrid_embeddings(texts)

        if len(dense_embeddings) != len(chunks) or len(sparse_embeddings) != len(chunks):
            print(f"❌ Embedding sayısı uyumsuz")
            return

        points = []
        for i, (chunk, dense, sparse) in enumerate(zip(chunks, dense_embeddings, sparse_embeddings)):
            # Create vectors dictionary with named vectors
            vectors = {"dense": dense}
            
            # Only add sparse if it has data
            if sparse['indices'] and sparse['values']:
                vectors["sparse"] = {
                    "indices": sparse['indices'],
                    "values": sparse['values']
                }
            
            point = PointStruct(
                id=str(uuid.uuid4()),
                vector=vectors,
                payload=chunk
            )
            points.append(point)

        # Upload in batches
        batch_size = self.config.DB_BATCH
        for i in range(0, len(points), batch_size):
            try:
                batch_points = points[i:i+batch_size]
                self.qdrant_client.upsert(
                    collection_name=self.config.COLLECTION_NAME,
                    points=batch_points
                )
                print(f"  ✅ Batch yüklendi: {min(i+batch_size, len(points))}/{len(points)}")
            except Exception as e:
                print(f"❌ Batch yükleme hatası: {e}")

        print("🎉 Hybrid yükleme tamamlandı!")

    from qdrant_client.models import SearchRequest, NamedVector, NamedSparseVector, SparseVector

def hybrid_search(self, query: str, limit: int = 10, score_threshold: float = None, 
                  dense_weight: float = None, sparse_weight: float = None) -> List[Dict]:
    
    try:
        # Kullanılacak ağırlıkları al
        dense_weight = dense_weight or self.config.DENSE_WEIGHT
        sparse_weight = sparse_weight or self.config.SPARSE_WEIGHT

        print(f"🔍 Hybrid arama: dense_weight={dense_weight}, sparse_weight={sparse_weight}")

        # Query embeddings
        embeddings = self.bge_model.encode(
            [query],
            return_dense=True,
            return_sparse=True,
            return_colbert_vecs=False
        )
        query_dense = embeddings['dense_vecs'][0]
        query_sparse = embeddings['lexical_weights'][0]

        # Dense vector normalize
        query_dense_t = torch.tensor(query_dense, dtype=torch.float32, device=self.config.DEVICE)
        with torch.no_grad():
            query_dense_norm = l2_normalize_tensor(query_dense_t).cpu().tolist()

        # Sparse vector Qdrant format
        query_sparse_qdrant = SparseVector(
            indices=[int(idx) for idx in query_sparse.keys()],
            values=[float(val) for val in query_sparse.values()]
        )

        # Search batch
        search_requests = [
            SearchRequest(
                vector=NamedVector(
                    name="dense",
                    vector=query_dense_norm
                ),
                limit=limit * 2,
                score_threshold=score_threshold,
                with_payload=True
            ),
            SearchRequest(
                vector=NamedSparseVector(
                    name="sparse",
                    vector=query_sparse_qdrant
                ),
                limit=limit * 2,
                score_threshold=score_threshold,
                with_payload=True
            )
        ]

        results = self.qdrant_client.search_batch(
            collection_name=self.config.COLLECTION_NAME,
            requests=search_requests
        )

        dense_results = results[0]
        sparse_results = results[1]

        # Combine results with weights
        combined_scores = {}
        for r in dense_results:
            combined_scores[r.id] = {
                'dense_score': r.score * dense_weight,
                'sparse_score': 0,
                'payload': r.payload
            }
        for r in sparse_results:
            if r.id in combined_scores:
                combined_scores[r.id]['sparse_score'] = r.score * sparse_weight
            else:
                combined_scores[r.id] = {
                    'dense_score': 0,
                    'sparse_score': r.score * sparse_weight,
                    'payload': r.payload
                }

        final_results = []
        for point_id, scores in combined_scores.items():
            final_score = scores['dense_score'] + scores['sparse_score']
            final_results.append({
                'score': final_score,
                'dense_score': scores['dense_score'],
                'sparse_score': scores['sparse_score'],
                'payload': scores['payload']
            })

        # Sort by final score and limit
        final_results.sort(key=lambda x: x['score'], reverse=True)
        return final_results[:limit]

    except Exception as e:
        print(f"❌ Hybrid arama hatası: {e}")
        import traceback
        traceback.print_exc()
        return []

    def search_dense_only(self, query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """Search using only dense vectors"""
        try:
            query_embeddings = self.bge_model.encode([query], return_dense=True, return_sparse=False)
            query_dense = query_embeddings['dense_vecs'][0]
            
            # Normalize
            query_dense_t = torch.tensor(query_dense, dtype=torch.float32, device=self.config.DEVICE)
            with torch.no_grad():
                query_dense_norm = l2_normalize_tensor(query_dense_t).cpu().tolist()
            
            results = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=("dense", query_dense_norm),
                limit=limit,
                score_threshold=score_threshold
            )
            
            return [{'score': r.score, 'payload': r.payload} for r in results]
            
        except Exception as e:
            print(f"❌ Dense arama hatası: {e}")
            return []

    def search_sparse_only(self, query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """Search using only sparse vectors"""
        try:
            query_embeddings = self.bge_model.encode([query], return_dense=False, return_sparse=True)
            query_sparse = query_embeddings['lexical_weights'][0]
            
            query_sparse_qdrant = {
                "name": "sparse",
                "indices": [int(idx) for idx in query_sparse.keys()],
                "values": [float(val) for val in query_sparse.values()]
            }
            
            results = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_sparse_qdrant,
                limit=limit,
                score_threshold=score_threshold
            )
            
            return [{'score': r.score, 'payload': r.payload} for r in results]
            
        except Exception as e:
            print(f"❌ Sparse arama hatası: {e}")
            return []

    def get_collection_info(self) -> dict:
        """Get collection information"""
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                "vectors_count": info.vectors_count,
                "status": info.status,
                "embedding_model": "BGE-M3 Hybrid",
                "dense_dim": self.config.DENSE_DIM,
                "sparse_enabled": True,
                "dense_weight": self.config.DENSE_WEIGHT,
                "sparse_weight": self.config.SPARSE_WEIGHT
            }
        except Exception as e:
            return {"error": str(e)}

# -------------------------
# Pipeline
# -------------------------
class YargitayHybridPipeline:
    def __init__(self, config: Config):
        self.processor = YargitayHybridProcessor(config)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        """Run full pipeline: CSV -> chunks -> hybrid embeddings -> Qdrant"""
        csv_path = csv_path or self.config.CSV_FILE
        print("🚀 Hybrid pipeline başlıyor")
        
        if not self.processor.test_bge_connection():
            return False
        
        self.processor.create_qdrant_collection(recreate=True)
        chunks = self.processor.process_csv_file(csv_path)
        
        if not chunks:
            print("❌ Chunk bulunamadı")
            return False
        
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        
        print("\n📊 Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    def interactive_search(self):
        """Interactive search interface"""
        print("\n🔎 İnteraktif hybrid arama başlatıldı")
        
        while True:
            print("\n" + "="*50)
            print("1) Hybrid arama (Dense + Sparse)")
            print("2) Sadece Dense arama")
            print("3) Sadece Sparse arama")
            print("4) Hybrid ağırlık ayarları")
            print("5) Ana menü")
            
            choice = input("Seçiminiz (1-5): ").strip()
            
            if choice == "5":
                break
            
            if choice == "4":
                try:
                    dense_w = float(input(f"Dense ağırlık (mevcut: {self.config.DENSE_WEIGHT}): ") or self.config.DENSE_WEIGHT)
                    sparse_w = float(input(f"Sparse ağırlık (mevcut: {self.config.SPARSE_WEIGHT}): ") or self.config.SPARSE_WEIGHT)
                    self.config.DENSE_WEIGHT = dense_w
                    self.config.SPARSE_WEIGHT = sparse_w
                    print(f"✅ Ağırlıklar güncellendi: Dense={dense_w}, Sparse={sparse_w}")
                except ValueError:
                    print("❌ Geçersiz değer")
                continue
            
            if choice not in {"1", "2", "3"}:
                print("❌ Geçersiz seçim")
                continue
            
            query = input("🔍 Arama metni (çıkmak için 'q'): ").strip()
            if query.lower() in {'q', 'quit', 'exit'}:
                break
            
            if not query:
                continue
            
            try:
                limit = int(input("Kaç sonuç? (default 5): ") or 5)
            except:
                limit = 5
            
            # Perform search based on choice
            if choice == "1":
                results = self.processor.hybrid_search(query, limit=limit)
            elif choice == "2":
                results = self.processor.search_dense_only(query, limit=limit)
            elif choice == "3":
                results = self.processor.search_sparse_only(query, limit=limit)
            
            if not results:
                print("❌ Sonuç bulunamadı")
                continue

            print(f"\n📋 {len(results)} sonuç ({['Hybrid', 'Dense Only', 'Sparse Only'][int(choice)-1]} arama):")
            
            for i, r in enumerate(results, 1):
                p = r['payload']
                print(f"\n{i}. Skor: {r['score']:.4f}")
                
                # Show component scores for hybrid search
                if choice == "1" and 'dense_score' in r and 'sparse_score' in r:
                    print(f"   (Dense: {r['dense_score']:.4f}, Sparse: {r['sparse_score']:.4f})")
                
                print(f"   Esas No: {p.get('esas_no','N/A')} | Karar No: {p.get('karar_no','N/A')}")
                print(f"   Daire: {p.get('daire','N/A')} | Tarih: {p.get('tarih','N/A')}")
                
                text_preview = (p.get('text','')[:300] + '...') if len(p.get('text','')) > 300 else p.get('text','')
                print(f"   Metin: {text_preview}")
                print("-"*60)

def main():
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="yargitay_hybrid_search",
        DENSE_DIM=512,  # 512-dimensional dense vectors
        BATCH_SIZE=32,
        DB_BATCH=64,
        DENSE_WEIGHT=0.7,
        SPARSE_WEIGHT=0.3
    )

    pipeline = YargitayHybridPipeline(config)

    while True:
        print("\n" + "="*60)
        print("🏛️ YARGITAY BGE-M3 HYBRID SEARCH SİSTEMİ")
        print("="*60)
        print("1) Tam pipeline çalıştır (CSV -> chunks -> hybrid embed -> qdrant)")
        print("2) İnteraktif arama")
        print("3) Koleksiyon bilgilerini göster")
        print("4) Çıkış")
        
        choice = input("Seçiminiz (1-4): ").strip()
        
        if choice == "1":
            csv_path = input(f"CSV yolu (enter ile default: {config.CSV_FILE}): ").strip() or config.CSV_FILE
            success = pipeline.full_pipeline(csv_path)
            print("✅ Tamamlandı" if success else "❌ Hata oluştu")
        
        elif choice == "2":
            pipeline.interactive_search()
        
        elif choice == "3":
            info = pipeline.processor.get_collection_info()
            print(json.dumps(info, indent=2, ensure_ascii=False))
        
        elif choice == "4":
            print("👋 Görüşürüz")
            break
        
        else:
            print("❌ Geçersiz seçim")

if __name__ == "__main__":
    try:
        from FlagEmbedding import BGEM3FlagModel
        print("✅ FlagEmbedding yüklü")
    except ImportError:
        print("❌ FlagEmbedding bulunamadı — pip install FlagEmbedding")
        raise SystemExit(1)
    
    main()

### HİBRİT

In [None]:
# main.py
# SemChunk + BGE-M3 + Qdrant Entegrasyon (Dense + Sparse, 512 dim slice, L2 normalize, hibrit search)

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, HnswConfigDiff
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch

print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# -------------------------
# Helper: normalize tensor rows (L2)
# -------------------------
def l2_normalize_tensor(t: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    if t.dim() == 1:
        norm = torch.norm(t).clamp(min=eps)
        return t / norm
    norm = torch.norm(t, dim=1, keepdim=True).clamp(min=eps)
    return t / norm

@dataclass
class Config:
    BGE_MODEL_NAME: str = "BAAI/bge-m3"
    USE_FP16: bool = True
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    TOKEN_SIZE: int = 512
    ENCODING_NAME: str = "cl100k_base"
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "yargitay_bge_m3_chunks"
    EMBEDDING_DIM: int = 512
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    BATCH_SIZE: int = 100
    DB_BATCH: int = 256

class YargitaySemanticProcessor:
    def __init__(self, config: Config):
        self.config = config

        # Encoding & chunker
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # Model
        print(f"🔮 BGE-M3 yükleniyor: {config.BGE_MODEL_NAME} (device={config.DEVICE})")
        self.bge_model = BGEM3FlagModel(config.BGE_MODEL_NAME, use_fp16=config.USE_FP16, device=config.DEVICE)

        # Qdrant
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"✅ Hazır - Cihaz: {device_name}")

    # Test connection & print dense+sparse
    def test_bge_connection(self):
        try:
            test_text = ["Yargıtay 6. Hukuk Dairesi'nin ihtiyati tedbir kararı"]
            emb_res = self.bge_model.encode(test_text)
            dense = emb_res['dense_vecs'][0] if isinstance(emb_res, dict) and 'dense_vecs' in emb_res else emb_res[0]
            sparse_available = 'colbert_vecs' in emb_res
            print(f"✅ Dense embedding boyutu: {len(dense)}")
            print(f"🔍 Sparse embedding mevcut: {sparse_available}")
            return len(dense)
        except Exception as e:
            print(f"❌ BGE-M3 bağlantı hatası: {e}")
            return None

    def create_qdrant_collection(self, recreate: bool = False):
        collection_name = self.config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except Exception:
                pass

        try:
            existing = [c.name for c in self.qdrant_client.get_collections().collections]
            if collection_name not in existing:
                # Dense + Sparse (sparse için yine 512 dim)
                vectors_config = {
                    "dense_vec": VectorParams(size=self.config.EMBEDDING_DIM, distance=Distance.COSINE),
                    "sparse_vec": VectorParams(size=self.config.EMBEDDING_DIM, distance=Distance.COSINE)
                }
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=vectors_config
                )
                print(f"✅ Koleksiyon oluşturuldu: {collection_name} (Dense+Sparse)")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {collection_name}")
        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        if not text or not text.strip():
            return []
        try:
            chunks = self.chunker(text)
            result = []
            for i, c in enumerate(chunks):
                if c.strip():
                    cd = {
                        'chunk_id': i,
                        'text': c.strip(),
                        'token_count': len(self.encoding.encode(c)),
                        'char_count': len(c)
                    }
                    if metadata:
                        cd.update(metadata)
                    result.append(cd)
            return result
        except Exception as e:
            print(f"❌ Chunking hatası: {e}")
            return []

    def create_embeddings_bge(self, texts: List[str], batch_size: int = None):
        batch_size = batch_size or self.config.BATCH_SIZE
        all_embeddings_dense, all_embeddings_sparse = [], []
        total = len(texts)
        print(f"🔮 {total} metin işleniyor (batch_size={batch_size})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                emb_res = self.bge_model.encode(batch_texts)

                dense = emb_res.get('dense_vecs', emb_res)
                sparse = emb_res.get('colbert_vecs', [np.zeros(self.config.EMBEDDING_DIM)]*len(batch_texts))

                dense_t = torch.tensor(dense, dtype=torch.float32, device=self.config.DEVICE)
                sparse_t = torch.tensor(sparse, dtype=torch.float32, device=self.config.DEVICE)

                with torch.no_grad():
                    dense_slice = dense_t[:, :self.config.EMBEDDING_DIM]
                    dense_norm = l2_normalize_tensor(dense_slice)
                    sparse_slice = sparse_t[:, :self.config.EMBEDDING_DIM]
                    sparse_norm = l2_normalize_tensor(sparse_slice)

                all_embeddings_dense.extend([v.cpu().tolist() for v in dense_norm])
                all_embeddings_sparse.extend([v.cpu().tolist() for v in sparse_norm])

                print(f"  📊 Batch işlendi: {i + len(batch_texts)}/{total}")

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"❌ Embedding hatası (batch {i//batch_size+1}): {e}")
                all_embeddings_dense.extend([[0.0]*self.config.EMBEDDING_DIM for _ in batch_texts])
                all_embeddings_sparse.extend([[0.0]*self.config.EMBEDDING_DIM for _ in batch_texts])

        return all_embeddings_dense, all_embeddings_sparse

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        print(f"📄 CSV okunuyor: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"📊 {len(df)} satır yüklendi")
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return []

        text_column = next((c for c in ['rawText', 'chunk_text', 'text', 'content', 'metin'] if c in df.columns), None)
        if not text_column:
            print("❌ Ana metin sütunu bulunamadı")
            return []

        all_chunks = []
        for idx, row in df.iterrows():
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            meta = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', ''),
                'document_id': row.get('_id', ''),
            }
            chunks = self.semantic_chunk_text(str(text), meta)
            all_chunks.extend(chunks)
            if (idx+1)%5==0:
                print(f"  ✅ İşlenen satır: {idx+1}/{len(df)} (Toplam chunk: {len(all_chunks)})")

        print(f"🧩 Toplam {len(all_chunks)} chunk oluşturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return

        print(f"🚀 {len(chunks)} chunk Qdrant'a yükleniyor...")
        texts = [c['text'] for c in chunks]
        embeddings_dense, embeddings_sparse = self.create_embeddings_bge(texts)

        points = []
        
        for c, d, s in zip(chunks, embeddings_dense, embeddings_sparse):
            vectors={'dense_vec': d, 'sparse_vec': s} 
            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector=vectors,
                payload=c,
            ))

        batch = self.config.DB_BATCH
        for i in range(0, len(points), batch):
            try:
                self.qdrant_client.upsert(collection_name=self.config.COLLECTION_NAME, points=points[i:i+batch])
                print(f"  ✅ Batch yüklendi: {min(i+batch,len(points))}/{len(points)}")
            except Exception as e:
                print(f"❌ Batch yükleme hatası: {e}")

        print("🎉 Yükleme tamamlandı!")

    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = None):
        try:
            emb_res = self.bge_model.encode([query])
            dense_q = emb_res.get('dense_vecs', emb_res)
            if dense_q is None or any(v is None for v in dense_q[0]):
                dense_q = [np.zeros(self.config.EMBEDDING_DIM)]

            sparse_q = emb_res.get('colbert_vecs', [np.zeros(self.config.EMBEDDING_DIM)])
            if sparse_q is None or any(v is None for v in sparse_q[0]):
                sparse_q = [np.zeros(self.config.EMBEDDING_DIM)]


            dense_t = torch.tensor(dense_q, dtype=torch.float32, device=self.config.DEVICE)
            sparse_t = torch.tensor(sparse_q, dtype=torch.float32, device=self.config.DEVICE)

            with torch.no_grad():
                dense_norm = l2_normalize_tensor(dense_t[:, :self.config.EMBEDDING_DIM])
                sparse_norm = l2_normalize_tensor(sparse_t[:, :self.config.EMBEDDING_DIM])

            query_vector = dense_norm[0].cpu().tolist()
            query_sparse = sparse_norm[0].cpu().tolist()

            qr = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vector,
                query_filter=None,
                limit=limit,
                with_payload=True,
                params={"hnsw_ef": 128},
                vector_name="dense_vec",
                # Sparse hibrit param Qdrant 1.2+
                query_vector_sparse=query_sparse,
                vector_name_sparse="sparse_vec",
                score_threshold=score_threshold
            )

            results = [{'score': p.score, 'payload': p.payload} for p in qr]
            print(f"📊 {len(results)} sonuç bulundu")
            return results
        except Exception as e:
            print(f"❌ Arama hatası: {e}")
            return []

    def advanced_search_with_filters(self, query: str, filters: Dict = None, limit: int = 10, score_threshold: float = None):
        try:
            emb_res = self.bge_model.encode([query])
            dense_q = emb_res.get('dense_vecs', emb_res)
            sparse_q = emb_res.get('colbert_vecs', [np.zeros(self.config.EMBEDDING_DIM)])

            dense_t = torch.tensor(dense_q, dtype=torch.float32, device=self.config.DEVICE)
            sparse_t = torch.tensor(sparse_q, dtype=torch.float32, device=self.config.DEVICE)

            with torch.no_grad():
                dense_norm = l2_normalize_tensor(dense_t[:, :self.config.EMBEDDING_DIM])
                sparse_norm = l2_normalize_tensor(sparse_t[:, :self.config.EMBEDDING_DIM])

            query_vector = dense_norm[0].cpu().tolist()
            query_sparse = sparse_norm[0].cpu().tolist()

            query_filter = None
            if filters:
                from qdrant_client.models import Filter, FieldCondition, MatchValue
                conditions = [FieldCondition(key=k, match=MatchValue(value=v)) for k,v in filters.items()]
                query_filter = Filter(must=conditions)

            qr = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vector,
                query_filter=query_filter,
                limit=limit,
                with_payload=True,
                params={"hnsw_ef": 128},
                vector_name="dense_vec",
                query_vector_sparse=query_sparse,
                vector_name_sparse="sparse_vec",
                score_threshold=score_threshold
            )

            results = [{'score': p.score, 'payload': p.payload} for p in qr]
            print(f"📊 {len(results)} filtreli sonuç bulundu")
            return results
        except Exception as e:
            print(f"❌ Filtreli arama hatası: {e}")
            return []

    def get_collection_info(self):
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                "vectors_count": info.vectors_count,
                "status": info.status,
                "embedding_model": "BGE-M3",
                "embedding_dim": self.config.EMBEDDING_DIM
            }
        except Exception as e:
            return {"error": str(e)}

# -------------------------
# Pipeline
# -------------------------
class YargitayPipeline:
    def __init__(self, config: Config):
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        csv_path = csv_path or self.config.CSV_FILE
        print("🚀 Full pipeline başlıyor")
        emb_dim = self.processor.test_bge_connection()
        if not emb_dim:
            return False
        self.processor.create_qdrant_collection(recreate=True)
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("❌ Chunk bulunamadı")
            return False
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        print("\n📊 Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    def interactive_search(self):
        print("\n🔎 İnteraktif arama başlatıldı")
        while True:
            print("\n1) Basit arama\n2) Filtreli arama\n3) Ana menü")
            ch = input("Seçiminiz (1-3): ").strip()
            if ch=="3":
                break
            if ch not in {"1","2"}:
                print("❌ Geçersiz seçim")
                continue
            q = input("🔍 Arama metni (çıkmak için 'q'): ").strip()
            if q.lower() in {'q','quit','exit'}:
                break
            if not q:
                continue
            try:
                limit = int(input("Kaç sonuç? (default 5): ") or 5)
            except:
                limit = 5

            if ch=="1":
                results = self.processor.search_semantic(q, limit=limit)
            else:
                daire = input("Daire filtresi (örn: '6.HukukDairesi', boş = none): ").strip()
                filters = {'daire': daire} if daire else None
                results = self.processor.advanced_search_with_filters(q, filters=filters, limit=limit)

            if not results:
                print("❌ Sonuç bulunamadı")
                continue

            print(f"\n📋 {len(results)} sonuç:")
            for i,r in enumerate(results,1):
                p=r['payload']
                print(f"\n{i}. Skor: {r['score']:.4f}")
                print(f"   Esas No: {p.get('esas_no','N/A')} | Karar No: {p.get('karar_no','N/A')}")
                print(f"   Daire: {p.get('daire','N/A')} | Tarih: {p.get('tarih','N/A')}")
                text_preview = (p.get('text','')[:300] + '...') if len(p.get('text',''))>300 else p.get('text','')
                print(f"   Metin: {text_preview}")
                print("-"*60)

def main():
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="bge_hybrid_chunks",
        EMBEDDING_DIM=512,
        BATCH_SIZE=100
    )

    pipeline = YargitayPipeline(config)

    while True:
        print("\n" + "="*60)
        print("🏛️ YARGITAY BGE-M3 SEMANTİK SİSTEM (Dense+Sparse)")
        print("="*60)
        print("1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)")
        print("2) İnteraktif arama")
        print("3) Koleksiyon bilgilerini göster")
        print("4) Çıkış")
        choice = input("Seçiminiz (1-4): ").strip()
        if choice=="1":
            csv_path = input(f"CSV yolu (enter ile default: {config.CSV_FILE}): ").strip() or config.CSV_FILE
            ok = pipeline.full_pipeline(csv_path)
            print("✅ Tamamlandı" if ok else "❌ Hata çıktı")
        elif choice=="2":
            pipeline.interactive_search()
        elif choice=="3":
            info = pipeline.processor.get_collection_info()
            print(json.dumps(info, indent=2, ensure_ascii=False))
        elif choice=="4":
            print("👋 Görüşürüz")
            break
        else:
            print("❌ Geçersiz seçim")

if __name__=="__main__":
    try:
        from FlagEmbedding import BGEM3FlagModel
        print("✅ FlagEmbedding yüklü")
    except ImportError:
        print("❌ FlagEmbedding bulunamadı — pip install FlagEmbedding")
        raise SystemExit(1)
    main()