### full kod

In [1]:
# main.py
# SemChunk + BGE-M3 + Qdrant Entegrasyon (Dense + Sparse, 512 dim slice, L2 normalize, hibrit search)

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, HnswConfigDiff
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict
import os
from qdrant_client import models
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch
from typing import List, Dict
from qdrant_client.models import NamedVector, NamedSparseVector, SparseVectorParams, SparseVector
from sklearn.feature_extraction.text import TfidfVectorizer
from qdrant_client.http.models import NamedVector, NamedSparseVector, SparseVector, SearchRequest


print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

def l2_normalize_tensor(t: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    if t.dim() == 1:
        norm = torch.norm(t).clamp(min=eps)
        return t / norm
    norm = torch.norm(t, dim=1, keepdim=True).clamp(min=eps)
    return t / norm

@dataclass
class Config:
    BGE_MODEL_NAME: str = "BAAI/bge-m3"
    USE_FP16: bool = True
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    TOKEN_SIZE: int = 512
    ENCODING_NAME: str = "cl100k_base"
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "yargitay_bge_m3_chunks"
    EMBEDDING_DIM: int = 512
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    BATCH_SIZE: int = 100
    DB_BATCH: int = 256
class YargitaySemanticProcessor:
    def __init__(self, config: Config):
        self.config = config

        # Encoding & chunker
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # Model
        print(f"üîÆ BGE-M3 y√ºkleniyor: {config.BGE_MODEL_NAME} (device={config.DEVICE})")
        self.bge_model = BGEM3FlagModel(config.BGE_MODEL_NAME, use_fp16=config.USE_FP16, device=config.DEVICE)

        # Qdrant
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"‚úÖ Hazƒ±r - Cihaz: {device_name}")

    # Test connection & print dense+sparse
    def test_bge_connection(self):
        try:
            test_text = ["Yargƒ±tay 6. Hukuk Dairesi'nin ihtiyati tedbir kararƒ±"]
            emb_res = self.bge_model.encode(test_text)
            dense = emb_res['dense_vecs'][0] if isinstance(emb_res, dict) and 'dense_vecs' in emb_res else emb_res[0]
            sparse_available = 'colbert_vecs' in emb_res
            print(f"‚úÖ Dense embedding boyutu: {len(dense)}")
            print(f"üîç Sparse embedding mevcut: {sparse_available}")
            return len(dense)
        except Exception as e:
            print(f"‚ùå BGE-M3 baƒülantƒ± hatasƒ±: {e}")
            return None

    def create_qdrant_collection(self, recreate: bool = False):
        collection_name = self.config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"üóëÔ∏è Eski koleksiyon silindi: {collection_name}")
            except Exception:
                pass

        try:
            existing = [c.name for c in self.qdrant_client.get_collections().collections]
            if collection_name not in existing:
                # Dense + Sparse (sparse i√ßin yine 512 dim)
                vectors_config = {
                    "dense_vec": models.VectorParams(size=self.config.EMBEDDING_DIM, distance=models.Distance.COSINE),
                }
                sparse_config = {
                    "sparse_vec": models.SparseVectorParams(
                        index=models.SparseIndexParams(on_disk=False))
                }
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=vectors_config,
                    sparse_vectors_config = sparse_config
                )
                print(f"‚úÖ Koleksiyon olu≈üturuldu: {collection_name} (Dense+Sparse)")
            else:
                print(f"‚ÑπÔ∏è Koleksiyon zaten var: {collection_name}")
        except Exception as e:
            print(f"‚ùå Koleksiyon olu≈üturma hatasƒ±: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        if not text or not text.strip():
            return []
        try:
            chunks = self.chunker(text)
            result = []
            for i, c in enumerate(chunks):
                if c.strip():
                    cd = {
                        'chunk_id': i,
                        'text': c.strip(),
                        'token_count': len(self.encoding.encode(c)),
                        'char_count': len(c)
                    }
                    if metadata:
                        cd.update(metadata)
                    result.append(cd)
            return result
        except Exception as e:
            print(f"‚ùå Chunking hatasƒ±: {e}")
            return []

    def create_embeddings_bge(self, texts: List[str], batch_size: int = None):
        batch_size = batch_size or self.config.BATCH_SIZE
        all_embeddings_dense, all_embeddings_sparse = [], []
        total = len(texts)
        print(f"üîÆ {total} metin i≈üleniyor (batch_size={batch_size})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                # Model dense embedding √ºret
                emb_res = self.bge_model.encode(
                    batch_texts,
                    return_dense=True,
                    return_sparse=True
                )
                dense = emb_res.get("dense_vecs", [[0.0]*self.config.EMBEDDING_DIM for _ in batch_texts])

                # Dense i√ßinde None veya kƒ±sa vekt√∂r varsa d√ºzelt
                dense_clean = []
                for vec in dense:
                    if vec is None:
                        dense_clean.append([0.0]*self.config.EMBEDDING_DIM)
                    elif len(vec) < self.config.EMBEDDING_DIM:
                        dense_clean.append(vec + [0.0]*(self.config.EMBEDDING_DIM - len(vec)))
                    else:
                        dense_clean.append(vec[:self.config.EMBEDDING_DIM])

                # TF-IDF ile sparse embedding √ºret
                from sklearn.feature_extraction.text import TfidfVectorizer
                vectorizer = TfidfVectorizer(max_features=5000)
                X_sparse = vectorizer.fit_transform(batch_texts)
                sparse_vectors = []
                for row in X_sparse:
                    row_coo = row.tocoo()
                    sparse_vectors.append({"indices": row_coo.col.tolist(), "values": row_coo.data.tolist()})

                # Listeye ekle
                all_embeddings_dense.extend(dense_clean)
                all_embeddings_sparse.extend(sparse_vectors)

                print(f"  üìä Batch i≈ülendi: {i + len(batch_texts)}/{total}")

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"‚ùå Embedding hatasƒ± (batch {i//batch_size+1}): {e}")
                all_embeddings_dense.extend([[0.0]*self.config.EMBEDDING_DIM for _ in batch_texts])
                all_embeddings_sparse.extend([{"indices": [], "values": []} for _ in batch_texts])

        return all_embeddings_dense, all_embeddings_sparse



    def process_csv_file(self, csv_path: str) -> List[Dict]:
        print(f"üìÑ CSV okunuyor: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"üìä {len(df)} satƒ±r y√ºklendi")
        except Exception as e:
            print(f"‚ùå CSV okuma hatasƒ±: {e}")
            return []

        text_column = next((c for c in ['rawText', 'chunk_text', 'text', 'content', 'metin'] if c in df.columns), None)
        if not text_column:
            print("‚ùå Ana metin s√ºtunu bulunamadƒ±")
            return []

        all_chunks = []
        for idx, row in df.iterrows():
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            meta = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', ''),
                'document_id': row.get('_id', ''),
            }
            chunks = self.semantic_chunk_text(str(text), meta)
            all_chunks.extend(chunks)
            if (idx+1)%5==0:
                print(f"  ‚úÖ ƒ∞≈ülenen satƒ±r: {idx+1}/{len(df)} (Toplam chunk: {len(all_chunks)})")

        print(f"üß© Toplam {len(all_chunks)} chunk olu≈üturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        if not chunks:
            print("‚ùå Y√ºklenecek chunk yok")
            return

        print(f"üöÄ {len(chunks)} chunk Qdrant'a y√ºkleniyor...")
        texts = [c['text'] for c in chunks]
        embeddings_dense, embeddings_sparse = self.create_embeddings_bge(texts)

        points = []
        
        for c, d, s in zip(chunks, embeddings_dense, embeddings_sparse):
            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector={
                    "dense_vec": d,
                    "sparse_vec": SparseVector(
                        indices=s["indices"],
                        values=s["values"]
                    )
                },
                payload=c,
            ))


        batch = self.config.DB_BATCH
        for i in range(0, len(points), batch):
            try:
                self.qdrant_client.upsert(collection_name=self.config.COLLECTION_NAME, points=points[i:i+batch])
                print(f"  ‚úÖ Batch y√ºklendi: {min(i+batch,len(points))}/{len(points)}")
            except Exception as e:
                print(f"‚ùå Batch y√ºkleme hatasƒ±: {e}")

        print("üéâ Y√ºkleme tamamlandƒ±!")

    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """
        Dense-only semantic search
        """
        try:
            emb_res = self.bge_model.encode([query])
            q_dense = emb_res['dense_vecs'] if isinstance(emb_res, dict) and 'dense_vecs' in emb_res else emb_res

            # Tensor -> first 512 dims -> list
            q_t = torch.tensor(q_dense, dtype=torch.float32, device=self.config.DEVICE)
            q_sliced = q_t[0, :self.config.EMBEDDING_DIM]
            query_v = NamedVector(
                name="dense_vec",
                vector=q_sliced.cpu().tolist()
            )

            qr = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_v,
                limit=limit,
                with_payload=True,
                #vector_name="dense_vec",
                score_threshold=score_threshold
            )

            results = [{"score": p.score, "payload": p.payload} for p in qr]
            print(f"üìä {len(results)} sonu√ß bulundu (Dense only)")
            return results

        except Exception as e:
            print(f"‚ùå Semantic search hatasƒ±: {e}")
            return []

   # from qdrant_client.models import NamedVector, NamedSparseVector, SparseVector, SearchRequest

    def search_hybrid(self, query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """
        Dense + Sparse (TF-IDF) hybrid search
        """
        try:
            # --- Dense tarafƒ± (BGE embeddings) ---
            emb_res = self.bge_model.encode(
                [query],
                return_dense=True,
                return_sparse=True
            )

            # Dense vekt√∂r
            q_dense = emb_res.get("dense_vecs", [[0.0]*self.config.EMBEDDING_DIM])[0]
            q_dense = q_dense[:self.config.EMBEDDING_DIM]  # boyut kƒ±rpma
            query_dense = NamedVector(
                name="dense_vec",
                vector=q_dense
            )

            # Sparse vekt√∂r (senin TF-IDF √ßƒ±ktƒ±n)
            query_sparse = None
            sparse_raw = emb_res.get("sparse_vecs", [None])[0]
            if sparse_raw and "indices" in sparse_raw and "values" in sparse_raw:
                query_sparse = NamedSparseVector(
                    name="sparse_vec",
                    vector=SparseVector(
                        indices=sparse_raw["indices"],
                        values=sparse_raw["values"]
                    )
                )

            # --- Search requests ---
            requests = [SearchRequest(vector=query_dense, limit=limit, with_payload=True, score_threshold=score_threshold)]
            if query_sparse:
                requests.append(SearchRequest(vector=query_sparse, limit=limit, score_threshold=score_threshold))

            qr = self.qdrant_client.search_batch(
                collection_name=self.config.COLLECTION_NAME,
                requests=requests,
            )

            # --- Sonu√ßlarƒ± topla ---
            results = []
            for request_result in qr:  # her request_result: List[ScoredPoint]
                for scored_point in request_result:
                    results.append({
                        "score": scored_point.score,
                        "payload": scored_point.payload
                    })

            print(f"üìä {len(results)} sonu√ß bulundu (Dense + TF-IDF Sparse)")
            return results

        except Exception as e:
            print(f"‚ùå Hybrid search hatasƒ±: {e}")
            return []

    def advanced_search_with_filters(self, query: str, filters: Dict = None, limit: int = 10, score_threshold: float = None):
        try:
            
            emb_res = self.bge_model.encode([query])
            dense_q = emb_res.get('dense_vecs', emb_res)
            sparse_q = emb_res.get('colbert_vecs', [np.zeros(self.config.EMBEDDING_DIM)])

            dense_t = torch.tensor(dense_q, dtype=torch.float32, device=self.config.DEVICE)
            sparse_t = torch.tensor(sparse_q, dtype=torch.float32, device=self.config.DEVICE)

            with torch.no_grad():
                dense_norm = l2_normalize_tensor(dense_t[:, :self.config.EMBEDDING_DIM])
                sparse_norm = l2_normalize_tensor(sparse_t[:, :self.config.EMBEDDING_DIM])

            query_vector = dense_norm[0].cpu().tolist()
            query_sparse = sparse_norm[0].cpu().tolist()

            query_filter = None
            if filters:
                from qdrant_client.models import Filter, FieldCondition, MatchValue
                conditions = [FieldCondition(key=k, match=MatchValue(value=v)) for k,v in filters.items()]
                query_filter = Filter(must=conditions)

            qr = self.qdrant_client.search(  #hibrit armada search_batch olucak bura
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vector,
                query_filter=query_filter,
                limit=limit,
                with_payload=True,
                params={"hnsw_ef": 128},
                vector_name="dense_vec",
                query_vector_sparse=query_sparse,
                vector_name_sparse="sparse_vec",
                score_threshold=score_threshold
            )

            results = [{'score': p.score, 'payload': p.payload} for p in qr]
            print(f"üìä {len(results)} filtreli sonu√ß bulundu")
            return results
        except Exception as e:
            print(f"‚ùå Filtreli arama hatasƒ±: {e}")
            return []

    def get_collection_info(self):
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                "vectors_count": info.vectors_count,
                "status": info.status,
                "embedding_model": "BGE-M3",
                "embedding_dim": self.config.EMBEDDING_DIM
            }
        except Exception as e:
            return {"error": str(e)}

class YargitayPipeline:
    def __init__(self, config: Config):
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        csv_path = csv_path or self.config.CSV_FILE
        print("üöÄ Full pipeline ba≈ülƒ±yor")
        emb_dim = self.processor.test_bge_connection()
        if not emb_dim:
            return False
        self.processor.create_qdrant_collection(recreate=True)
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("‚ùå Chunk bulunamadƒ±")
            return False
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        print("\nüìä Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    def interactive_search(self):
        print("\nüîé ƒ∞nteraktif arama ba≈ülatƒ±ldƒ±")
        while True:
            print("\n1) Basit arama\n2) Filtreli arama\n3) Ana men√º")
            ch = input("Se√ßiminiz (1-3): ").strip()
            if ch=="3":
                break
            if ch not in {"1","2"}:
                print("‚ùå Ge√ßersiz se√ßim")
                continue
            q = input("üîç Arama metni (√ßƒ±kmak i√ßin 'q'): ").strip()
            if q.lower() in {'q','quit','exit'}:
                break
            if not q:
                continue
            try:
                limit = int(input("Ka√ß sonu√ß? (default 5): ") or 5)
            except:
                limit = 5

            if ch=="1":
                results = self.processor.search_semantic(q, limit=limit)
            else:
                daire = input("Daire filtresi (√∂rn: '6.HukukDairesi', bo≈ü = none): ").strip()
                filters = {'daire': daire} if daire else None
                results = self.processor.search_hybrid(q,  limit=limit)
                print(f"SONU√áLAR:{results}")
            if not results:
                print("‚ùå Sonu√ß bulunamadƒ±")
                continue

            print(f"\nüìã {len(results)} sonu√ß:")
            for i, r in enumerate(results, 1):
                p = r.get("payload") or {}   # None ise bo≈ü dict d√∂ner
                score = r.get("score", 0.0)
                print(f"\n{i}. Skor: {score:.4f}")
                print(f"   Daire: {p.get('daire','N/A')} | Tarih: {p.get('tarih','N/A')}")
                text_preview = (p.get('text','')[:300] + '...') if len(p.get('text','')) > 300 else p.get('text','')
                print(f"   Metin: {text_preview}")
                print("-"*60)


def main():
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="bge_hybrid_chunks",
        EMBEDDING_DIM=512,
        BATCH_SIZE=100
    )

    pipeline = YargitayPipeline(config)

    while True:
        print("\n" + "="*60)
        print("üèõÔ∏è YARGITAY BGE-M3 SEMANTƒ∞K Sƒ∞STEM (Dense+Sparse)")
        print("="*60)
        print("1) Tam pipeline √ßalƒ±≈ütƒ±r (CSV -> chunks -> embed -> qdrant)")
        print("2) ƒ∞nteraktif arama")
        print("3) Koleksiyon bilgilerini g√∂ster")
        print("4) √áƒ±kƒ±≈ü")
        choice = input("Se√ßiminiz (1-4): ").strip()
        if choice=="1":
            csv_path = input(f"CSV yolu (enter ile default: {config.CSV_FILE}): ").strip() or config.CSV_FILE
            ok = pipeline.full_pipeline(csv_path)
            print("‚úÖ Tamamlandƒ±" if ok else "‚ùå Hata √ßƒ±ktƒ±")
        elif choice=="2":
            pipeline.interactive_search()
        elif choice=="3":
            info = pipeline.processor.get_collection_info()
            print(json.dumps(info, indent=2, ensure_ascii=False))
        elif choice=="4":
            print("üëã G√∂r√º≈ü√ºr√ºz")
            break
        else:
            print("‚ùå Ge√ßersiz se√ßim")

if __name__=="__main__":
    
    main()

# -

  from .autonotebook import tqdm as notebook_tqdm


True
üîÆ BGE-M3 y√ºkleniyor: BAAI/bge-m3 (device=cuda)


Fetching 30 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:00<00:00, 277156.65it/s]


‚úÖ Hazƒ±r - Cihaz: NVIDIA RTX A6000

üèõÔ∏è YARGITAY BGE-M3 SEMANTƒ∞K Sƒ∞STEM (Dense+Sparse)
1) Tam pipeline √ßalƒ±≈ütƒ±r (CSV -> chunks -> embed -> qdrant)
2) ƒ∞nteraktif arama
3) Koleksiyon bilgilerini g√∂ster
4) √áƒ±kƒ±≈ü
üöÄ Full pipeline ba≈ülƒ±yor


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


‚úÖ Dense embedding boyutu: 1024
üîç Sparse embedding mevcut: True
üóëÔ∏è Eski koleksiyon silindi: bge_hybrid_chunks
‚úÖ Koleksiyon olu≈üturuldu: bge_hybrid_chunks (Dense+Sparse)
üìÑ CSV okunuyor: /home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv
üìä 10 satƒ±r y√ºklendi
  ‚úÖ ƒ∞≈ülenen satƒ±r: 5/10 (Toplam chunk: 44)
  ‚úÖ ƒ∞≈ülenen satƒ±r: 10/10 (Toplam chunk: 59)
üß© Toplam 59 chunk olu≈üturuldu
üöÄ 59 chunk Qdrant'a y√ºkleniyor...
üîÆ 59 metin i≈üleniyor (batch_size=100)...
  üìä Batch i≈ülendi: 59/59
  ‚úÖ Batch y√ºklendi: 59/59
üéâ Y√ºkleme tamamlandƒ±!

üìä Koleksiyon Bilgileri:
{
  "collection_name": "bge_hybrid_chunks",
  "points_count": 59,
  "vectors_count": null,
  "status": "green",
  "embedding_model": "BGE-M3",
  "embedding_dim": 512
}
‚úÖ Tamamlandƒ±

üèõÔ∏è YARGITAY BGE-M3 SEMANTƒ∞K Sƒ∞STEM (Dense+Sparse)
1) Tam pipeline √ßalƒ±≈ütƒ±r (CSV -> chunks -> embed -> qdrant)
2) ƒ∞nteraktif arama
3) Koleksiyon bilgilerini g√∂ster
4) √áƒ±kƒ±≈ü

üîé ƒ∞nteraktif ara

  qr = self.qdrant_client.search(


üìä 5 sonu√ß bulundu (Dense + TF-IDF Sparse)
SONU√áLAR:[{'score': 0.673231, 'payload': {'chunk_id': 1, 'text': 'maddesi, ihtiyati tedbir kararƒ±nƒ±n haksƒ±z olduƒüunun belirlenmesi halinde tedbir kararƒ± y√ºz√ºnden uƒüranƒ±lan zararƒ±n tazminini d√ºzenlediƒüini, ihtiyati tedbir kararƒ±nƒ± icra ettiren tarafƒ±n yasal s√ºrede dava a√ßmamasƒ± halinde ihtiyati tedbirin haksƒ±z konulduƒüunun kabul√º gerektiƒüi, kaldƒ± ki s√ºresinde dava a√ßsa da durumun deƒüi≈ümeyeceƒüini belirterek m√ºvekkillerinin in≈üaatƒ±nƒ±n ge√ß bitirilmesinden kaynaklƒ± 10.000,00 TL maddi tazminatƒ±n tahsiline karar verilmesini talep etmi≈ütir. 2.Davacƒ± vekili duru≈ümadaki beyanƒ±nda; tedbir sebebiyle baƒüƒ±msƒ±z b√∂l√ºmlerinin ge√ß teslim edileceƒüini, bundan kaynaklƒ± doƒüacak zararlarƒ± talep ettiklerini beyan etmi≈ütir. II. CEVAP Davalƒ± vekili cevap dilek√ßesinde √∂zetle; ekonomik nedenlerle 10 g√ºn i√ßerisinde dava a√ßamadƒ±klarƒ±nƒ±, 19 g√ºn sonra a√ßtƒ±klarƒ± davanƒ±n ... 4. Asliye Hukuk Mahkemesinin 2010/2

  qr = self.qdrant_client.search_batch(



üèõÔ∏è YARGITAY BGE-M3 SEMANTƒ∞K Sƒ∞STEM (Dense+Sparse)
1) Tam pipeline √ßalƒ±≈ütƒ±r (CSV -> chunks -> embed -> qdrant)
2) ƒ∞nteraktif arama
3) Koleksiyon bilgilerini g√∂ster
4) √áƒ±kƒ±≈ü
üëã G√∂r√º≈ü√ºr√ºz


### ayrƒ±

In [None]:
# main.py
# SemChunk + BGE-M3 + Qdrant Entegrasyon (Dense + Sparse, 512 dim slice, L2 normalize, hibrit search)

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, HnswConfigDiff
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch
from qdrant_client.models import NamedVector, NamedSparseVector, SparseVectorParams, SparseVector
print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# -

In [None]:

def l2_normalize_tensor(t: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    if t.dim() == 1:
        norm = torch.norm(t).clamp(min=eps)
        return t / norm
    norm = torch.norm(t, dim=1, keepdim=True).clamp(min=eps)
    return t / norm

@dataclass
class Config:
    BGE_MODEL_NAME: str = "BAAI/bge-m3"
    USE_FP16: bool = True
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    TOKEN_SIZE: int = 512
    ENCODING_NAME: str = "cl100k_base"
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "yargitay_bge_m3_chunks"
    EMBEDDING_DIM: int = 512
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    BATCH_SIZE: int = 100
    DB_BATCH: int = 256


In [None]:
class YargitaySemanticProcessor:
    def __init__(self, config: Config):
        self.config = config

        # Encoding & chunker
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # Model
        print(f"üîÆ BGE-M3 y√ºkleniyor: {config.BGE_MODEL_NAME} (device={config.DEVICE})")
        self.bge_model = BGEM3FlagModel(config.BGE_MODEL_NAME, use_fp16=config.USE_FP16, device=config.DEVICE)

        # Qdrant
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"‚úÖ Hazƒ±r - Cihaz: {device_name}")

    # Test connection & print dense+sparse
    def test_bge_connection(self):
        try:
            test_text = ["Yargƒ±tay 6. Hukuk Dairesi'nin ihtiyati tedbir kararƒ±"]
            emb_res = self.bge_model.encode(test_text)
            dense = emb_res['dense_vecs'][0] if isinstance(emb_res, dict) and 'dense_vecs' in emb_res else emb_res[0]
            sparse_available = 'colbert_vecs' in emb_res
            print(f"‚úÖ Dense embedding boyutu: {len(dense)}")
            print(f"üîç Sparse embedding mevcut: {sparse_available}")
            return len(dense)
        except Exception as e:
            print(f"‚ùå BGE-M3 baƒülantƒ± hatasƒ±: {e}")
            return None

    def create_qdrant_collection(self, recreate: bool = False):
        collection_name = self.config.COLLECTION_NAME

        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"üóëÔ∏è Eski koleksiyon silindi: {collection_name}")
            except Exception:
                pass

        existing = [c.name for c in self.qdrant_client.get_collections().collections]
        if collection_name not in existing:
            try:
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config={
                        "dense_vec": VectorParams(
                            size=self.config.EMBEDDING_DIM,
                            distance=Distance.COSINE
                        )
                    },
                    sparse_vectors_config={
                        "sparse_vec": SparseVectorParams(
                            index={"on_disk": False}  # Hibrid search i√ßin gerekli
                        )
                    }
                )
                print(f"‚úÖ Koleksiyon olu≈üturuldu: {collection_name} (Dense+Sparse)")
            except Exception as e:
                print(f"‚ùå Koleksiyon olu≈üturma hatasƒ±: {e}")
                raise
        else:
            print(f"‚ÑπÔ∏è Koleksiyon zaten var: {collection_name}")


    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        if not text or not text.strip():
            return []
        try:
            chunks = self.chunker(text)
            result = []
            for i, c in enumerate(chunks):
                if c.strip():
                    cd = {
                        'chunk_id': i,
                        'text': c.strip(),
                        'token_count': len(self.encoding.encode(c)),
                        'char_count': len(c)
                    }
                    if metadata:
                        cd.update(metadata)
                    result.append(cd)
            return result
        except Exception as e:
            print(f"‚ùå Chunking hatasƒ±: {e}")
            return []

    def create_embeddings_bge(self, texts: List[str], batch_size: int = None):
        batch_size = batch_size or self.config.BATCH_SIZE
        all_embeddings_dense, all_embeddings_sparse = [], []
        total = len(texts)
        print(f"üîÆ {total} metin i≈üleniyor (batch_size={batch_size})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                emb_res = self.bge_model.encode(
                    batch_texts,
                    return_dense=True,
                    return_sparse=True
                )

                dense = emb_res["dense_vecs"]
                sparse = emb_res["sparse_vecs"]   # burada dict listesi geliyor: [{"indices": [...], "values": [...]}, ...]

                # Dense i√ßin normalize
                dense_t = torch.tensor(dense, dtype=torch.float32, device=self.config.DEVICE)
                with torch.no_grad():
                    dense_slice = dense_t[:, :self.config.EMBEDDING_DIM]
                    dense_norm = l2_normalize_tensor(dense_slice)

                all_embeddings_dense.extend([v.cpu().tolist() for v in dense_norm])
                all_embeddings_sparse.extend(sparse)

                print(f"  üìä Batch i≈ülendi: {i + len(batch_texts)}/{total}")

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"‚ùå Embedding hatasƒ± (batch {i//batch_size+1}): {e}")
                all_embeddings_dense.extend([[0.0]*self.config.EMBEDDING_DIM for _ in batch_texts])
                all_embeddings_sparse.extend([{"indices": [], "values": []} for _ in batch_texts])

        return all_embeddings_dense, all_embeddings_sparse


    def process_csv_file(self, csv_path: str) -> List[Dict]:
        print(f"üìÑ CSV okunuyor: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"üìä {len(df)} satƒ±r y√ºklendi")
        except Exception as e:
            print(f"‚ùå CSV okuma hatasƒ±: {e}")
            return []

        text_column = next((c for c in ['rawText', 'chunk_text', 'text', 'content', 'metin'] if c in df.columns), None)
        if not text_column:
            print("‚ùå Ana metin s√ºtunu bulunamadƒ±")
            return []

        all_chunks = []
        for idx, row in df.iterrows():
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            meta = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', ''),
                'document_id': row.get('_id', ''),
            }
            chunks = self.semantic_chunk_text(str(text), meta)
            all_chunks.extend(chunks)
            if (idx+1)%5==0:
                print(f"  ‚úÖ ƒ∞≈ülenen satƒ±r: {idx+1}/{len(df)} (Toplam chunk: {len(all_chunks)})")

        print(f"üß© Toplam {len(all_chunks)} chunk olu≈üturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        if not chunks:
            print("‚ùå Y√ºklenecek chunk yok")
            return

        print(f"üöÄ {len(chunks)} chunk Qdrant'a y√ºkleniyor...")
        texts = [c['text'] for c in chunks]
        embeddings_dense, embeddings_sparse = self.create_embeddings_bge(texts)

        points = []
        
        for c, d, s in zip(chunks, embeddings_dense, embeddings_sparse):
            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector={
                    "dense_vec": d,
                    "sparse_vec": SparseVector(
                        indices=s["indices"],
                        values=s["values"]
                    )
                },
                payload=c,
            ))


        batch = self.config.DB_BATCH
        for i in range(0, len(points), batch):
            try:
                self.qdrant_client.upsert(collection_name=self.config.COLLECTION_NAME, points=points[i:i+batch])
                print(f"  ‚úÖ Batch y√ºklendi: {min(i+batch,len(points))}/{len(points)}")
            except Exception as e:
                print(f"‚ùå Batch y√ºkleme hatasƒ±: {e}")

        print("üéâ Y√ºkleme tamamlandƒ±!")

    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = None):
        try:
            emb_res = self.bge_model.encode([query])

            # Dense
            dense_q = emb_res.get('dense_vecs', emb_res)
            dense_t = torch.tensor(dense_q, dtype=torch.float32, device=self.config.DEVICE)
            dense_norm = l2_normalize_tensor(dense_t[:, :self.config.EMBEDDING_DIM])
            query_vector_dense = NamedVector(
                name="dense_vec",
                vector=dense_norm[0].cpu().tolist()
            )

            # Sparse
            colbert_vec = emb_res.get("colbert_vecs", [None])[0]
            if colbert_vec is None:
                query_vector_sparse = None
            else:
                indices = list(colbert_vec.keys())
                values = list(colbert_vec.values())
                query_vector_sparse = NamedSparseVector(
                    name="sparse_vec",
                    vector=SparseVector(indices=indices, values=values)
                )

            # Tek sorguda hem dense hem sparse
            query_vectors = [query_vector_dense]
            if query_vector_sparse:
                query_vectors.append(query_vector_sparse)

            qr = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vectors,
                query_filter=None,
                limit=limit,
                with_payload=True,
                with_vectors=False,
                score_threshold=score_threshold
            )

            results = [{'score': p.score, 'payload': p.payload} for p in qr]
            print(f"üìä {len(results)} sonu√ß bulundu")
            return results
        except Exception as e:
            print(f"‚ùå Arama hatasƒ±: {e}")
            return []


    def advanced_search_with_filters(self, query: str, filters: Dict = None, limit: int = 10, score_threshold: float = None):
        try:
            emb_res = self.bge_model.encode([query])
            dense_q = emb_res.get('dense_vecs', emb_res)
            sparse_q = emb_res.get('colbert_vecs', [np.zeros(self.config.EMBEDDING_DIM)])

            dense_t = torch.tensor(dense_q, dtype=torch.float32, device=self.config.DEVICE)
            sparse_t = torch.tensor(sparse_q, dtype=torch.float32, device=self.config.DEVICE)

            with torch.no_grad():
                dense_norm = l2_normalize_tensor(dense_t[:, :self.config.EMBEDDING_DIM])
                sparse_norm = l2_normalize_tensor(sparse_t[:, :self.config.EMBEDDING_DIM])

            query_vector = dense_norm[0].cpu().tolist()
            query_sparse = sparse_norm[0].cpu().tolist()

            query_filter = None
            if filters:
                from qdrant_client.models import Filter, FieldCondition, MatchValue
                conditions = [FieldCondition(key=k, match=MatchValue(value=v)) for k,v in filters.items()]
                query_filter = Filter(must=conditions)

            qr = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vector,
                query_filter=query_filter,
                limit=limit,
                with_payload=True,
                params={"hnsw_ef": 128},
                vector_name="dense_vec",
                query_vector_sparse=query_sparse,
                vector_name_sparse="sparse_vec",
                score_threshold=score_threshold
            )

            results = [{'score': p.score, 'payload': p.payload} for p in qr]
            print(f"üìä {len(results)} filtreli sonu√ß bulundu")
            return results
        except Exception as e:
            print(f"‚ùå Filtreli arama hatasƒ±: {e}")
            return []

    def get_collection_info(self):
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                "vectors_count": info.vectors_count,
                "status": info.status,
                "embedding_model": "BGE-M3",
                "embedding_dim": self.config.EMBEDDING_DIM
            }
        except Exception as e:
            return {"error": str(e)}



In [None]:

class YargitayPipeline:
    def __init__(self, config: Config):
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        csv_path = csv_path or self.config.CSV_FILE
        print("üöÄ Full pipeline ba≈ülƒ±yor")
        emb_dim = self.processor.test_bge_connection()
        if not emb_dim:
            return False
        self.processor.create_qdrant_collection(recreate=True)
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("‚ùå Chunk bulunamadƒ±")
            return False
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        print("\nüìä Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    def interactive_search(self):
        print("\nüîé ƒ∞nteraktif arama ba≈ülatƒ±ldƒ±")
        while True:
            print("\n1) Basit arama\n2) Filtreli arama\n3) Ana men√º")
            ch = input("Se√ßiminiz (1-3): ").strip()
            if ch=="3":
                break
            if ch not in {"1","2"}:
                print("‚ùå Ge√ßersiz se√ßim")
                continue
            q = input("üîç Arama metni (√ßƒ±kmak i√ßin 'q'): ").strip()
            if q.lower() in {'q','quit','exit'}:
                break
            if not q:
                continue
            try:
                limit = int(input("Ka√ß sonu√ß? (default 5): ") or 5)
            except:
                limit = 5

            if ch=="1":
                results = self.processor.search_semantic(q, limit=limit)
            else:
                daire = input("Daire filtresi (√∂rn: '6.HukukDairesi', bo≈ü = none): ").strip()
                filters = {'daire': daire} if daire else None
                results = self.processor.advanced_search_with_filters(q, filters=filters, limit=limit)

            if not results:
                print("‚ùå Sonu√ß bulunamadƒ±")
                continue

            print(f"\nüìã {len(results)} sonu√ß:")
            for i,r in enumerate(results,1):
                p=r['payload']
                print(f"\n{i}. Skor: {r['score']:.4f}")
                print(f"   Esas No: {p.get('esas_no','N/A')} | Karar No: {p.get('karar_no','N/A')}")
                print(f"   Daire: {p.get('daire','N/A')} | Tarih: {p.get('tarih','N/A')}")
                text_preview = (p.get('text','')[:300] + '...') if len(p.get('text',''))>300 else p.get('text','')
                print(f"   Metin: {text_preview}")
                print("-"*60)



In [None]:
def main():
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="bge_hybrid_chunks",
        EMBEDDING_DIM=512,
        BATCH_SIZE=100
    )

    pipeline = YargitayPipeline(config)

    while True:
        print("\n" + "="*60)
        print("üèõÔ∏è YARGITAY BGE-M3 SEMANTƒ∞K Sƒ∞STEM (Dense+Sparse)")
        print("="*60)
        print("1) Tam pipeline √ßalƒ±≈ütƒ±r (CSV -> chunks -> embed -> qdrant)")
        print("2) ƒ∞nteraktif arama")
        print("3) Koleksiyon bilgilerini g√∂ster")
        print("4) √áƒ±kƒ±≈ü")
        choice = input("Se√ßiminiz (1-4): ").strip()
        if choice=="1":
            csv_path = input(f"CSV yolu (enter ile default: {config.CSV_FILE}): ").strip() or config.CSV_FILE
            ok = pipeline.full_pipeline(csv_path)
            print("‚úÖ Tamamlandƒ±" if ok else "‚ùå Hata √ßƒ±ktƒ±")
        elif choice=="2":
            pipeline.interactive_search()
        elif choice=="3":
            info = pipeline.processor.get_collection_info()
            print(json.dumps(info, indent=2, ensure_ascii=False))
        elif choice=="4":
            print("üëã G√∂r√º≈ü√ºr√ºz")
            break
        else:
            print("‚ùå Ge√ßersiz se√ßim")

if __name__=="__main__":
    
    main()


### BOZUK

In [None]:
# main.py
# BGE-M3 + Qdrant Hybrid Search (Dense + Sparse Vectors)

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import (
    VectorParams, Distance, PointStruct, 
    SparseVectorParams, SparseIndexParams,
    NamedVector,
    Filter, FieldCondition, MatchValue
)
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict, Any, Tuple
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch

print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# -------------------------
# Helper: normalize tensor rows (L2)
# -------------------------
def l2_normalize_tensor(t: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    """L2 normalize tensor for cosine similarity"""
    if t.dim() == 1:
        norm = torch.norm(t).clamp(min=eps)
        return t / norm
    norm = torch.norm(t, dim=1, keepdim=True).clamp(min=eps)
    return t / norm

def convert_sparse_to_qdrant_format(sparse_vecs: List[Dict]) -> List[Dict]:
    """Convert BGE-M3 sparse format to Qdrant sparse format"""
    result = []
    for sparse_vec in sparse_vecs:
        if isinstance(sparse_vec, dict):
            indices = list(sparse_vec.keys())
            values = list(sparse_vec.values())
        else:
            # If it's already in indices/values format
            indices = sparse_vec.get('indices', [])
            values = sparse_vec.get('values', [])
        
        result.append({
            'indices': [int(idx) for idx in indices],
            'values': [float(val) for val in values]
        })
    return result

@dataclass
class Config:
    BGE_MODEL_NAME: str = "BAAI/bge-m3"
    USE_FP16: bool = True
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    TOKEN_SIZE: int = 512
    ENCODING_NAME: str = "cl100k_base"
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "yargitay_hybrid_search"
    DENSE_DIM: int = 512  # BGE-M3 dense vector dimension
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    BATCH_SIZE: int = 32
    DB_BATCH: int = 64
    # Hybrid search weights
    DENSE_WEIGHT: float = 0.7
    SPARSE_WEIGHT: float = 0.3

class YargitayHybridProcessor:
    def __init__(self, config: Config):
        self.config = config

        # Encoding & chunker
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # BGE-M3 Model
        print(f"üîÆ BGE-M3 y√ºkleniyor: {config.BGE_MODEL_NAME} (device={config.DEVICE})")
        self.bge_model = BGEM3FlagModel(
            config.BGE_MODEL_NAME, 
            use_fp16=config.USE_FP16, 
            device=config.DEVICE
        )

        # Qdrant
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"‚úÖ Hazƒ±r - Cihaz: {device_name}")

    def test_bge_connection(self) -> bool:
        """Test BGE-M3 connection and show embedding dimensions"""
        try:
            test_text = ["Yargƒ±tay 6. Hukuk Dairesi'nin ihtiyati tedbir kararƒ±"]
            embeddings = self.bge_model.encode(
                test_text,
                return_dense=True,
                return_sparse=True,
                return_colbert_vecs=False
            )
            
            dense = embeddings['dense_vecs']
            sparse = embeddings['lexical_weights']
            
            print(f"‚úÖ BGE-M3 test ba≈üarƒ±lƒ±")
            print(f"üìä Dense embedding boyutu: {len(dense[0])}")
            print(f"üìä Sparse embedding token sayƒ±sƒ±: {len(sparse[0])}")
            print(f"üîç Dense sample: {dense[0][:5]}...")
            print(f"üîç Sparse sample keys: {list(sparse[0].keys())[:5]}...")
            
            return True
        except Exception as e:
            print(f"‚ùå BGE-M3 baƒülantƒ± hatasƒ±: {e}")
            return False

    def create_qdrant_collection(self, recreate: bool = False):
        """Create Qdrant collection with hybrid vector support"""
        collection_name = self.config.COLLECTION_NAME
        
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"üóëÔ∏è Eski koleksiyon silindi: {collection_name}")
            except Exception:
                pass

        try:
            existing = [c.name for c in self.qdrant_client.get_collections().collections]
            if collection_name not in existing:
                # Create collection with named vectors (dense + sparse)
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config={
                        "dense": VectorParams(
                            size=self.config.DENSE_DIM,
                            distance=Distance.COSINE
                        )
                    },
                    sparse_vectors_config={
                        "sparse": SparseVectorParams(
                            index=SparseIndexParams()
                        )
                    }
                )
                print(f"‚úÖ Hybrid koleksiyon olu≈üturuldu: {collection_name}")
                print(f"   Dense boyut: {self.config.DENSE_DIM}")
                print(f"   Sparse: Aktif")
            else:
                print(f"‚ÑπÔ∏è Koleksiyon zaten var: {collection_name}")
        except Exception as e:
            print(f"‚ùå Koleksiyon olu≈üturma hatasƒ±: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Chunk text using semantic chunking"""
        if not text or not text.strip():
            return []
        try:
            chunks = self.chunker(text)
            result = []
            for i, c in enumerate(chunks):
                if c.strip():
                    cd = {
                        'chunk_id': i,
                        'text': c.strip(),
                        'token_count': len(self.encoding.encode(c)),
                        'char_count': len(c)
                    }
                    if metadata:
                        cd.update(metadata)
                    result.append(cd)
            return result
        except Exception as e:
            print(f"‚ùå Chunking hatasƒ±: {e}")
            return []

    def create_hybrid_embeddings(self, texts: List[str], batch_size: int = None) -> Tuple[List[List[float]], List[Dict]]:
        """Create both dense and sparse embeddings using BGE-M3"""
        batch_size = batch_size or self.config.BATCH_SIZE
        all_dense: List[List[float]] = []
        all_sparse: List[Dict] = []
        total = len(texts)
        
        print(f"üîÆ BGE-M3 ile hybrid embedding olu≈üturuluyor: {total} metin (batch_size={batch_size})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                # Get both dense and sparse embeddings
                embeddings = self.bge_model.encode(
                    batch_texts,
                    return_dense=True,
                    return_sparse=True,
                    return_colbert_vecs=False
                )
                
                dense_vecs = embeddings['dense_vecs']
                sparse_vecs = embeddings['lexical_weights']
                
                # Process dense vectors (normalize)
                if not isinstance(dense_vecs, torch.Tensor):
                    dense_t = torch.tensor(dense_vecs, dtype=torch.float32, device=self.config.DEVICE)
                else:
                    dense_t = dense_vecs.to(self.config.DEVICE)
                
                with torch.no_grad():
                    normed_dense = l2_normalize_tensor(dense_t)
                
                # Convert to lists
                batch_dense = [v.cpu().tolist() for v in normed_dense]
                all_dense.extend(batch_dense)
                
                # Process sparse vectors
                batch_sparse = convert_sparse_to_qdrant_format(sparse_vecs)
                all_sparse.extend(batch_sparse)

                print(f"  üìä Batch i≈ülendi: {i + len(batch_texts)}/{total}")

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except Exception as e:
                print(f"‚ùå Hybrid embedding hatasƒ± (batch {i//batch_size + 1}): {e}")
                # Fallback zero vectors
                fallback_dense = [[0.0] * self.config.DENSE_DIM for _ in batch_texts]
                fallback_sparse = [{'indices': [], 'values': []} for _ in batch_texts]
                all_dense.extend(fallback_dense)
                all_sparse.extend(fallback_sparse)

        print(f"‚úÖ Hybrid embeddings olu≈üturuldu: {len(all_dense)} dense, {len(all_sparse)} sparse")
        
        # Debug: verify dense vector dimensions
        if all_dense:
            sample_dense_dim = len(all_dense[0])
            print(f"üîç Dense vector boyutu kontrol√º: {sample_dense_dim} (hedef: {self.config.DENSE_DIM})")
            if sample_dense_dim != self.config.DENSE_DIM:
                print(f"‚ùå Boyut uyumsuzluƒüu tespit edildi!")
        
        return all_dense, all_sparse

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """Process CSV file and create chunks"""
        print(f"üìÑ CSV okunuyor: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"üìä {len(df)} satƒ±r y√ºklendi")
        except Exception as e:
            print(f"‚ùå CSV okuma hatasƒ±: {e}")
            return []

        text_column = next((c for c in ['rawText', 'chunk_text', 'text', 'content', 'metin'] if c in df.columns), None)
        if not text_column:
            print("‚ùå Ana metin s√ºtunu bulunamadƒ±")
            return []

        all_chunks = []
        for idx, row in df.iterrows():
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            
            meta = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', ''),
                'document_id': row.get('_id', ''),
            }
            
            chunks = self.semantic_chunk_text(str(text), meta)
            all_chunks.extend(chunks)
            
            if (idx + 1) % 5 == 0:
                print(f"  ‚úÖ ƒ∞≈ülenen satƒ±r: {idx + 1}/{len(df)} (Toplam chunk: {len(all_chunks)})")

        print(f"üß© Toplam {len(all_chunks)} chunk olu≈üturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        """Upload chunks with hybrid embeddings to Qdrant"""
        if not chunks:
            print("‚ùå Y√ºklenecek chunk yok")
            return

        print(f"üöÄ {len(chunks)} chunk hybrid embedding ile Qdrant'a y√ºkleniyor...")
        texts = [c['text'] for c in chunks]
        dense_embeddings, sparse_embeddings = self.create_hybrid_embeddings(texts)

        if len(dense_embeddings) != len(chunks) or len(sparse_embeddings) != len(chunks):
            print(f"‚ùå Embedding sayƒ±sƒ± uyumsuz")
            return

        points = []
        for i, (chunk, dense, sparse) in enumerate(zip(chunks, dense_embeddings, sparse_embeddings)):
            # Create vectors dictionary with named vectors
            vectors = {"dense": dense}
            
            # Only add sparse if it has data
            if sparse['indices'] and sparse['values']:
                vectors["sparse"] = {
                    "indices": sparse['indices'],
                    "values": sparse['values']
                }
            
            point = PointStruct(
                id=str(uuid.uuid4()),
                vector=vectors,
                payload=chunk
            )
            points.append(point)

        # Upload in batches
        batch_size = self.config.DB_BATCH
        for i in range(0, len(points), batch_size):
            try:
                batch_points = points[i:i+batch_size]
                self.qdrant_client.upsert(
                    collection_name=self.config.COLLECTION_NAME,
                    points=batch_points
                )
                print(f"  ‚úÖ Batch y√ºklendi: {min(i+batch_size, len(points))}/{len(points)}")
            except Exception as e:
                print(f"‚ùå Batch y√ºkleme hatasƒ±: {e}")

        print("üéâ Hybrid y√ºkleme tamamlandƒ±!")

    from qdrant_client.models import SearchRequest, NamedVector, NamedSparseVector, SparseVector

def hybrid_search(self, query: str, limit: int = 10, score_threshold: float = None, 
                  dense_weight: float = None, sparse_weight: float = None) -> List[Dict]:
    
    try:
        # Kullanƒ±lacak aƒüƒ±rlƒ±klarƒ± al
        dense_weight = dense_weight or self.config.DENSE_WEIGHT
        sparse_weight = sparse_weight or self.config.SPARSE_WEIGHT

        print(f"üîç Hybrid arama: dense_weight={dense_weight}, sparse_weight={sparse_weight}")

        # Query embeddings
        embeddings = self.bge_model.encode(
            [query],
            return_dense=True,
            return_sparse=True,
            return_colbert_vecs=False
        )
        query_dense = embeddings['dense_vecs'][0]
        query_sparse = embeddings['lexical_weights'][0]

        # Dense vector normalize
        query_dense_t = torch.tensor(query_dense, dtype=torch.float32, device=self.config.DEVICE)
        with torch.no_grad():
            query_dense_norm = l2_normalize_tensor(query_dense_t).cpu().tolist()

        # Sparse vector Qdrant format
        query_sparse_qdrant = SparseVector(
            indices=[int(idx) for idx in query_sparse.keys()],
            values=[float(val) for val in query_sparse.values()]
        )

        # Search batch
        search_requests = [
            SearchRequest(
                vector=NamedVector(
                    name="dense",
                    vector=query_dense_norm
                ),
                limit=limit * 2,
                score_threshold=score_threshold,
                with_payload=True
            ),
            SearchRequest(
                vector=NamedSparseVector(
                    name="sparse",
                    vector=query_sparse_qdrant
                ),
                limit=limit * 2,
                score_threshold=score_threshold,
                with_payload=True
            )
        ]

        results = self.qdrant_client.search_batch(
            collection_name=self.config.COLLECTION_NAME,
            requests=search_requests
        )

        dense_results = results[0]
        sparse_results = results[1]

        # Combine results with weights
        combined_scores = {}
        for r in dense_results:
            combined_scores[r.id] = {
                'dense_score': r.score * dense_weight,
                'sparse_score': 0,
                'payload': r.payload
            }
        for r in sparse_results:
            if r.id in combined_scores:
                combined_scores[r.id]['sparse_score'] = r.score * sparse_weight
            else:
                combined_scores[r.id] = {
                    'dense_score': 0,
                    'sparse_score': r.score * sparse_weight,
                    'payload': r.payload
                }

        final_results = []
        for point_id, scores in combined_scores.items():
            final_score = scores['dense_score'] + scores['sparse_score']
            final_results.append({
                'score': final_score,
                'dense_score': scores['dense_score'],
                'sparse_score': scores['sparse_score'],
                'payload': scores['payload']
            })

        # Sort by final score and limit
        final_results.sort(key=lambda x: x['score'], reverse=True)
        return final_results[:limit]

    except Exception as e:
        print(f"‚ùå Hybrid arama hatasƒ±: {e}")
        import traceback
        traceback.print_exc()
        return []

    def search_dense_only(self, query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """Search using only dense vectors"""
        try:
            query_embeddings = self.bge_model.encode([query], return_dense=True, return_sparse=False)
            query_dense = query_embeddings['dense_vecs'][0]
            
            # Normalize
            query_dense_t = torch.tensor(query_dense, dtype=torch.float32, device=self.config.DEVICE)
            with torch.no_grad():
                query_dense_norm = l2_normalize_tensor(query_dense_t).cpu().tolist()
            
            results = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=("dense", query_dense_norm),
                limit=limit,
                score_threshold=score_threshold
            )
            
            return [{'score': r.score, 'payload': r.payload} for r in results]
            
        except Exception as e:
            print(f"‚ùå Dense arama hatasƒ±: {e}")
            return []

    def search_sparse_only(self, query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """Search using only sparse vectors"""
        try:
            query_embeddings = self.bge_model.encode([query], return_dense=False, return_sparse=True)
            query_sparse = query_embeddings['lexical_weights'][0]
            
            query_sparse_qdrant = {
                "name": "sparse",
                "indices": [int(idx) for idx in query_sparse.keys()],
                "values": [float(val) for val in query_sparse.values()]
            }
            
            results = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_sparse_qdrant,
                limit=limit,
                score_threshold=score_threshold
            )
            
            return [{'score': r.score, 'payload': r.payload} for r in results]
            
        except Exception as e:
            print(f"‚ùå Sparse arama hatasƒ±: {e}")
            return []

    def get_collection_info(self) -> dict:
        """Get collection information"""
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                "vectors_count": info.vectors_count,
                "status": info.status,
                "embedding_model": "BGE-M3 Hybrid",
                "dense_dim": self.config.DENSE_DIM,
                "sparse_enabled": True,
                "dense_weight": self.config.DENSE_WEIGHT,
                "sparse_weight": self.config.SPARSE_WEIGHT
            }
        except Exception as e:
            return {"error": str(e)}

# -------------------------
# Pipeline
# -------------------------
class YargitayHybridPipeline:
    def __init__(self, config: Config):
        self.processor = YargitayHybridProcessor(config)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        """Run full pipeline: CSV -> chunks -> hybrid embeddings -> Qdrant"""
        csv_path = csv_path or self.config.CSV_FILE
        print("üöÄ Hybrid pipeline ba≈ülƒ±yor")
        
        if not self.processor.test_bge_connection():
            return False
        
        self.processor.create_qdrant_collection(recreate=True)
        chunks = self.processor.process_csv_file(csv_path)
        
        if not chunks:
            print("‚ùå Chunk bulunamadƒ±")
            return False
        
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        
        print("\nüìä Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    def interactive_search(self):
        """Interactive search interface"""
        print("\nüîé ƒ∞nteraktif hybrid arama ba≈ülatƒ±ldƒ±")
        
        while True:
            print("\n" + "="*50)
            print("1) Hybrid arama (Dense + Sparse)")
            print("2) Sadece Dense arama")
            print("3) Sadece Sparse arama")
            print("4) Hybrid aƒüƒ±rlƒ±k ayarlarƒ±")
            print("5) Ana men√º")
            
            choice = input("Se√ßiminiz (1-5): ").strip()
            
            if choice == "5":
                break
            
            if choice == "4":
                try:
                    dense_w = float(input(f"Dense aƒüƒ±rlƒ±k (mevcut: {self.config.DENSE_WEIGHT}): ") or self.config.DENSE_WEIGHT)
                    sparse_w = float(input(f"Sparse aƒüƒ±rlƒ±k (mevcut: {self.config.SPARSE_WEIGHT}): ") or self.config.SPARSE_WEIGHT)
                    self.config.DENSE_WEIGHT = dense_w
                    self.config.SPARSE_WEIGHT = sparse_w
                    print(f"‚úÖ Aƒüƒ±rlƒ±klar g√ºncellendi: Dense={dense_w}, Sparse={sparse_w}")
                except ValueError:
                    print("‚ùå Ge√ßersiz deƒüer")
                continue
            
            if choice not in {"1", "2", "3"}:
                print("‚ùå Ge√ßersiz se√ßim")
                continue
            
            query = input("üîç Arama metni (√ßƒ±kmak i√ßin 'q'): ").strip()
            if query.lower() in {'q', 'quit', 'exit'}:
                break
            
            if not query:
                continue
            
            try:
                limit = int(input("Ka√ß sonu√ß? (default 5): ") or 5)
            except:
                limit = 5
            
            # Perform search based on choice
            if choice == "1":
                results = self.processor.hybrid_search(query, limit=limit)
            elif choice == "2":
                results = self.processor.search_dense_only(query, limit=limit)
            elif choice == "3":
                results = self.processor.search_sparse_only(query, limit=limit)
            
            if not results:
                print("‚ùå Sonu√ß bulunamadƒ±")
                continue

            print(f"\nüìã {len(results)} sonu√ß ({['Hybrid', 'Dense Only', 'Sparse Only'][int(choice)-1]} arama):")
            
            for i, r in enumerate(results, 1):
                p = r['payload']
                print(f"\n{i}. Skor: {r['score']:.4f}")
                
                # Show component scores for hybrid search
                if choice == "1" and 'dense_score' in r and 'sparse_score' in r:
                    print(f"   (Dense: {r['dense_score']:.4f}, Sparse: {r['sparse_score']:.4f})")
                
                print(f"   Esas No: {p.get('esas_no','N/A')} | Karar No: {p.get('karar_no','N/A')}")
                print(f"   Daire: {p.get('daire','N/A')} | Tarih: {p.get('tarih','N/A')}")
                
                text_preview = (p.get('text','')[:300] + '...') if len(p.get('text','')) > 300 else p.get('text','')
                print(f"   Metin: {text_preview}")
                print("-"*60)

def main():
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="yargitay_hybrid_search",
        DENSE_DIM=512,  # 512-dimensional dense vectors
        BATCH_SIZE=32,
        DB_BATCH=64,
        DENSE_WEIGHT=0.7,
        SPARSE_WEIGHT=0.3
    )

    pipeline = YargitayHybridPipeline(config)

    while True:
        print("\n" + "="*60)
        print("üèõÔ∏è YARGITAY BGE-M3 HYBRID SEARCH Sƒ∞STEMƒ∞")
        print("="*60)
        print("1) Tam pipeline √ßalƒ±≈ütƒ±r (CSV -> chunks -> hybrid embed -> qdrant)")
        print("2) ƒ∞nteraktif arama")
        print("3) Koleksiyon bilgilerini g√∂ster")
        print("4) √áƒ±kƒ±≈ü")
        
        choice = input("Se√ßiminiz (1-4): ").strip()
        
        if choice == "1":
            csv_path = input(f"CSV yolu (enter ile default: {config.CSV_FILE}): ").strip() or config.CSV_FILE
            success = pipeline.full_pipeline(csv_path)
            print("‚úÖ Tamamlandƒ±" if success else "‚ùå Hata olu≈ütu")
        
        elif choice == "2":
            pipeline.interactive_search()
        
        elif choice == "3":
            info = pipeline.processor.get_collection_info()
            print(json.dumps(info, indent=2, ensure_ascii=False))
        
        elif choice == "4":
            print("üëã G√∂r√º≈ü√ºr√ºz")
            break
        
        else:
            print("‚ùå Ge√ßersiz se√ßim")

if __name__ == "__main__":
    try:
        from FlagEmbedding import BGEM3FlagModel
        print("‚úÖ FlagEmbedding y√ºkl√º")
    except ImportError:
        print("‚ùå FlagEmbedding bulunamadƒ± ‚Äî pip install FlagEmbedding")
        raise SystemExit(1)
    
    main()

### Hƒ∞BRƒ∞T

In [None]:
# main.py
# SemChunk + BGE-M3 + Qdrant Entegrasyon (Dense + Sparse, 512 dim slice, L2 normalize, hibrit search)

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, HnswConfigDiff
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch

print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# -------------------------
# Helper: normalize tensor rows (L2)
# -------------------------
def l2_normalize_tensor(t: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    if t.dim() == 1:
        norm = torch.norm(t).clamp(min=eps)
        return t / norm
    norm = torch.norm(t, dim=1, keepdim=True).clamp(min=eps)
    return t / norm

@dataclass
class Config:
    BGE_MODEL_NAME: str = "BAAI/bge-m3"
    USE_FP16: bool = True
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    TOKEN_SIZE: int = 512
    ENCODING_NAME: str = "cl100k_base"
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "yargitay_bge_m3_chunks"
    EMBEDDING_DIM: int = 512
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    BATCH_SIZE: int = 100
    DB_BATCH: int = 256

class YargitaySemanticProcessor:
    def __init__(self, config: Config):
        self.config = config

        # Encoding & chunker
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # Model
        print(f"üîÆ BGE-M3 y√ºkleniyor: {config.BGE_MODEL_NAME} (device={config.DEVICE})")
        self.bge_model = BGEM3FlagModel(config.BGE_MODEL_NAME, use_fp16=config.USE_FP16, device=config.DEVICE)

        # Qdrant
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"‚úÖ Hazƒ±r - Cihaz: {device_name}")

    # Test connection & print dense+sparse
    def test_bge_connection(self):
        try:
            test_text = ["Yargƒ±tay 6. Hukuk Dairesi'nin ihtiyati tedbir kararƒ±"]
            emb_res = self.bge_model.encode(test_text)
            dense = emb_res['dense_vecs'][0] if isinstance(emb_res, dict) and 'dense_vecs' in emb_res else emb_res[0]
            sparse_available = 'colbert_vecs' in emb_res
            print(f"‚úÖ Dense embedding boyutu: {len(dense)}")
            print(f"üîç Sparse embedding mevcut: {sparse_available}")
            return len(dense)
        except Exception as e:
            print(f"‚ùå BGE-M3 baƒülantƒ± hatasƒ±: {e}")
            return None

    def create_qdrant_collection(self, recreate: bool = False):
        collection_name = self.config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"üóëÔ∏è Eski koleksiyon silindi: {collection_name}")
            except Exception:
                pass

        try:
            existing = [c.name for c in self.qdrant_client.get_collections().collections]
            if collection_name not in existing:
                # Dense + Sparse (sparse i√ßin yine 512 dim)
                vectors_config = {
                    "dense_vec": VectorParams(size=self.config.EMBEDDING_DIM, distance=Distance.COSINE),
                    "sparse_vec": VectorParams(size=self.config.EMBEDDING_DIM, distance=Distance.COSINE)
                }
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=vectors_config
                )
                print(f"‚úÖ Koleksiyon olu≈üturuldu: {collection_name} (Dense+Sparse)")
            else:
                print(f"‚ÑπÔ∏è Koleksiyon zaten var: {collection_name}")
        except Exception as e:
            print(f"‚ùå Koleksiyon olu≈üturma hatasƒ±: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        if not text or not text.strip():
            return []
        try:
            chunks = self.chunker(text)
            result = []
            for i, c in enumerate(chunks):
                if c.strip():
                    cd = {
                        'chunk_id': i,
                        'text': c.strip(),
                        'token_count': len(self.encoding.encode(c)),
                        'char_count': len(c)
                    }
                    if metadata:
                        cd.update(metadata)
                    result.append(cd)
            return result
        except Exception as e:
            print(f"‚ùå Chunking hatasƒ±: {e}")
            return []

    def create_embeddings_bge(self, texts: List[str], batch_size: int = None):
        batch_size = batch_size or self.config.BATCH_SIZE
        all_embeddings_dense, all_embeddings_sparse = [], []
        total = len(texts)
        print(f"üîÆ {total} metin i≈üleniyor (batch_size={batch_size})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                emb_res = self.bge_model.encode(batch_texts)

                dense = emb_res.get('dense_vecs', emb_res)
                sparse = emb_res.get('colbert_vecs', [np.zeros(self.config.EMBEDDING_DIM)]*len(batch_texts))

                dense_t = torch.tensor(dense, dtype=torch.float32, device=self.config.DEVICE)
                sparse_t = torch.tensor(sparse, dtype=torch.float32, device=self.config.DEVICE)

                with torch.no_grad():
                    dense_slice = dense_t[:, :self.config.EMBEDDING_DIM]
                    dense_norm = l2_normalize_tensor(dense_slice)
                    sparse_slice = sparse_t[:, :self.config.EMBEDDING_DIM]
                    sparse_norm = l2_normalize_tensor(sparse_slice)

                all_embeddings_dense.extend([v.cpu().tolist() for v in dense_norm])
                all_embeddings_sparse.extend([v.cpu().tolist() for v in sparse_norm])

                print(f"  üìä Batch i≈ülendi: {i + len(batch_texts)}/{total}")

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"‚ùå Embedding hatasƒ± (batch {i//batch_size+1}): {e}")
                all_embeddings_dense.extend([[0.0]*self.config.EMBEDDING_DIM for _ in batch_texts])
                all_embeddings_sparse.extend([[0.0]*self.config.EMBEDDING_DIM for _ in batch_texts])

        return all_embeddings_dense, all_embeddings_sparse

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        print(f"üìÑ CSV okunuyor: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"üìä {len(df)} satƒ±r y√ºklendi")
        except Exception as e:
            print(f"‚ùå CSV okuma hatasƒ±: {e}")
            return []

        text_column = next((c for c in ['rawText', 'chunk_text', 'text', 'content', 'metin'] if c in df.columns), None)
        if not text_column:
            print("‚ùå Ana metin s√ºtunu bulunamadƒ±")
            return []

        all_chunks = []
        for idx, row in df.iterrows():
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            meta = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', ''),
                'document_id': row.get('_id', ''),
            }
            chunks = self.semantic_chunk_text(str(text), meta)
            all_chunks.extend(chunks)
            if (idx+1)%5==0:
                print(f"  ‚úÖ ƒ∞≈ülenen satƒ±r: {idx+1}/{len(df)} (Toplam chunk: {len(all_chunks)})")

        print(f"üß© Toplam {len(all_chunks)} chunk olu≈üturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        if not chunks:
            print("‚ùå Y√ºklenecek chunk yok")
            return

        print(f"üöÄ {len(chunks)} chunk Qdrant'a y√ºkleniyor...")
        texts = [c['text'] for c in chunks]
        embeddings_dense, embeddings_sparse = self.create_embeddings_bge(texts)

        points = []
        
        for c, d, s in zip(chunks, embeddings_dense, embeddings_sparse):
            vectors={'dense_vec': d, 'sparse_vec': s} 
            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector=vectors,
                payload=c,
            ))

        batch = self.config.DB_BATCH
        for i in range(0, len(points), batch):
            try:
                self.qdrant_client.upsert(collection_name=self.config.COLLECTION_NAME, points=points[i:i+batch])
                print(f"  ‚úÖ Batch y√ºklendi: {min(i+batch,len(points))}/{len(points)}")
            except Exception as e:
                print(f"‚ùå Batch y√ºkleme hatasƒ±: {e}")

        print("üéâ Y√ºkleme tamamlandƒ±!")

    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = None):
        try:
            emb_res = self.bge_model.encode([query])
            dense_q = emb_res.get('dense_vecs', emb_res)
            if dense_q is None or any(v is None for v in dense_q[0]):
                dense_q = [np.zeros(self.config.EMBEDDING_DIM)]

            sparse_q = emb_res.get('colbert_vecs', [np.zeros(self.config.EMBEDDING_DIM)])
            if sparse_q is None or any(v is None for v in sparse_q[0]):
                sparse_q = [np.zeros(self.config.EMBEDDING_DIM)]


            dense_t = torch.tensor(dense_q, dtype=torch.float32, device=self.config.DEVICE)
            sparse_t = torch.tensor(sparse_q, dtype=torch.float32, device=self.config.DEVICE)

            with torch.no_grad():
                dense_norm = l2_normalize_tensor(dense_t[:, :self.config.EMBEDDING_DIM])
                sparse_norm = l2_normalize_tensor(sparse_t[:, :self.config.EMBEDDING_DIM])

            query_vector = dense_norm[0].cpu().tolist()
            query_sparse = sparse_norm[0].cpu().tolist()

            qr = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vector,
                query_filter=None,
                limit=limit,
                with_payload=True,
                params={"hnsw_ef": 128},
                vector_name="dense_vec",
                # Sparse hibrit param Qdrant 1.2+
                query_vector_sparse=query_sparse,
                vector_name_sparse="sparse_vec",
                score_threshold=score_threshold
            )

            results = [{'score': p.score, 'payload': p.payload} for p in qr]
            print(f"üìä {len(results)} sonu√ß bulundu")
            return results
        except Exception as e:
            print(f"‚ùå Arama hatasƒ±: {e}")
            return []

    def advanced_search_with_filters(self, query: str, filters: Dict = None, limit: int = 10, score_threshold: float = None):
        try:
            emb_res = self.bge_model.encode([query])
            dense_q = emb_res.get('dense_vecs', emb_res)
            sparse_q = emb_res.get('colbert_vecs', [np.zeros(self.config.EMBEDDING_DIM)])

            dense_t = torch.tensor(dense_q, dtype=torch.float32, device=self.config.DEVICE)
            sparse_t = torch.tensor(sparse_q, dtype=torch.float32, device=self.config.DEVICE)

            with torch.no_grad():
                dense_norm = l2_normalize_tensor(dense_t[:, :self.config.EMBEDDING_DIM])
                sparse_norm = l2_normalize_tensor(sparse_t[:, :self.config.EMBEDDING_DIM])

            query_vector = dense_norm[0].cpu().tolist()
            query_sparse = sparse_norm[0].cpu().tolist()

            query_filter = None
            if filters:
                from qdrant_client.models import Filter, FieldCondition, MatchValue
                conditions = [FieldCondition(key=k, match=MatchValue(value=v)) for k,v in filters.items()]
                query_filter = Filter(must=conditions)

            qr = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vector,
                query_filter=query_filter,
                limit=limit,
                with_payload=True,
                params={"hnsw_ef": 128},
                vector_name="dense_vec",
                query_vector_sparse=query_sparse,
                vector_name_sparse="sparse_vec",
                score_threshold=score_threshold
            )

            results = [{'score': p.score, 'payload': p.payload} for p in qr]
            print(f"üìä {len(results)} filtreli sonu√ß bulundu")
            return results
        except Exception as e:
            print(f"‚ùå Filtreli arama hatasƒ±: {e}")
            return []

    def get_collection_info(self):
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                "vectors_count": info.vectors_count,
                "status": info.status,
                "embedding_model": "BGE-M3",
                "embedding_dim": self.config.EMBEDDING_DIM
            }
        except Exception as e:
            return {"error": str(e)}

# -------------------------
# Pipeline
# -------------------------
class YargitayPipeline:
    def __init__(self, config: Config):
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        csv_path = csv_path or self.config.CSV_FILE
        print("üöÄ Full pipeline ba≈ülƒ±yor")
        emb_dim = self.processor.test_bge_connection()
        if not emb_dim:
            return False
        self.processor.create_qdrant_collection(recreate=True)
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("‚ùå Chunk bulunamadƒ±")
            return False
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        print("\nüìä Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    def interactive_search(self):
        print("\nüîé ƒ∞nteraktif arama ba≈ülatƒ±ldƒ±")
        while True:
            print("\n1) Basit arama\n2) Filtreli arama\n3) Ana men√º")
            ch = input("Se√ßiminiz (1-3): ").strip()
            if ch=="3":
                break
            if ch not in {"1","2"}:
                print("‚ùå Ge√ßersiz se√ßim")
                continue
            q = input("üîç Arama metni (√ßƒ±kmak i√ßin 'q'): ").strip()
            if q.lower() in {'q','quit','exit'}:
                break
            if not q:
                continue
            try:
                limit = int(input("Ka√ß sonu√ß? (default 5): ") or 5)
            except:
                limit = 5

            if ch=="1":
                results = self.processor.search_semantic(q, limit=limit)
            else:
                daire = input("Daire filtresi (√∂rn: '6.HukukDairesi', bo≈ü = none): ").strip()
                filters = {'daire': daire} if daire else None
                results = self.processor.advanced_search_with_filters(q, filters=filters, limit=limit)

            if not results:
                print("‚ùå Sonu√ß bulunamadƒ±")
                continue

            print(f"\nüìã {len(results)} sonu√ß:")
            for i,r in enumerate(results,1):
                p=r['payload']
                print(f"\n{i}. Skor: {r['score']:.4f}")
                print(f"   Esas No: {p.get('esas_no','N/A')} | Karar No: {p.get('karar_no','N/A')}")
                print(f"   Daire: {p.get('daire','N/A')} | Tarih: {p.get('tarih','N/A')}")
                text_preview = (p.get('text','')[:300] + '...') if len(p.get('text',''))>300 else p.get('text','')
                print(f"   Metin: {text_preview}")
                print("-"*60)

def main():
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="bge_hybrid_chunks",
        EMBEDDING_DIM=512,
        BATCH_SIZE=100
    )

    pipeline = YargitayPipeline(config)

    while True:
        print("\n" + "="*60)
        print("üèõÔ∏è YARGITAY BGE-M3 SEMANTƒ∞K Sƒ∞STEM (Dense+Sparse)")
        print("="*60)
        print("1) Tam pipeline √ßalƒ±≈ütƒ±r (CSV -> chunks -> embed -> qdrant)")
        print("2) ƒ∞nteraktif arama")
        print("3) Koleksiyon bilgilerini g√∂ster")
        print("4) √áƒ±kƒ±≈ü")
        choice = input("Se√ßiminiz (1-4): ").strip()
        if choice=="1":
            csv_path = input(f"CSV yolu (enter ile default: {config.CSV_FILE}): ").strip() or config.CSV_FILE
            ok = pipeline.full_pipeline(csv_path)
            print("‚úÖ Tamamlandƒ±" if ok else "‚ùå Hata √ßƒ±ktƒ±")
        elif choice=="2":
            pipeline.interactive_search()
        elif choice=="3":
            info = pipeline.processor.get_collection_info()
            print(json.dumps(info, indent=2, ensure_ascii=False))
        elif choice=="4":
            print("üëã G√∂r√º≈ü√ºr√ºz")
            break
        else:
            print("‚ùå Ge√ßersiz se√ßim")

if __name__=="__main__":
    try:
        from FlagEmbedding import BGEM3FlagModel
        print("‚úÖ FlagEmbedding y√ºkl√º")
    except ImportError:
        print("‚ùå FlagEmbedding bulunamadƒ± ‚Äî pip install FlagEmbedding")
        raise SystemExit(1)
    main()