In [None]:
# main.py
# SemChunk + BGE-M3 + Qdrant integration (dense + sparse, 512-dim slice, L2 normalization, hybrid search)

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, HnswConfigDiff
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch

# Load environment variables from the project's .env file at import time and
# print the value load_dotenv returns so the notebook output shows the result.
# NOTE(review): the path is absolute and machine-specific.
print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# -------------------------
# Helper: normalize tensor rows (L2)
# -------------------------
def l2_normalize_tensor(t: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    """Scale a vector, or each row of a matrix, to unit L2 length.

    Args:
        t: A 1-D tensor (one vector) or a tensor whose rows lie along dim 1.
        eps: Lower bound applied to the norm so that all-zero inputs divide
            by ``eps`` instead of by zero.

    Returns:
        A tensor with the same shape as ``t``.
    """
    is_single_vector = t.dim() == 1
    if is_single_vector:
        # One vector: a single scalar norm over all elements.
        denominator = t.norm().clamp(min=eps)
    else:
        # One norm per row, kept as a column so the division broadcasts.
        denominator = t.norm(dim=1, keepdim=True).clamp(min=eps)
    return t / denominator

@dataclass
class Config:
    """Settings shared by the chunking, embedding, indexing and search steps."""

    # Model identifier passed to BGEM3FlagModel.
    BGE_MODEL_NAME: str = "BAAI/bge-m3"
    # Forwarded to BGEM3FlagModel as use_fp16.
    USE_FP16: bool = True
    # Chosen once, when the class body is evaluated: CUDA if available, else CPU.
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    # Chunk size handed to semchunk.chunkerify, counted with the encoding below.
    TOKEN_SIZE: int = 512
    # tiktoken encoding name used for chunking and for the stored token counts.
    ENCODING_NAME: str = "cl100k_base"
    # Address of the Qdrant server.
    QDRANT_URL: str = "http://localhost:6333"
    # Collection that receives the points and is queried by the searches.
    COLLECTION_NAME: str = "yargitay_bge_m3_chunks"
    # Number of leading embedding dimensions kept; also the Qdrant vector size.
    EMBEDDING_DIM: int = 512
    # Default CSV used by the pipeline when no path is given.
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    # Number of texts sent to the embedding model per call.
    BATCH_SIZE: int = 100
    # Number of points sent to Qdrant per upsert call.
    DB_BATCH: int = 256

class YargitaySemanticProcessor:
    """Chunk decision texts, embed them with BGE-M3 and index/search them in Qdrant.

    Every point carries up to two named vectors:

    * ``dense_vec``: the BGE-M3 dense embedding, truncated to
      ``config.EMBEDDING_DIM`` and L2-normalised (cosine distance).
    * ``sparse_vec``: the BGE-M3 lexical weights stored as a native Qdrant
      sparse vector (token id -> weight).

    Searches combine both with reciprocal-rank fusion through the Qdrant
    Query API (``query_points``), which needs a Qdrant server and client
    that provide that API.
    """

    # Names of the named vectors stored on each Qdrant point.
    DENSE_VECTOR_NAME = "dense_vec"
    SPARSE_VECTOR_NAME = "sparse_vec"

    def __init__(self, config: "Config"):
        """Create the tokenizer/chunker, load the model and open the Qdrant client."""
        self.config = config

        # Encoding & chunker
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # Model
        print(f"🔮 BGE-M3 yükleniyor: {config.BGE_MODEL_NAME} (device={config.DEVICE})")
        self.bge_model = BGEM3FlagModel(config.BGE_MODEL_NAME, use_fp16=config.USE_FP16, device=config.DEVICE)

        # Qdrant
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"✅ Hazır - Cihaz: {device_name}")

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _encode(self, texts: List[str]):
        """Run BGE-M3 and explicitly request dense and lexical (sparse) outputs.

        Without ``return_sparse=True`` the model leaves the sparse entry of its
        result as ``None``, which is what made the original tensor conversion
        fail with "must be real number, not NoneType".
        """
        return self.bge_model.encode(
            texts,
            return_dense=True,
            return_sparse=True,
            return_colbert_vecs=False,
        )

    def _prepare_dense(self, dense) -> List[List[float]]:
        """Truncate dense embeddings to ``EMBEDDING_DIM`` and L2-normalise each row.

        Raises:
            ValueError: if the model output has fewer than ``EMBEDDING_DIM``
                dimensions, because the vectors would not fit the collection.
        """
        dense_t = torch.as_tensor(np.asarray(dense), dtype=torch.float32)
        if dense_t.dim() == 1:
            dense_t = dense_t.unsqueeze(0)
        if dense_t.shape[1] < self.config.EMBEDDING_DIM:
            raise ValueError(
                f"model returned {dense_t.shape[1]} dims, "
                f"EMBEDDING_DIM is {self.config.EMBEDDING_DIM}"
            )
        # NOTE(review): truncating BGE-M3 embeddings is a design choice of this
        # file; confirm retrieval quality is acceptable at the chosen size.
        with torch.no_grad():
            normalised = l2_normalize_tensor(dense_t[:, :self.config.EMBEDDING_DIM])
        return normalised.tolist()

    @staticmethod
    def _lexical_to_sparse(weights) -> Dict[str, list]:
        """Convert one lexical-weight mapping to sorted ``indices``/``values`` lists.

        ``weights`` is expected to map token ids (as ``str`` or ``int``) to
        weights; ``None`` or an empty mapping yields an empty sparse vector.
        Indices are unique and ascending, as required for a sparse vector.
        """
        if not weights:
            return {"indices": [], "values": []}
        by_index = {}
        for token_id, weight in weights.items():
            index = int(token_id)
            value = float(weight)
            # Keep the largest weight if the same id appears under two spellings.
            if index not in by_index or value > by_index[index]:
                by_index[index] = value
        ordered = sorted(by_index)
        return {"indices": ordered, "values": [by_index[i] for i in ordered]}

    @staticmethod
    def _clean_meta_value(value):
        """Make a CSV cell safe for a JSON payload (NaN/None -> '', numpy -> Python)."""
        if value is None:
            return ''
        if isinstance(value, np.generic):
            value = value.item()
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return ''
        return value

    def _hybrid_search(self, query: str, query_filter, limit: int, score_threshold) -> List[Dict]:
        """Encode ``query`` and run a dense + sparse search fused with RRF.

        Falls back to a dense-only query when the model yields no lexical
        weights for the query. With fusion, returned scores are fusion scores,
        not cosine similarities, so ``score_threshold`` applies to those.
        """
        from qdrant_client.models import Fusion, FusionQuery, Prefetch, SearchParams, SparseVector

        emb_res = self._encode([query])
        dense_vec = self._prepare_dense(emb_res['dense_vecs'])[0]
        lexical = emb_res.get('lexical_weights') or [None]
        sparse = self._lexical_to_sparse(lexical[0])
        search_params = SearchParams(hnsw_ef=128)

        if not sparse["indices"]:
            response = self.qdrant_client.query_points(
                collection_name=self.config.COLLECTION_NAME,
                query=dense_vec,
                using=self.DENSE_VECTOR_NAME,
                query_filter=query_filter,
                search_params=search_params,
                limit=limit,
                with_payload=True,
                score_threshold=score_threshold,
            )
        else:
            # Each branch fetches more candidates than requested so that the
            # fusion step has something to re-rank.
            candidate_limit = limit * 2
            response = self.qdrant_client.query_points(
                collection_name=self.config.COLLECTION_NAME,
                prefetch=[
                    Prefetch(
                        query=dense_vec,
                        using=self.DENSE_VECTOR_NAME,
                        filter=query_filter,
                        params=search_params,
                        limit=candidate_limit,
                    ),
                    Prefetch(
                        query=SparseVector(indices=sparse["indices"], values=sparse["values"]),
                        using=self.SPARSE_VECTOR_NAME,
                        filter=query_filter,
                        limit=candidate_limit,
                    ),
                ],
                query=FusionQuery(fusion=Fusion.RRF),
                limit=limit,
                with_payload=True,
                score_threshold=score_threshold,
            )
        return [{'score': p.score, 'payload': p.payload} for p in response.points]

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def test_bge_connection(self):
        """Embed one sample sentence and report what the model returns.

        Returns:
            The dense embedding length, or ``None`` if encoding failed.
        """
        try:
            test_text = ["Yargıtay 6. Hukuk Dairesi'nin ihtiyati tedbir kararı"]
            emb_res = self._encode(test_text)
            if isinstance(emb_res, dict) and 'dense_vecs' in emb_res:
                dense = emb_res['dense_vecs'][0]
                # The key can be present with a None value, so test the value.
                sparse_available = emb_res.get('lexical_weights') is not None
            else:
                dense = emb_res[0]
                sparse_available = False
            print(f"✅ Dense embedding boyutu: {len(dense)}")
            print(f"🔍 Sparse embedding mevcut: {sparse_available}")
            return len(dense)
        except Exception as e:
            print(f"❌ BGE-M3 bağlantı hatası: {e}")
            return None

    def create_qdrant_collection(self, recreate: bool = False):
        """Create the collection with one dense and one sparse named vector.

        Args:
            recreate: Delete an existing collection of the same name first.

        Raises:
            Exception: whatever the Qdrant client raises while listing or
                creating collections (re-raised after being printed).
        """
        from qdrant_client.models import SparseVectorParams

        collection_name = self.config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except Exception as e:
                # Best effort: creation below still runs, but say why delete failed.
                print(f"⚠️ Koleksiyon silinemedi: {e}")

        try:
            existing = [c.name for c in self.qdrant_client.get_collections().collections]
            if collection_name not in existing:
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config={
                        self.DENSE_VECTOR_NAME: VectorParams(
                            size=self.config.EMBEDDING_DIM, distance=Distance.COSINE
                        ),
                    },
                    sparse_vectors_config={
                        self.SPARSE_VECTOR_NAME: SparseVectorParams(),
                    },
                )
                print(f"✅ Koleksiyon oluşturuldu: {collection_name} (Dense+Sparse)")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {collection_name}")
        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Split ``text`` into chunks and describe each non-blank chunk.

        Args:
            text: Raw document text.
            metadata: Optional extra fields merged into every chunk dict.

        Returns:
            One dict per non-blank chunk with ``chunk_id`` (position in the
            chunker output), ``text``, ``token_count`` and ``char_count``; the
            counts describe the stripped text that is stored. Empty list for
            blank input or when chunking fails.
        """
        if not text or not text.strip():
            return []
        try:
            result = []
            for i, raw_chunk in enumerate(self.chunker(text)):
                chunk_text = raw_chunk.strip()
                if not chunk_text:
                    continue
                cd = {
                    'chunk_id': i,
                    'text': chunk_text,
                    'token_count': len(self.encoding.encode(chunk_text)),
                    'char_count': len(chunk_text),
                }
                if metadata:
                    cd.update(metadata)
                result.append(cd)
            return result
        except Exception as e:
            print(f"❌ Chunking hatası: {e}")
            return []

    def create_embeddings_bge(self, texts: List[str], batch_size: int = None):
        """Embed ``texts`` in batches.

        Args:
            texts: Texts to embed.
            batch_size: Texts per model call; defaults to ``config.BATCH_SIZE``.

        Returns:
            ``(dense, sparse)``, two lists aligned with ``texts``. ``dense``
            holds L2-normalised lists of ``EMBEDDING_DIM`` floats; ``sparse``
            holds ``{"indices": [...], "values": [...]}`` dicts. Entries of a
            batch that failed are ``None`` in both lists, so callers can skip
            them instead of indexing meaningless all-zero vectors.
        """
        batch_size = batch_size or self.config.BATCH_SIZE
        all_embeddings_dense, all_embeddings_sparse = [], []
        total = len(texts)
        print(f"🔮 {total} metin işleniyor (batch_size={batch_size})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                emb_res = self._encode(batch_texts)
                dense_rows = self._prepare_dense(emb_res['dense_vecs'])
                lexical = emb_res.get('lexical_weights') or [None] * len(batch_texts)
                sparse_rows = [self._lexical_to_sparse(w) for w in lexical]
                if len(dense_rows) != len(batch_texts) or len(sparse_rows) != len(batch_texts):
                    raise ValueError("embedding count does not match batch size")
            except Exception as e:
                print(f"❌ Embedding hatası (batch {i//batch_size+1}): {e}")
                all_embeddings_dense.extend([None] * len(batch_texts))
                all_embeddings_sparse.extend([None] * len(batch_texts))
                continue

            all_embeddings_dense.extend(dense_rows)
            all_embeddings_sparse.extend(sparse_rows)
            print(f"  📊 Batch işlendi: {i + len(batch_texts)}/{total}")

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        return all_embeddings_dense, all_embeddings_sparse

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """Read a CSV and chunk the text column of every row.

        The text column is the first of ``rawText``, ``chunk_text``, ``text``,
        ``content``, ``metin`` that exists. Row metadata is copied into each
        chunk; missing (NaN) cells become empty strings so the payload stays
        valid JSON.

        Returns:
            All chunk dicts, or an empty list if the file cannot be read or
            has no recognised text column.
        """
        print(f"📄 CSV okunuyor: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"📊 {len(df)} satır yüklendi")
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return []

        text_column = next((c for c in ['rawText', 'chunk_text', 'text', 'content', 'metin'] if c in df.columns), None)
        if not text_column:
            print("❌ Ana metin sütunu bulunamadı")
            return []

        meta_columns = {
            'esas_no': 'esasNo',
            'karar_no': 'kararNo',
            'daire': 'location',
            'tarih': 'extractedDates',
            'document_id': '_id',
        }
        total_rows = len(df)
        all_chunks = []
        # row_number drives the progress message so it does not rely on the
        # DataFrame index being a 0..n-1 integer range.
        for row_number, (idx, row) in enumerate(df.iterrows(), start=1):
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            meta = {'original_index': self._clean_meta_value(idx)}
            for key, column in meta_columns.items():
                meta[key] = self._clean_meta_value(row.get(column, ''))
            all_chunks.extend(self.semantic_chunk_text(str(text), meta))
            if row_number % 5 == 0:
                print(f"  ✅ İşlenen satır: {row_number}/{total_rows} (Toplam chunk: {len(all_chunks)})")

        print(f"🧩 Toplam {len(all_chunks)} chunk oluşturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        """Embed ``chunks`` and upsert them into the configured collection.

        Chunks whose embedding failed are skipped and counted. Points are sent
        in groups of ``config.DB_BATCH``; a failing group is reported and the
        remaining groups are still attempted.
        """
        from qdrant_client.models import SparseVector

        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return

        print(f"🚀 {len(chunks)} chunk Qdrant'a yükleniyor...")
        texts = [c['text'] for c in chunks]
        embeddings_dense, embeddings_sparse = self.create_embeddings_bge(texts)

        points = []
        skipped = 0
        for c, d, s in zip(chunks, embeddings_dense, embeddings_sparse):
            if d is None:
                skipped += 1
                continue
            vectors = {self.DENSE_VECTOR_NAME: d}
            # Only attach a sparse vector when the model produced weights.
            if s and s["indices"]:
                vectors[self.SPARSE_VECTOR_NAME] = SparseVector(indices=s["indices"], values=s["values"])
            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector=vectors,
                payload=c,
            ))

        if skipped:
            print(f"⚠️ {skipped} chunk embedding hatası nedeniyle atlandı")
        if not points:
            print("❌ Yüklenecek chunk yok")
            return

        batch = self.config.DB_BATCH
        failed_batches = 0
        for i in range(0, len(points), batch):
            try:
                self.qdrant_client.upsert(collection_name=self.config.COLLECTION_NAME, points=points[i:i+batch])
                print(f"  ✅ Batch yüklendi: {min(i+batch,len(points))}/{len(points)}")
            except Exception as e:
                failed_batches += 1
                print(f"❌ Batch yükleme hatası: {e}")

        if failed_batches:
            print(f"⚠️ {failed_batches} batch yüklenemedi")
        else:
            print("🎉 Yükleme tamamlandı!")

    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = None):
        """Run an unfiltered hybrid search.

        Returns:
            A list of ``{'score', 'payload'}`` dicts; empty on any error.
        """
        try:
            results = self._hybrid_search(query, None, limit, score_threshold)
            print(f"📊 {len(results)} sonuç bulundu")
            return results
        except Exception as e:
            print(f"❌ Arama hatası: {e}")
            return []

    def advanced_search_with_filters(self, query: str, filters: Dict = None, limit: int = 10, score_threshold: float = None):
        """Run a hybrid search restricted by exact-match payload filters.

        Args:
            query: Search text.
            filters: Optional ``{payload_key: value}`` mapping; every entry
                must match.
            limit: Maximum number of results.
            score_threshold: Optional minimum score passed to Qdrant.

        Returns:
            A list of ``{'score', 'payload'}`` dicts; empty on any error.
        """
        try:
            query_filter = None
            if filters:
                from qdrant_client.models import Filter, FieldCondition, MatchValue
                conditions = [FieldCondition(key=k, match=MatchValue(value=v)) for k, v in filters.items()]
                query_filter = Filter(must=conditions)

            results = self._hybrid_search(query, query_filter, limit, score_threshold)
            print(f"📊 {len(results)} filtreli sonuç bulundu")
            return results
        except Exception as e:
            print(f"❌ Filtreli arama hatası: {e}")
            return []

    def get_collection_info(self):
        """Return a small summary of the collection, or ``{'error': ...}`` on failure."""
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                # Not every client version exposes this field; report None
                # instead of failing the whole call.
                "vectors_count": getattr(info, "vectors_count", None),
                "status": info.status,
                "embedding_model": "BGE-M3",
                "embedding_dim": self.config.EMBEDDING_DIM
            }
        except Exception as e:
            return {"error": str(e)}

# -------------------------
# Pipeline
# -------------------------
class YargitayPipeline:
    """Console front end that drives indexing and interactive searching."""

    def __init__(self, config: "Config"):
        """Build the processor used for every pipeline and search step."""
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    @staticmethod
    def _parse_limit(raw: str, default: int = 5) -> int:
        """Turn the user's answer into a positive result count.

        Blank, non-numeric, zero or negative answers yield ``default``.
        """
        try:
            value = int(raw.strip() or default)
        except ValueError:
            return default
        return value if value > 0 else default

    def full_pipeline(self, csv_path: str = None):
        """Index a CSV end to end: model check, collection, chunks, upload.

        Args:
            csv_path: CSV to index; defaults to ``config.CSV_FILE``.

        Returns:
            ``False`` if the model check fails or no chunks are produced,
            ``True`` otherwise. The collection is recreated on every run.
        """
        csv_path = csv_path or self.config.CSV_FILE
        print("🚀 Full pipeline başlıyor")
        emb_dim = self.processor.test_bge_connection()
        if not emb_dim:
            return False
        self.processor.create_qdrant_collection(recreate=True)
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("❌ Chunk bulunamadı")
            return False
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        print("\n📊 Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    def interactive_search(self):
        """Prompt for queries in a loop and print the matching chunks.

        Leaves the loop on menu choice 3 or when the query is 'q', 'quit' or
        'exit'.
        """
        print("\n🔎 İnteraktif arama başlatıldı")
        while True:
            print("\n1) Basit arama\n2) Filtreli arama\n3) Ana menü")
            ch = input("Seçiminiz (1-3): ").strip()
            if ch == "3":
                break
            if ch not in {"1", "2"}:
                print("❌ Geçersiz seçim")
                continue
            q = input("🔍 Arama metni (çıkmak için 'q'): ").strip()
            if q.lower() in {'q', 'quit', 'exit'}:
                break
            if not q:
                continue
            # Only a malformed number falls back to the default; Ctrl-C and
            # similar are no longer swallowed by a bare except.
            limit = self._parse_limit(input("Kaç sonuç? (default 5): "))

            if ch == "1":
                results = self.processor.search_semantic(q, limit=limit)
            else:
                daire = input("Daire filtresi (örn: '6.HukukDairesi', boş = none): ").strip()
                filters = {'daire': daire} if daire else None
                results = self.processor.advanced_search_with_filters(q, filters=filters, limit=limit)

            if not results:
                print("❌ Sonuç bulunamadı")
                continue

            print(f"\n📋 {len(results)} sonuç:")
            for i, r in enumerate(results, 1):
                p = r['payload']
                text = p.get('text', '')
                print(f"\n{i}. Skor: {r['score']:.4f}")
                print(f"   Esas No: {p.get('esas_no','N/A')} | Karar No: {p.get('karar_no','N/A')}")
                print(f"   Daire: {p.get('daire','N/A')} | Tarih: {p.get('tarih','N/A')}")
                # Show at most 300 characters of the chunk.
                text_preview = (text[:300] + '...') if len(text) > 300 else text
                print(f"   Metin: {text_preview}")
                print("-"*60)

def main():
    """Show the top-level console menu until the user chooses to exit."""
    settings = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="bge_hybrid_chunks",
        EMBEDDING_DIM=512,
        BATCH_SIZE=100
    )
    app = YargitayPipeline(settings)

    rule = "="*60
    menu_lines = (
        "1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)",
        "2) İnteraktif arama",
        "3) Koleksiyon bilgilerini göster",
        "4) Çıkış",
    )

    while True:
        # Banner followed by the numbered options.
        print("\n" + rule)
        print("🏛️ YARGITAY BGE-M3 SEMANTİK SİSTEM (Dense+Sparse)")
        print(rule)
        for line in menu_lines:
            print(line)

        selection = input("Seçiminiz (1-4): ").strip()

        if selection == "4":
            print("👋 Görüşürüz")
            break

        if selection == "1":
            # An empty answer keeps the configured default CSV.
            answer = input(f"CSV yolu (enter ile default: {settings.CSV_FILE}): ").strip()
            succeeded = app.full_pipeline(answer or settings.CSV_FILE)
            print("✅ Tamamlandı" if succeeded else "❌ Hata çıktı")
            continue

        if selection == "2":
            app.interactive_search()
            continue

        if selection == "3":
            summary = app.processor.get_collection_info()
            print(json.dumps(summary, indent=2, ensure_ascii=False))
            continue

        print("❌ Geçersiz seçim")

# Script entry point: check that FlagEmbedding can be imported, then start the menu.
if __name__=="__main__":
    try:
        # NOTE(review): FlagEmbedding is already imported at the top of the
        # file, so a missing package would raise there before this guard runs.
        from FlagEmbedding import BGEM3FlagModel
        print("✅ FlagEmbedding yüklü")
    except ImportError:
        print("❌ FlagEmbedding bulunamadı — pip install FlagEmbedding")
        # Exit with a non-zero status so callers can detect the failure.
        raise SystemExit(1)
    main()


True
✅ FlagEmbedding yüklü
🔮 BGE-M3 yükleniyor: BAAI/bge-m3 (device=cuda)


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 8080.99it/s]


✅ Hazır - Cihaz: NVIDIA RTX A6000

🏛️ YARGITAY BGE-M3 SEMANTİK SİSTEM (Dense+Sparse)
1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)
2) İnteraktif arama
3) Koleksiyon bilgilerini göster
4) Çıkış
🚀 Full pipeline başlıyor


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Dense embedding boyutu: 1024
🔍 Sparse embedding mevcut: True
🗑️ Eski koleksiyon silindi: bge_hybrid_chunks
✅ Koleksiyon oluşturuldu: bge_hybrid_chunks (Dense+Sparse)
📄 CSV okunuyor: /home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv
📊 10 satır yüklendi
  ✅ İşlenen satır: 5/10 (Toplam chunk: 44)
  ✅ İşlenen satır: 10/10 (Toplam chunk: 59)
🧩 Toplam 59 chunk oluşturuldu
🚀 59 chunk Qdrant'a yükleniyor...
🔮 59 metin işleniyor (batch_size=100)...
❌ Embedding hatası (batch 1): must be real number, not NoneType
  ✅ Batch yüklendi: 59/59
🎉 Yükleme tamamlandı!

📊 Koleksiyon Bilgileri:
{
  "collection_name": "bge_hybrid_chunks",
  "points_count": 59,
  "vectors_count": null,
  "status": "green",
  "embedding_model": "BGE-M3",
  "embedding_dim": 512
}
✅ Tamamlandı

🏛️ YARGITAY BGE-M3 SEMANTİK SİSTEM (Dense+Sparse)
1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)
2) İnteraktif arama
3) Koleksiyon bilgilerini göster
4) Çıkış

🔎 İnteraktif arama başlatıldı

1) Basit arama
2) Filtreli a