In [None]:
# main.py
# SemChunk + BGE-M3 + Qdrant Entegrasyon (Normalize edilmiş, reducer uyumlu, query_points kullanıyor)

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict, Any
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch
from torch import nn

# Load environment variables from the project's .env file at import time and
# print load_dotenv's return value.
# NOTE(review): the return value presumably signals whether the file was found
# and loaded — confirm against python-dotenv. None of the code visible in this
# file reads os.environ directly, so it is unclear which setting depends on it.
print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# -------------------------
# Helper: normalize tensor rows (L2)
# -------------------------
def l2_normalize_tensor(t: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    """Scale vectors to unit L2 length along the last dimension.

    Args:
        t: Tensor whose last dimension holds the vector components. The usual
            input is a (N, D) batch, but a single 1-D vector (D,) or a
            higher-rank tensor (..., D) is accepted as well.
        eps: Lower bound applied to the norm so that all-zero vectors are
            returned as zeros instead of producing NaN from a 0/0 division.

    Returns:
        A tensor of the same shape as ``t`` with every vector divided by its
        (clamped) L2 norm.
    """
    # dim=-1 instead of the former hard-coded dim=1: identical for (N, D)
    # input, and no longer raises for a single 1-D vector.
    norm = torch.norm(t, dim=-1, keepdim=True).clamp(min=eps)
    return t / norm

# Embed reducer (1024 -> 512)
class EmbedReducer(nn.Module):
    """Linear projection that maps embeddings from ``input_dim`` to ``output_dim``.

    The projection is not loaded from a checkpoint, so its weights are whatever
    the constructor produces. They are therefore drawn from a generator seeded
    with ``seed``: every process that builds the reducer with the same
    dimensions and seed gets the same projection, which keeps vectors written
    to the index in one run comparable with query vectors produced in another.

    Args:
        input_dim: Size of the incoming embedding vectors.
        output_dim: Size of the projected vectors.
        seed: Seed for the weight initialisation. Changing it changes the
            projection, so any index built with another seed must be rebuilt.

    NOTE(review): seeding relies on the CPU random generator producing the same
    stream across PyTorch versions; saving and loading a ``state_dict`` would
    be a stronger guarantee.
    """

    def __init__(self, input_dim: int = 1024, output_dim: int = 512, seed: int = 42):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

        # Re-draw the parameters from a private, seeded generator so the global
        # RNG state is neither consumed nor relied upon. The range matches
        # nn.Linear's default uniform(-1/sqrt(fan_in), 1/sqrt(fan_in)) init.
        generator = torch.Generator().manual_seed(seed)
        bound = input_dim ** -0.5
        with torch.no_grad():
            self.linear.weight.uniform_(-bound, bound, generator=generator)
            self.linear.bias.uniform_(-bound, bound, generator=generator)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` (last dimension ``input_dim``) to ``output_dim``."""
        return self.linear(x)

# -------------------------
# Config
# -------------------------
@dataclass
class Config:
    """Settings shared by the chunking, embedding and Qdrant steps of this file."""

    # Model identifier handed to BGEM3FlagModel.
    BGE_MODEL_NAME: str = "BAAI/bge-m3"
    # Forwarded to BGEM3FlagModel as use_fp16.
    USE_FP16: bool = True
    # Evaluated once, when this class body is executed (i.e. at import time).
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    # Chunk size passed to semchunk.chunkerify, counted with ENCODING_NAME.
    TOKEN_SIZE: int = 512
    # tiktoken encoding used for chunk sizing and for the token_count payload field.
    ENCODING_NAME: str = "cl100k_base"
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "yargitay_bge_m3_chunks"
    # Output size of the reducer and vector size of the Qdrant collection.
    EMBEDDING_DIM: int = 512
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    # Number of texts embedded per call in create_embeddings_bge.
    BATCH_SIZE: int = 100
    # Number of points sent per Qdrant upsert call.
    DB_BATCH: int = 256

# -------------------------
# Processor
# -------------------------
class YargitaySemanticProcessor:
    """Chunk decision texts, embed them with BGE-M3 and index/search them in Qdrant.

    Indexing and search share one embedding path: BGE-M3 dense vector ->
    ``EmbedReducer`` projection to ``config.EMBEDDING_DIM`` -> L2 normalisation.

    NOTE(review): the reducer is constructed in ``__init__`` and no weights are
    loaded for it here, so search is only meaningful when the reducer holds the
    same weights that were used while the collection was indexed — confirm how
    ``EmbedReducer`` is initialised.
    """

    def __init__(self, config: "Config"):
        """Create the tokenizer, chunker, embedding model, reducer and Qdrant client.

        Args:
            config: Settings object; see the attribute reads below for the
                fields that are used.
        """
        self.config = config

        # Encoding & chunker (chunk size is counted with the tiktoken encoding)
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # Model & reducer
        print(f"🔮 BGE-M3 yükleniyor: {config.BGE_MODEL_NAME} (device={config.DEVICE})")
        self.bge_model = BGEM3FlagModel(config.BGE_MODEL_NAME, use_fp16=config.USE_FP16, device=config.DEVICE)
        self.reducer = EmbedReducer(input_dim=1024, output_dim=self.config.EMBEDDING_DIM).to(config.DEVICE)

        # Qdrant
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"✅ Hazır - Cihaz: {device_name}")

    # ---------- shared helpers ----------
    def _encode_reduced(self, texts: List[str]) -> torch.Tensor:
        """Embed ``texts`` and return reduced, L2-normalised vectors on the CPU.

        This is the single embedding path used for both indexing and search so
        that stored vectors and query vectors are always produced the same way.
        Exceptions from the model or the reducer propagate to the caller.
        """
        emb_res = self.bge_model.encode(texts)
        if isinstance(emb_res, dict) and 'dense_vecs' in emb_res:
            dense = emb_res['dense_vecs']
        else:
            dense = emb_res

        if isinstance(dense, torch.Tensor):
            # Cast as well as move: the reducer holds float32 weights, so a
            # half-precision tensor would otherwise fail in the matmul.
            dense_t = dense.to(device=self.config.DEVICE, dtype=torch.float32)
        else:
            dense_t = torch.tensor(dense, dtype=torch.float32, device=self.config.DEVICE)

        with torch.no_grad():
            reduced = l2_normalize_tensor(self.reducer(dense_t))
        return reduced.cpu()

    def _query_vector(self, query: str) -> List[float]:
        """Return the reduced, normalised embedding of one query as a list."""
        return self._encode_reduced([query])[0].tolist()

    @staticmethod
    def _clean_meta_value(value):
        """Return ``value`` in a form that is safe to store in a JSON payload.

        Missing values (NaN/None/NaT) become ``''`` — the same default the
        metadata lookups use for absent columns — and numpy scalars are
        converted to the corresponding built-in Python type.
        """
        missing = pd.isna(value)
        if isinstance(missing, (bool, np.bool_)) and missing:
            return ''
        if isinstance(value, np.generic):
            return value.item()
        return value

    # Test connection & print dense dim
    def test_bge_connection(self):
        """Encode one sample sentence and report the dense embedding size.

        Returns:
            The length of the dense vector, or ``None`` when encoding fails
            (the error is printed, not raised).
        """
        try:
            test_text = ["Yargıtay 6. Hukuk Dairesi'nin ihtiyati tedbir kararı"]
            emb_res = self.bge_model.encode(test_text)
            dense = emb_res['dense_vecs'][0]
            print(f"✅ BGE-M3 test başarılı - Dense embedding boyutu: {len(dense)}")
            # The message is about the sparse output, which BGE-M3 exposes as
            # 'lexical_weights'; the previous check looked at 'colbert_vecs'
            # (the multi-vector output) and only tested for key presence.
            has_sparse = emb_res.get('lexical_weights') is not None
            print(f"🔍 Sparse embedding mevcut: {has_sparse}")
            return len(dense)
        except Exception as e:
            print(f"❌ BGE-M3 bağlantı hatası: {e}")
            return None

    def create_qdrant_collection(self, recreate: bool = False):
        """Ensure the configured collection exists, optionally dropping it first.

        Args:
            recreate: When true, try to delete the collection before creating it.

        Raises:
            Exception: Whatever the Qdrant client raises while listing or
                creating collections (printed, then re-raised).
        """
        collection_name = self.config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except Exception as e:
                # Still best-effort, but no longer silent: if the delete failed
                # the old collection (and its old vectors) may survive below.
                print(f"⚠️ Eski koleksiyon silinemedi: {e}")

        try:
            existing = [c.name for c in self.qdrant_client.get_collections().collections]
            if collection_name not in existing:
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=VectorParams(size=self.config.EMBEDDING_DIM, distance=Distance.COSINE)
                )
                print(f"✅ Koleksiyon oluşturuldu: {collection_name} (dim={self.config.EMBEDDING_DIM})")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {collection_name}")
        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Split ``text`` into chunks and describe each non-empty chunk.

        Args:
            text: Text to split; empty or whitespace-only text yields ``[]``.
            metadata: Optional key/values merged into every chunk dict (they
                overwrite the generated keys on a name clash).

        Returns:
            One dict per non-empty chunk with ``chunk_id`` (position in the
            chunker output, so ids may have gaps), ``text``, ``token_count``
            and ``char_count``. Returns ``[]`` if chunking raises.
        """
        if not text or not text.strip():
            return []
        try:
            result = []
            for i, piece in enumerate(self.chunker(text)):
                stripped = piece.strip()
                if not stripped:
                    continue
                # Counts are taken from the stripped text, i.e. from exactly
                # what is stored under 'text'.
                cd = {
                    'chunk_id': i,
                    'text': stripped,
                    'token_count': len(self.encoding.encode(stripped)),
                    'char_count': len(stripped),
                }
                if metadata:
                    cd.update(metadata)
                result.append(cd)
            return result
        except Exception as e:
            print(f"❌ Chunking hatası: {e}")
            return []

    def create_embeddings_bge(self, texts: List[str], batch_size: int = None) -> List[List[float]]:
        """Embed ``texts`` in batches and return one reduced vector per text.

        Args:
            texts: Texts to embed.
            batch_size: Texts per batch; defaults to ``config.BATCH_SIZE``.

        Returns:
            A list with exactly ``len(texts)`` vectors of ``EMBEDDING_DIM``
            floats. Texts of a batch that failed get an all-zero vector so the
            positions stay aligned with ``texts``.
        """
        batch_size = batch_size or self.config.BATCH_SIZE
        all_embeddings: List[List[float]] = []
        total = len(texts)
        print(f"🔮 BGE-M3 ile {total} metin işleniyor (batch_size={batch_size})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                # embed -> reduce -> normalise, then one device-to-host copy
                # for the whole batch instead of one per row
                all_embeddings.extend(self._encode_reduced(batch_texts).tolist())

                print(f"  📊 Batch işlendi: {i + len(batch_texts)}/{total}")

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except Exception as e:
                print(f"❌ BGE-M3 Embedding hatası (batch {i//batch_size + 1}): {e}")
                # fallback zero vectors keep the output aligned with the input
                all_embeddings.extend([[0.0] * self.config.EMBEDDING_DIM for _ in batch_texts])

        return all_embeddings

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """Read a CSV file and chunk the text column of every row.

        The text column is the first of ``rawText``, ``chunk_text``, ``text``,
        ``content``, ``metin`` that exists. Per-row metadata is copied from the
        ``esasNo``, ``kararNo``, ``location``, ``extractedDates`` and ``_id``
        columns when present.

        Returns:
            All chunk dicts of all rows, or ``[]`` when the file cannot be read
            or no text column is found.
        """
        print(f"📄 CSV okunuyor: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"📊 {len(df)} satır yüklendi")
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return []

        text_column = next((c for c in ['rawText', 'chunk_text', 'text', 'content', 'metin'] if c in df.columns), None)
        if not text_column:
            print("❌ Ana metin sütunu bulunamadı")
            return []

        clean = self._clean_meta_value
        all_chunks = []
        # row_no counts rows positionally so the progress line does not depend
        # on the index labels being integers.
        for row_no, (idx, row) in enumerate(df.iterrows(), 1):
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            # Values are cleaned because they end up in the Qdrant payload:
            # NaN cells and numpy scalars are not JSON-friendly.
            meta = {
                'original_index': clean(idx),
                'esas_no': clean(row.get('esasNo', '')),
                'karar_no': clean(row.get('kararNo', '')),
                'daire': clean(row.get('location', '')),
                'tarih': clean(row.get('extractedDates', '')),
                'document_id': clean(row.get('_id', '')),
            }
            chunks = self.semantic_chunk_text(str(text), meta)
            all_chunks.extend(chunks)
            if row_no % 5 == 0:
                print(f"  ✅ İşlenen satır: {row_no}/{len(df)} (Toplam chunk: {len(all_chunks)})")

        print(f"🧩 Toplam {len(all_chunks)} chunk oluşturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        """Embed ``chunks`` and upsert them into the configured collection.

        Each point gets a random UUID, the reduced embedding as vector and the
        chunk dict as payload. Chunks whose embedding failed (all-zero fallback
        vector) are skipped, and failed upsert batches are reported instead of
        being followed by an unconditional success message.
        """
        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return

        print(f"🚀 {len(chunks)} chunk Qdrant'a yükleniyor...")
        texts = [c['text'] for c in chunks]
        embeddings = self.create_embeddings_bge(texts)

        if len(embeddings) != len(chunks):
            print(f"❌ Embedding sayısı uyumsuz: {len(embeddings)} vs {len(chunks)}")
            return

        points = []
        skipped = 0
        for chunk, emb in zip(chunks, embeddings):
            # An all-zero vector is the failure marker of create_embeddings_bge;
            # it carries no direction, so it is useless under cosine distance.
            if not any(emb):
                skipped += 1
                continue
            points.append(PointStruct(id=str(uuid.uuid4()), vector=emb, payload=chunk))

        if skipped:
            print(f"⚠️ {skipped} chunk embedding hatası nedeniyle atlandı")
        if not points:
            print("❌ Yüklenecek geçerli embedding yok")
            return

        batch = self.config.DB_BATCH
        failed_batches = 0
        for i in range(0, len(points), batch):
            try:
                self.qdrant_client.upsert(collection_name=self.config.COLLECTION_NAME, points=points[i:i+batch])
                print(f"  ✅ Batch yüklendi: {min(i+batch, len(points))}/{len(points)}")
            except Exception as e:
                failed_batches += 1
                print(f"❌ Batch yükleme hatası: {e}")

        if failed_batches:
            print(f"⚠️ Yükleme tamamlandı ancak {failed_batches} batch yüklenemedi")
        else:
            print("🎉 Yükleme tamamlandı!")

    # ---------- SEARCH (query_points + reducer + normalize) ----------
    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """Search the collection for chunks similar to ``query``.

        Returns:
            A list of ``{'score', 'payload'}`` dicts, or ``[]`` on any error
            (the error is printed).
        """
        try:
            query_vector = self._query_vector(query)
            print(f"🔍 Query vector boyutu: {len(query_vector)} (hedef: {self.config.EMBEDDING_DIM})")

            # use query_points (recommended)
            qr = self.qdrant_client.query_points(
                collection_name=self.config.COLLECTION_NAME,
                query=query_vector,
                limit=limit,
                score_threshold=score_threshold
            )

            results = [{'score': p.score, 'payload': p.payload} for p in qr.points]
            print(f"📊 {len(results)} sonuç bulundu")
            return results

        except Exception as e:
            print(f"❌ Arama hatası: {e}")
            return []

    def advanced_search_with_filters(self, query: str, filters: Dict = None, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """Search like :meth:`search_semantic`, restricted by payload filters.

        Args:
            filters: Mapping of payload key to required value; every entry
                becomes an exact-match condition and all must hold.

        Returns:
            A list of ``{'score', 'payload'}`` dicts, or ``[]`` on any error
            (the error is printed).
        """
        try:
            query_vector = self._query_vector(query)

            query_filter = None
            if filters:
                from qdrant_client.models import Filter, FieldCondition, MatchValue
                conditions = [FieldCondition(key=k, match=MatchValue(value=v)) for k, v in filters.items()]
                query_filter = Filter(must=conditions)

            qr = self.qdrant_client.query_points(
                collection_name=self.config.COLLECTION_NAME,
                query=query_vector,
                query_filter=query_filter,
                limit=limit,
                score_threshold=score_threshold
            )

            results = [{'score': p.score, 'payload': p.payload} for p in qr.points]
            print(f"📊 {len(results)} filtreli sonuç bulundu")
            return results

        except Exception as e:
            print(f"❌ Filtreli arama hatası: {e}")
            return []

    def get_collection_info(self) -> dict:
        """Return basic statistics of the configured collection.

        Returns:
            A dict with name, point/vector counts, status and embedding
            settings, or ``{"error": message}`` when the lookup fails.
        """
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                # NOTE(review): this attribute does not appear to exist in every
                # qdrant-client version; read it defensively so its absence does
                # not turn the whole report into an error.
                "vectors_count": getattr(info, "vectors_count", None),
                "status": info.status,
                "embedding_model": "BGE-M3",
                "embedding_dim": self.config.EMBEDDING_DIM
            }
        except Exception as e:
            return {"error": str(e)}

# -------------------------
# Pipeline + main
# -------------------------
class YargitayPipeline:
    """Console-facing wrapper around ``YargitaySemanticProcessor``.

    Provides the end-to-end indexing run (``full_pipeline``) and an interactive
    search prompt (``interactive_search``).
    """

    def __init__(self, config: "Config"):
        """Build the processor for ``config`` and keep the config for defaults."""
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    @staticmethod
    def _parse_limit(raw: str, default: int = 5) -> int:
        """Turn the user's result-count answer into a positive integer.

        Empty, non-numeric, zero or negative answers fall back to ``default``.
        """
        try:
            value = int(raw.strip() or default)
        except ValueError:
            return default
        return value if value > 0 else default

    def full_pipeline(self, csv_path: str = None):
        """Run model check -> recreate collection -> chunk CSV -> upload.

        Args:
            csv_path: CSV file to index; defaults to ``config.CSV_FILE``.

        Returns:
            ``False`` when the model check fails or no chunks were produced,
            otherwise ``True``.
        """
        csv_path = csv_path or self.config.CSV_FILE
        print("🚀 Full pipeline başlıyor")
        emb_dim = self.processor.test_bge_connection()
        if not emb_dim:
            return False
        # recreate collection to ensure clean 512-dim index
        self.processor.create_qdrant_collection(recreate=True)
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("❌ Chunk bulunamadı")
            return False
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        print("\n📊 Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    def interactive_search(self):
        """Prompt for queries in a loop and print the matching chunks.

        Option 1 runs a plain search, option 2 additionally asks for a ``daire``
        filter, option 3 (or answering 'q'/'quit'/'exit' as the query) returns
        to the caller.
        """
        print("\n🔎 İnteraktif arama başlatıldı")
        while True:
            print("\n1) Basit arama\n2) Filtreli arama\n3) Ana menü")
            ch = input("Seçiminiz (1-3): ").strip()
            if ch == "3":
                break
            if ch not in {"1", "2"}:
                print("❌ Geçersiz seçim")
                continue
            q = input("🔍 Arama metni (çıkmak için 'q'): ").strip()
            if q.lower() in {'q', 'quit', 'exit'}:
                break
            if not q:
                continue
            # input() is deliberately outside any try block: the former bare
            # `except:` also swallowed Ctrl-C / EOF at this prompt.
            limit = self._parse_limit(input("Kaç sonuç? (default 5): "))

            if ch == "1":
                # no score threshold, so low-scoring hits stay visible
                results = self.processor.search_semantic(q, limit=limit, score_threshold=None)
            else:
                daire = input("Daire filtresi (örn: '6.HukukDairesi', boş = none): ").strip()
                filters = {'daire': daire} if daire else None
                results = self.processor.advanced_search_with_filters(q, filters=filters, limit=limit, score_threshold=None)

            if not results:
                print("❌ Sonuç bulunamadı")
                continue

            print(f"\n📋 {len(results)} sonuç:")
            for i, r in enumerate(results, 1):
                # a point stored without payload comes back as None
                p = r['payload'] or {}
                text = p.get('text', '')
                print(f"\n{i}. Skor: {r['score']:.4f}")
                print(f"   Esas No: {p.get('esas_no','N/A')} | Karar No: {p.get('karar_no','N/A')}")
                print(f"   Daire: {p.get('daire','N/A')} | Tarih: {p.get('tarih','N/A')}")
                text_preview = (text[:300] + '...') if len(text) > 300 else text
                print(f"   Metin: {text_preview}")
                print("-"*60)

def main():
    """Build the configuration and pipeline, then serve the console menu.

    The menu is shown repeatedly until option 4 is chosen. Note that the
    collection name set here differs from the ``Config`` default.
    """
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="bge_m3_chunks",
        EMBEDDING_DIM=512,
        BATCH_SIZE=100
    )
    pipeline = YargitayPipeline(config)

    rule = "=" * 60
    menu_lines = (
        "\n" + rule,
        "🏛️ YARGITAY BGE-M3 SEMANTİK SİSTEM",
        rule,
        "1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)",
        "2) İnteraktif arama",
        "3) Koleksiyon bilgilerini göster",
        "4) Çıkış",
    )

    while True:
        for line in menu_lines:
            print(line)
        choice = input("Seçiminiz (1-4): ").strip()

        # Exit first; every other branch falls through to the next iteration.
        if choice == "4":
            print("👋 Görüşürüz")
            return

        if choice == "1":
            prompt = f"CSV yolu (enter ile default: {config.CSV_FILE}): "
            csv_path = input(prompt).strip() or config.CSV_FILE
            succeeded = pipeline.full_pipeline(csv_path)
            print("✅ Tamamlandı" if succeeded else "❌ Hata çıktı")
        elif choice == "2":
            pipeline.interactive_search()
        elif choice == "3":
            collection_info = pipeline.processor.get_collection_info()
            print(json.dumps(collection_info, indent=2, ensure_ascii=False))
        else:
            print("❌ Geçersiz seçim")

if __name__ == "__main__":
    # Script entry point: verify that FlagEmbedding can be imported, exit with
    # status 1 and an install hint if it cannot, otherwise start the menu.
    try:
        from FlagEmbedding import BGEM3FlagModel
    except ImportError:
        print("❌ FlagEmbedding bulunamadı — pip install FlagEmbedding")
        raise SystemExit(1)
    else:
        print("✅ FlagEmbedding yüklü")
    main()

  from .autonotebook import tqdm as notebook_tqdm


True
✅ FlagEmbedding yüklü
🔮 BGE-M3 yükleniyor: BAAI/bge-m3 (device=cuda)


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 280243.03it/s]


✅ Hazır - Cihaz: NVIDIA RTX A6000

🏛️ YARGITAY BGE-M3 SEMANTİK SİSTEM
1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)
2) İnteraktif arama
3) Koleksiyon bilgilerini göster
4) Çıkış

🔎 İnteraktif arama başlatıldı

1) Basit arama
2) Filtreli arama
3) Ana menü


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


🔍 Query vector boyutu: 512 (hedef: 512)
📊 5 sonuç bulundu

📋 5 sonuç:

1. Skor: 0.0932
   Esas No: 2022/3993 E. | Karar No: 2024/775 K.
   Daire: 6.HukukDairesisi | Tarih: 04.02.2011,07.02.2011,15.07.2012,22.10.2014,16.11.2017,22.09.2020,02.12.2020,17.02.2022,15.07.2012,16.11.2017,12.07.2018,12.07.2018,08.06.2022,28.03.2024
   Metin: kat karşılığı inşaat sözleşmesi gereğince arsa sahibine verilmesi kararlaştırılan (noter kura çekimi ile müvekkiline isabet etmiş bulunan) bağımsız bölüm olduğuna göre artık yüklenicinin edimini yerine getirip getirmediğine bakılmaksızın arsa sahibinin sözleşmeyi ayakta tutarak tapu iptali ve tescil...
------------------------------------------------------------

2. Skor: 0.0901
   Esas No: 2022/3993 E. | Karar No: 2024/775 K.
   Daire: 6.HukukDairesisi | Tarih: 04.02.2011,07.02.2011,15.07.2012,22.10.2014,16.11.2017,22.09.2020,02.12.2020,17.02.2022,15.07.2012,16.11.2017,12.07.2018,12.07.2018,08.06.2022,28.03.2024
   Metin: üzerinde yapacağı küçük bir araşt