In [None]:
# main.py
# Dynamic model selection + SemChunk + Qdrant integration

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, HnswConfigDiff
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict, Optional, Tuple
import os
from qdrant_client import models
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch
from sentence_transformers import SentenceTransformer
from qdrant_client.models import NamedVector, NamedSparseVector, SparseVectorParams, SparseVector
from sklearn.feature_extraction.text import TfidfVectorizer
from qdrant_client.http.models import NamedVector, NamedSparseVector, SparseVector, SearchRequest

# Load environment variables from the project's .env file when the module is
# executed, and print the value load_dotenv returns.
# NOTE(review): the path is absolute and machine-specific — confirm it before
# running this on another host.
print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

def l2_normalize_tensor(t: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    """Scale a tensor to unit L2 length.

    A 1-D tensor is divided by its own norm. Any other tensor is divided by
    the norm taken along dimension 1, kept as a size-1 dimension so the
    division broadcasts.

    Args:
        t: Tensor to normalize.
        eps: Lower bound applied to the norm, so an all-zero input is returned
            unchanged instead of dividing by zero.

    Returns:
        A new tensor with the same shape as ``t``.
    """
    if t.dim() == 1:
        length = torch.norm(t)
    else:
        length = torch.norm(t, dim=1, keepdim=True)
    return t / length.clamp(min=eps)

# Model configurations, keyed by the short name the user types at the prompt.
#
# Every entry provides:
#   model_name      - Hugging Face model identifier
#   model_type      - "bge" (BGEM3FlagModel) or "sentence_transformer"
#   embedding_dim   - native size of the dense vector
#   max_seq_length  - maximum input length in tokens
#   supports_sparse - whether a sparse vector is produced alongside the dense one
#   supports_dense  - whether a dense vector is produced
#   description     - one-line text shown in the selection menu
#   sparse_type     - how the sparse vector is built ("native" or "tfidf");
#                     present on every entry whose supports_sparse is True
MODEL_CONFIGS = {
    "bge-m3": {
        "model_name": "BAAI/bge-m3",
        "model_type": "bge",
        "embedding_dim": 1024,
        "max_seq_length": 8192,
        "supports_sparse": True,
        "supports_dense": True,
        # Without this key the menu labelled BGE-M3's sparse support "NONE".
        "sparse_type": "native",
        "description": "BGE-M3 - Çok dilli, dense+sparse embedding destekli"
    },
    "bge-large": {
        "model_name": "BAAI/bge-large-en-v1.5",
        "model_type": "sentence_transformer",
        "embedding_dim": 1024,
        "max_seq_length": 512,
        "supports_sparse": False,
        "supports_dense": True,
        "description": "BGE Large - Sadece dense embedding"
    },
    "multilingual-e5": {
        "model_name": "intfloat/multilingual-e5-large",
        "model_type": "sentence_transformer",
        "embedding_dim": 1024,
        "max_seq_length": 512,
        "supports_sparse": False,
        "supports_dense": True,
        "description": "E5 Multilingual Large - Çok dilli dense embedding"
    },
    "turkish-bert": {
        "model_name": "dbmdz/bert-base-turkish-cased",
        "model_type": "sentence_transformer",
        "embedding_dim": 768,
        "max_seq_length": 512,
        "supports_sparse": False,
        "supports_dense": True,
        "description": "Turkish BERT - Türkçe özelleştirilmiş"
    },
    "distilbert-turkish": {
        "model_name": "dbmdz/distilbert-base-turkish-cased",
        "model_type": "sentence_transformer",
        "description": "Hızlı Türkçe DistilBERT",
        "embedding_dim": 768,
        "max_seq_length": 512,
        "supports_sparse": True,
        "supports_dense": True,
        "sparse_type": "tfidf"
    },
    "all-mpnet": {
        "model_name": "sentence-transformers/all-mpnet-base-v2",
        "model_type": "sentence_transformer",
        "embedding_dim": 768,
        "max_seq_length": 384,
        "supports_sparse": False,
        "supports_dense": True,
        # The description previously named a different model (All-MiniLM).
        "description": "All-MPNet - Genel amaçlı, hızlı"
    }
}

@dataclass
class Config:
    """Runtime settings shared by the model manager, processor and pipeline.

    The same instance is handed to every component, and
    ``YargitaySemanticProcessor.__init__`` overwrites ``BGE_MODEL_NAME``,
    ``MODEL_TYPE``, ``SUPPORTS_SPARSE`` and ``SUPPORTS_DENSE`` with the values
    of the selected ``MODEL_CONFIGS`` entry.
    """
    # Hugging Face identifier of the active model (replaced on model selection).
    BGE_MODEL_NAME: str = "BAAI/bge-m3"
    # Back-end kind of the active model: "bge" or "sentence_transformer".
    MODEL_TYPE: str = "bge"
    # Passed as use_fp16 when a BGEM3FlagModel is constructed.
    USE_FP16: bool = True
    # Evaluated once, when this class is defined.
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    # Chunk size handed to semchunk.chunkerify.
    TOKEN_SIZE: int = 512
    # tiktoken encoding used for chunking and token counts.
    ENCODING_NAME: str = "cl100k_base"
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "dynamic_model_chunks"
    # Size of the stored dense vector: embeddings are zero-padded or truncated
    # to this length, and the Qdrant collection is created with it.
    EMBEDDING_DIM: int = 512
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    # Number of texts encoded per model call.
    BATCH_SIZE: int = 100
    # Number of points sent per Qdrant upsert.
    DB_BATCH: int = 256
    SUPPORTS_SPARSE: bool = True
    SUPPORTS_DENSE: bool = True

class ModelManager:
    """Load one of the ``MODEL_CONFIGS`` embedding models and encode texts.

    Two back-ends are supported, selected by the entry's ``model_type``:

    * ``"bge"`` - ``BGEM3FlagModel``; produces dense vectors and, when the
      entry sets ``supports_sparse``, the model's own lexical weights.
    * ``"sentence_transformer"`` - ``SentenceTransformer`` for dense vectors,
      optionally paired with a ``TfidfVectorizer`` for sparse vectors.

    Attributes:
        model_key: Key of the selected entry in ``MODEL_CONFIGS``.
        config: Shared runtime configuration.
        model_config: The selected ``MODEL_CONFIGS`` entry.
        model: Loaded model object, or ``None`` before ``load_model``.
        vectorizer: ``TfidfVectorizer`` for sparse vectors, or ``None``.
        fitted_tfidf: ``True`` once the TF-IDF vocabulary has been pinned.
    """

    def __init__(self, model_key: str, config: "Config"):
        """Select a model configuration without loading the model.

        Raises:
            ValueError: If ``model_key`` is not a key of ``MODEL_CONFIGS``.
        """
        self.model_key = model_key
        self.config = config
        self.model_config = MODEL_CONFIGS.get(model_key)

        if not self.model_config:
            raise ValueError(f"Desteklenmeyen model: {model_key}")

        self.model = None
        self.vectorizer = None  # TF-IDF vectorizer for sparse embeddings
        self.fitted_tfidf = False

    def load_model(self) -> bool:
        """Instantiate the selected model.

        Returns:
            ``True`` on success; ``False`` (after printing the error) when the
            model cannot be constructed.
        """
        try:
            model_name = self.model_config["model_name"]
            model_type = self.model_config["model_type"]

            print(f"🔮 Model yükleniyor: {model_name} (tip: {model_type})")

            if model_type == "bge":
                self.model = BGEM3FlagModel(
                    model_name,
                    use_fp16=self.config.USE_FP16,
                    device=self.config.DEVICE
                )
            elif model_type == "sentence_transformer":
                self.model = SentenceTransformer(model_name, device=self.config.DEVICE)
                # Prepare TF-IDF for the sparse vector.
                # NOTE(review): English stop words are applied although several
                # configured models target Turkish text — confirm this is intended.
                if self.model_config["supports_sparse"]:
                    self.vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
                    self.fitted_tfidf = False
            else:
                raise ValueError(f"Desteklenmeyen model tipi: {model_type}")

            print(f"✅ Model yüklendi: {model_name}")
            return True

        except Exception as e:
            print(f"❌ Model yükleme hatası: {e}")
            return False

    def encode_texts(self, texts: List[str], fit_tfidf: bool = False) -> Tuple[List[List[float]], List[Dict]]:
        """Encode texts into dense and sparse vectors.

        Args:
            texts: Texts to encode.
            fit_tfidf: Force the TF-IDF vocabulary to be (re)fitted on
                ``texts``. Ignored by models without a TF-IDF vectorizer.

        Returns:
            ``(dense, sparse)``. ``dense`` holds one list of floats per text,
            zero-padded or truncated to ``config.EMBEDDING_DIM``. ``sparse``
            holds one ``{"indices": [...], "values": [...]}`` dict per text,
            with empty lists when no sparse vector is available.

            Encoding errors are not raised: the error is printed and all-zero
            dense vectors with empty sparse vectors are returned instead.
        """
        if not texts:
            return [], []

        target_dim = self.config.EMBEDDING_DIM
        try:
            if self.model is None:
                raise RuntimeError("Model yüklenmedi, önce load_model() çağrılmalı")

            if self.model_config["model_type"] == "bge":
                dense, sparse = self._encode_bge(texts)
            else:
                dense = self.model.encode(texts, convert_to_numpy=True).tolist()
                sparse = self._encode_tfidf(texts, fit_tfidf)

            return [self._fit_dimension(vec, target_dim) for vec in dense], sparse

        except Exception as e:
            print(f"❌ Encoding hatası: {e}")
            # Fallback keeps the caller's per-text alignment intact.
            return (
                [[0.0] * target_dim for _ in texts],
                self._empty_sparse(len(texts))
            )

    def _encode_bge(self, texts: List[str]) -> Tuple[List[List[float]], List[Dict]]:
        """Encode with BGE-M3 and convert its output to plain Python lists."""
        want_sparse = bool(self.model_config["supports_sparse"])
        result = self.model.encode(texts, return_dense=True, return_sparse=want_sparse)

        # The model returns a dict; the vectors are numpy rows (possibly
        # float16), so convert them to lists of Python floats.
        dense = np.asarray(result["dense_vecs"], dtype=float).tolist()
        if not want_sparse:
            return dense, self._empty_sparse(len(texts))

        # BGE-M3 publishes its sparse output as "lexical_weights": one
        # {token_id: weight} mapping per text. "sparse_vecs" is accepted as
        # well in case a wrapper already delivers indices/values.
        raw = result.get("lexical_weights") or result.get("sparse_vecs") or []
        sparse = [self._to_sparse_dict(item) for item in raw]
        if len(sparse) != len(texts):
            sparse = self._empty_sparse(len(texts))
        return dense, sparse

    def _encode_tfidf(self, texts: List[str], fit_tfidf: bool) -> List[Dict]:
        """Build TF-IDF sparse vectors with a vocabulary shared across calls."""
        if self.vectorizer is None:
            return self._empty_sparse(len(texts))

        try:
            if fit_tfidf or not self.fitted_tfidf:
                matrix = self.vectorizer.fit_transform(texts)
                # Term indices are only comparable between documents and
                # queries when they come from the same vocabulary, so the
                # vocabulary is pinned after the first real fit. A lone text
                # (e.g. a connectivity probe) gives degenerate statistics and
                # therefore never pins it unless explicitly requested.
                self.fitted_tfidf = fit_tfidf or len(texts) > 1
            else:
                matrix = self.vectorizer.transform(texts)
        except ValueError as e:
            # E.g. an empty vocabulary; keep the dense vectors usable.
            print(f"⚠️ TF-IDF sparse encoding atlandı: {e}")
            return self._empty_sparse(len(texts))

        sparse = []
        for i in range(matrix.shape[0]):
            row = matrix[i].tocoo()
            sparse.append({
                "indices": row.col.tolist(),
                "values": row.data.tolist()
            })
        return sparse

    @staticmethod
    def _empty_sparse(count: int) -> List[Dict]:
        """Return ``count`` independent empty sparse vectors."""
        return [{"indices": [], "values": []} for _ in range(count)]

    @staticmethod
    def _to_sparse_dict(item) -> Dict:
        """Convert one sparse model output into an indices/values dict."""
        if not item:
            return {"indices": [], "values": []}
        if "indices" in item:
            return {
                "indices": [int(i) for i in item.get("indices", [])],
                "values": [float(v) for v in item.get("values", [])]
            }
        # {token_id: weight} mapping; token ids may arrive as strings.
        return {
            "indices": [int(token) for token in item],
            "values": [float(weight) for weight in item.values()]
        }

    @staticmethod
    def _fit_dimension(vec, target_dim: int) -> List[float]:
        """Zero-pad or truncate one dense vector to ``target_dim`` entries."""
        if vec is None:
            return [0.0] * target_dim
        values = list(vec)
        if len(values) < target_dim:
            return values + [0.0] * (target_dim - len(values))
        return values[:target_dim]

    def get_model_info(self) -> Dict:
        """Return a copy of the model configuration plus runtime status."""
        info = self.model_config.copy()
        info["loaded"] = self.model is not None
        info["current_embedding_dim"] = self.config.EMBEDDING_DIM
        return info

class YargitaySemanticProcessor:
    """Chunk texts, embed them, and index and search them in Qdrant.

    The processor owns the tokenizer and chunker, the ``ModelManager`` for the
    selected model, and the Qdrant client. Dense vectors are stored under the
    name ``dense_vec`` and sparse vectors under ``sparse_vec``.
    """

    # Rank constant of Reciprocal Rank Fusion used by hybrid search.
    RRF_K = 60

    def __init__(self, config: "Config", model_key: str = "bge-m3"):
        """Set up chunker, model and Qdrant client.

        The selected model's name, type and sparse/dense capabilities are
        written into ``config``.

        Raises:
            ValueError: If ``model_key`` is unknown.
            RuntimeError: If the model cannot be loaded.
        """
        self.config = config
        self.model_manager = ModelManager(model_key, config)

        # Copy the model's properties into the shared config.
        model_config = self.model_manager.model_config
        self.config.BGE_MODEL_NAME = model_config["model_name"]
        self.config.MODEL_TYPE = model_config["model_type"]
        self.config.SUPPORTS_SPARSE = model_config["supports_sparse"]
        self.config.SUPPORTS_DENSE = model_config["supports_dense"]

        # Encoding & chunker
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # Load the model
        if not self.model_manager.load_model():
            raise RuntimeError("Model yüklenemedi!")

        # Qdrant
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"✅ Hazır - Model: {model_config['model_name']} | Cihaz: {device_name}")

    def test_model_connection(self):
        """Encode a probe sentence to verify that the model works.

        Returns:
            The dense embedding length, or ``None`` when encoding failed.
        """
        try:
            test_text = ["Yargıtay 6. Hukuk Dairesi'nin ihtiyati tedbir kararı"]
            dense_emb, sparse_emb = self.model_manager.encode_texts(test_text)

            # The model manager reports encoding failures by returning
            # all-zero vectors instead of raising, so an all-zero probe
            # result means the model is not usable.
            if not dense_emb or not any(dense_emb[0]):
                print("❌ Model bağlantı hatası: embedding üretilemedi")
                return None

            sparse_method = self.model_manager.model_config.get("sparse_type", "none")
            sparse_count = len(sparse_emb[0]['indices']) if sparse_emb else 0

            print(f"✅ Dense embedding boyutu: {len(dense_emb[0])}")
            print(f"🔍 Sparse embedding: {sparse_count} terim ({sparse_method.upper()})")

            return len(dense_emb[0])
        except Exception as e:
            print(f"❌ Model bağlantı hatası: {e}")
            return None

    def create_qdrant_collection(self, recreate: bool = False):
        """Create the configured collection if it does not exist.

        Args:
            recreate: Delete an existing collection of that name first.

        Raises:
            Exception: Re-raised when listing or creating collections fails.
        """
        collection_name = self.config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except Exception as e:
                # Best effort: creation below still decides success, but the
                # reason is no longer hidden.
                print(f"⚠️ Eski koleksiyon silinemedi: {e}")

        try:
            existing = [c.name for c in self.qdrant_client.get_collections().collections]
            if collection_name not in existing:
                # Dense vector config
                vectors_config = {
                    "dense_vec": models.VectorParams(
                        size=self.config.EMBEDDING_DIM,
                        distance=models.Distance.COSINE
                    ),
                }

                # Sparse config (only when the model supports it)
                sparse_config = {}
                if self.config.SUPPORTS_SPARSE:
                    sparse_config = {
                        "sparse_vec": models.SparseVectorParams(
                            index=models.SparseIndexParams(on_disk=False)
                        )
                    }

                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=vectors_config,
                    sparse_vectors_config=sparse_config if sparse_config else None
                )

                support_info = "Dense+Sparse" if sparse_config else "Dense only"
                print(f"✅ Koleksiyon oluşturuldu: {collection_name} ({support_info})")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {collection_name}")
        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: Optional[dict] = None) -> List[Dict]:
        """Split ``text`` into chunks and describe each non-empty one.

        Args:
            text: Text to split.
            metadata: Extra key/value pairs merged into every chunk dict.

        Returns:
            One dict per non-blank chunk with ``chunk_id`` (position among all
            chunks produced), ``text``, ``token_count`` and ``char_count``,
            plus the metadata. Empty list for blank input or on error.
        """
        if not text or not text.strip():
            return []
        try:
            chunks = self.chunker(text)
            result = []
            for i, c in enumerate(chunks):
                if c.strip():
                    cd = {
                        'chunk_id': i,
                        'text': c.strip(),
                        'token_count': len(self.encoding.encode(c)),
                        'char_count': len(c)
                    }
                    if metadata:
                        cd.update(metadata)
                    result.append(cd)
            return result
        except Exception as e:
            print(f"❌ Chunking hatası: {e}")
            return []

    def create_embeddings(self, texts: List[str], batch_size: Optional[int] = None):
        """Encode ``texts`` batch by batch.

        Args:
            texts: Texts to encode.
            batch_size: Texts per model call; defaults to ``config.BATCH_SIZE``.

        Returns:
            ``(dense, sparse)`` lists aligned with ``texts``. A failing batch
            contributes all-zero dense vectors and empty sparse vectors.
        """
        batch_size = batch_size or self.config.BATCH_SIZE
        all_embeddings_dense, all_embeddings_sparse = [], []
        total = len(texts)
        print(f"🔮 {total} metin işleniyor (model: {self.config.BGE_MODEL_NAME})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                dense, sparse = self.model_manager.encode_texts(batch_texts)
                all_embeddings_dense.extend(dense)
                all_embeddings_sparse.extend(sparse)

                print(f"  📊 Batch işlendi: {i + len(batch_texts)}/{total}")

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"❌ Embedding hatası (batch {i//batch_size+1}): {e}")
                # Fallback keeps the output aligned with the input.
                all_embeddings_dense.extend([[0.0]*self.config.EMBEDDING_DIM for _ in batch_texts])
                all_embeddings_sparse.extend([{"indices": [], "values": []} for _ in batch_texts])

        return all_embeddings_dense, all_embeddings_sparse

    @staticmethod
    def _clean_meta_value(value):
        """Make one CSV cell safe to store in a JSON payload.

        Missing values (``None``/NaN/NA) become ``''`` — the same default used
        for absent columns — and numpy scalars become plain Python values.
        """
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        if isinstance(value, np.generic):
            return value.item()
        return value

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """Read a CSV file and chunk the text of every row.

        The text is taken from the first of the columns ``rawText``,
        ``chunk_text``, ``text``, ``content``, ``metin`` that exists.

        Returns:
            All chunk dicts, each carrying the row's metadata. Empty list when
            the file cannot be read or no text column is found.
        """
        print(f"📄 CSV okunuyor: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"📊 {len(df)} satır yüklendi")
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return []

        text_column = next((c for c in ['rawText', 'chunk_text', 'text', 'content', 'metin'] if c in df.columns), None)
        if not text_column:
            print("❌ Ana metin sütunu bulunamadı")
            return []

        clean = self._clean_meta_value
        all_chunks = []
        # row_no counts rows positionally so progress reporting does not
        # depend on the index labels being integers.
        for row_no, (idx, row) in enumerate(df.iterrows(), start=1):
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            meta = {
                'original_index': clean(idx),
                'esas_no': clean(row.get('esasNo', '')),
                'karar_no': clean(row.get('kararNo', '')),
                'daire': clean(row.get('location', '')),
                'tarih': clean(row.get('extractedDates', '')),
                'document_id': clean(row.get('_id', '')),
                'model_used': self.config.BGE_MODEL_NAME  # record which model produced the vectors
            }
            chunks = self.semantic_chunk_text(str(text), meta)
            all_chunks.extend(chunks)
            if row_no % 5 == 0:
                print(f"  ✅ İşlenen satır: {row_no}/{len(df)} (Toplam chunk: {len(all_chunks)})")

        print(f"🧩 Toplam {len(all_chunks)} chunk oluşturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        """Embed ``chunks`` and upsert them into the configured collection.

        Each chunk dict becomes the payload of one point with a random UUID.
        Failed batches are reported and counted; the remaining batches are
        still uploaded.
        """
        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return

        print(f"🚀 {len(chunks)} chunk Qdrant'a yükleniyor...")
        texts = [c['text'] for c in chunks]
        embeddings_dense, embeddings_sparse = self.create_embeddings(texts)

        points = []
        for c, d, s in zip(chunks, embeddings_dense, embeddings_sparse):
            vector_dict = {"dense_vec": d}

            # Attach the sparse vector only when supported and non-empty.
            if self.config.SUPPORTS_SPARSE and s["indices"]:
                vector_dict["sparse_vec"] = SparseVector(
                    indices=s["indices"],
                    values=s["values"]
                )

            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector=vector_dict,
                payload=c,
            ))

        batch = self.config.DB_BATCH
        failed_batches = 0
        for i in range(0, len(points), batch):
            try:
                self.qdrant_client.upsert(collection_name=self.config.COLLECTION_NAME, points=points[i:i+batch])
                print(f"  ✅ Batch yüklendi: {min(i+batch,len(points))}/{len(points)}")
            except Exception as e:
                failed_batches += 1
                print(f"❌ Batch yükleme hatası: {e}")

        if failed_batches:
            print(f"⚠️ Yükleme tamamlandı, ancak {failed_batches} batch yüklenemedi")
        else:
            print("🎉 Yükleme tamamlandı!")

    def _dense_search(self, query_vector: List[float], limit: int, score_threshold: Optional[float]) -> List[Dict]:
        """Run one dense-vector search and return score/payload dicts."""
        hits = self.qdrant_client.search(
            collection_name=self.config.COLLECTION_NAME,
            query_vector=NamedVector(name="dense_vec", vector=query_vector),
            limit=limit,
            with_payload=True,
            score_threshold=score_threshold
        )
        return [{"score": p.score, "payload": p.payload} for p in hits]

    def search_semantic(self, query: str, limit: int = 10, score_threshold: Optional[float] = None) -> List[Dict]:
        """Dense semantic search.

        Returns:
            Up to ``limit`` ``{"score", "payload"}`` dicts; empty on error.
        """
        try:
            dense_emb, _ = self.model_manager.encode_texts([query])
            results = self._dense_search(dense_emb[0], limit, score_threshold)
            print(f"📊 {len(results)} sonuç bulundu (Dense only)")
            return results

        except Exception as e:
            print(f"❌ Semantic search hatası: {e}")
            return []

    @staticmethod
    def _fuse_ranked(ranked_lists: List[List], limit: int, k: int = 60) -> List[Dict]:
        """Merge ranked hit lists with Reciprocal Rank Fusion.

        Args:
            ranked_lists: One list per search, best hit first, each hit a
                ``(point_id, payload)`` pair.
            limit: Maximum number of fused results.
            k: RRF rank constant.

        Returns:
            Unique points, best first. ``score`` is the RRF sum divided by the
            best attainable sum, so a point ranked first in every list gets 1.0.
        """
        if limit <= 0 or not ranked_lists:
            return []

        scores: Dict = {}
        payloads: Dict = {}
        for hits in ranked_lists:
            for rank, (point_id, payload) in enumerate(hits, start=1):
                scores[point_id] = scores.get(point_id, 0.0) + 1.0 / (k + rank)
                if payloads.get(point_id) is None:
                    payloads[point_id] = payload

        best_possible = len(ranked_lists) / (k + 1)
        ordered = sorted(scores, key=scores.get, reverse=True)[:limit]
        return [
            {"score": scores[point_id] / best_possible, "payload": payloads[point_id]}
            for point_id in ordered
        ]

    def search_hybrid(self, query: str, limit: int = 10, score_threshold: Optional[float] = None) -> List[Dict]:
        """Hybrid search combining the dense and the sparse vector.

        Dense and sparse searches run in one batch and are merged with
        Reciprocal Rank Fusion, because their raw scores are on different
        scales. Falls back to dense search when the model has no sparse
        support or the query yields no sparse terms.

        Args:
            query: Search text.
            limit: Maximum number of results.
            score_threshold: Passed to each individual search, i.e. applied to
                that search's own raw score.

        Returns:
            Up to ``limit`` unique ``{"score", "payload"}`` dicts; for fused
            results ``score`` is the normalized fusion score, not a similarity.
        """
        if not self.config.SUPPORTS_SPARSE:
            print("⚠️ Bu model sparse embedding desteklemiyor, dense search yapılıyor...")
            return self.search_semantic(query, limit, score_threshold)

        try:
            dense_emb, sparse_emb = self.model_manager.encode_texts([query])
            sparse_query = sparse_emb[0] if sparse_emb else {"indices": [], "values": []}

            if not sparse_query["indices"]:
                # Nothing to fuse: keep the plain similarity scores.
                results = self._dense_search(dense_emb[0], limit, score_threshold)
                print(f"📊 {len(results)} sonuç bulundu (Dense only)")
                return results

            requests = [
                SearchRequest(
                    vector=NamedVector(name="dense_vec", vector=dense_emb[0]),
                    limit=limit,
                    with_payload=True,
                    score_threshold=score_threshold
                ),
                SearchRequest(
                    vector=NamedSparseVector(
                        name="sparse_vec",
                        vector=SparseVector(
                            indices=sparse_query["indices"],
                            values=sparse_query["values"]
                        )
                    ),
                    limit=limit,
                    # Without this the sparse hits come back without payload.
                    with_payload=True,
                    score_threshold=score_threshold
                ),
            ]

            qr = self.qdrant_client.search_batch(
                collection_name=self.config.COLLECTION_NAME,
                requests=requests,
            )

            ranked = [[(p.id, p.payload) for p in request_result] for request_result in qr]
            results = self._fuse_ranked(ranked, limit, self.RRF_K)

            print(f"📊 {len(results)} sonuç bulundu (Hybrid)")
            return results

        except Exception as e:
            print(f"❌ Hybrid search hatası: {e}")
            return []

    def get_collection_info(self):
        """Return collection statistics and model info, or ``{"error": ...}``."""
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            model_info = self.model_manager.get_model_info()
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                # Read defensively so a client that does not expose this
                # field does not turn the whole report into an error.
                "vectors_count": getattr(info, "vectors_count", None),
                "status": info.status,
                "model_info": model_info,
                "embedding_dim": self.config.EMBEDDING_DIM,
                "supports_sparse": self.config.SUPPORTS_SPARSE
            }
        except Exception as e:
            return {"error": str(e)}

class YargitayPipeline:
    def __init__(self, config: Config, model_key: str = "bge-m3"):
        self.processor = YargitaySemanticProcessor(config, model_key)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        csv_path = csv_path or self.config.CSV_FILE
        print("🚀 Full pipeline başlıyor")
        emb_dim = self.processor.test_model_connection()
        if not emb_dim:
            return False
        self.processor.create_qdrant_collection(recreate=True)
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("❌ Chunk bulunamadı")
            return False
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        print("\n📊 Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    def interactive_search(self):
        print("\n🔎 İnteraktif arama başlatıldı")
        model_info = self.processor.model_manager.get_model_info()
        print(f"📱 Aktif model: {model_info['model_name']}")
        print(f"🔧 Özellikler: Dense✅, Sparse{'✅' if model_info['supports_sparse'] else '❌'}")
        
        while True:
            print(f"\n{'='*50}")
            print("🔍 ARAMA SEÇENEKLERİ")
            print(f"{'='*50}")
            print("1) Dense arama (Semantic)")
            if model_info['supports_sparse']:
                print("2) Hybrid arama (Dense + Sparse)")
                print("3) Karşılaştırmalı arama (Her iki yöntem)")
            else:
                print("2) ❌ Hybrid arama (Bu model desteklemiyor)")
                print("3) ❌ Karşılaştırma (Sparse desteklenmiyor)")
            print("4) Ana menü")
            
            ch = input("Seçiminiz (1-4): ").strip()
            if ch=="4":
                break
            if ch not in {"1","2","3"}:
                print("❌ Geçersiz seçim")
                continue
                
            q = input("🔍 Arama metni (çıkmak için 'q'): ").strip()
            if q.lower() in {'q','quit','exit'}:
                break
            if not q:
                continue
                
            try:
                limit = int(input("Kaç sonuç? (default 5): ") or 5)
            except:
                limit = 5

            if ch=="1":
                print("\n🎯 Dense Semantic Search...")
                results = self.processor.search_semantic(q, limit=limit)
                self._display_results(results, "Dense")
                
            elif ch=="2" and model_info['supports_sparse']:
                print("\n🔀 Hybrid Search...")
                results = self.processor.search_hybrid(q, limit=limit)
                self._display_results(results, "Hybrid")
                
            elif ch=="3" and model_info['supports_sparse']:
                print("\n📊 Karşılaştırmalı Arama...")
                print("🎯 Dense sonuçlar:")
                dense_results = self.processor.search_semantic(q, limit=limit)
                self._display_results(dense_results, "Dense", show_comparison=True)
                
                print(f"\n{'='*60}")
                print("🔀 Hybrid sonuçlar:")
                hybrid_results = self.processor.search_hybrid(q, limit=limit)
                self._display_results(hybrid_results, "Hybrid", show_comparison=True)
                
            else:
                print("⚠️ Bu özellik seçilen model tarafından desteklenmiyor.")

    def _display_results(self, results: List[Dict], search_type: str, show_comparison: bool = False):
        """Sonuçları görüntüle"""
        if not results:
            print("❌ Sonuç bulunamadı")
            return

        print(f"\n📋 {len(results)} {search_type} sonuç:")
        for i, r in enumerate(results, 1):
            p = r.get("payload") or {}
            score = r.get("score", 0.0)
            
            # Skor rengine göre emoji
            if score > 0.8:
                score_icon = "🟢"
            elif score > 0.6:
                score_icon = "🟡"
            else:
                score_icon = "🔴"
                
            print(f"\n{i}. {score_icon} Skor: {score:.4f}")
            print(f"   📄 Model: {p.get('model_used','N/A')}")
            print(f"   🏛️ Daire: {p.get('daire','N/A')} | 📅 Tarih: {p.get('tarih','N/A')}")
            print(f"   📋 Esas: {p.get('esas_no','N/A')} | 🔢 Karar: {p.get('karar_no','N/A')}")
            
            text = p.get('text', '')
            if len(text) > 200:
                text_preview = text[:200] + "..."
            else:
                text_preview = text
            print(f"   📝 Metin: {text_preview}")
            
            if show_comparison:
                print(f"   🏷️ Tip: {search_type}")
            print("-"*60)

def select_model() -> str:
    """Show the model menu and ask the user to pick a model.

    The sparse-support labels and recommendations are derived from
    ``MODEL_CONFIGS``, so the menu cannot contradict the configuration.

    Returns:
        A key of ``MODEL_CONFIGS``; an empty answer selects ``"bge-m3"``.
    """
    def sparse_kind(cfg: Dict) -> str:
        """Name how ``cfg`` builds sparse vectors: native, tfidf or none."""
        if not cfg['supports_sparse']:
            return 'none'
        # Entries without an explicit sparse_type fall back on the back-end.
        return cfg.get('sparse_type') or ('native' if cfg['model_type'] == 'bge' else 'tfidf')

    print("\n" + "="*60)
    print("🤖 MODEL SEÇİMİ")
    print("="*60)

    # Group the models into categories.
    categories = {
        "🌍 Çok Dilli Modeller": ["bge-m3", "multilingual-e5"],
        "🇹🇷 Türkçe Özel": ["turkish-bert", "distilbert-turkish"],
        "⚡ Hızlı & Genel": ["bge-large", "all-mpnet"]
    }

    for category, model_keys in categories.items():
        print(f"\n{category}:")
        for model_key in model_keys:
            config = MODEL_CONFIGS[model_key]
            sparse_icon = f"✅({sparse_kind(config).upper()})" if config['supports_sparse'] else "❌"
            print(f"  {model_key}: {config['description']}")
            print(f"    └─ Boyut: {config['embedding_dim']}, Sparse: {sparse_icon}, Max Token: {config['max_seq_length']}")

    kind_labels = {'native': 'Native sparse', 'tfidf': 'TF-IDF sparse', 'none': 'Sadece dense'}
    recommendations = [
        ("Türkçe ağırlıklı", "turkish-bert"),
        ("En iyi performans", "bge-m3"),
        ("Hızlı Türkçe", "distilbert-turkish"),
        ("Çok dilli", "multilingual-e5"),
    ]
    print(f"\n💡 Öneri:")
    for label, model_key in recommendations:
        kind = sparse_kind(MODEL_CONFIGS[model_key])
        print(f"  • {label}: {model_key} ({kind_labels.get(kind, kind.upper())})")

    print(f"\n🔍 Sparse Embedding Türleri:")
    print("  • NATIVE: Model'in kendi sparse sistemi (sadece BGE-M3)")
    print("  • TFIDF: TF-IDF tabanlı sparse embedding (sparse destekli diğer modeller)")

    print("\nMevcut modeller:", ", ".join(MODEL_CONFIGS.keys()))

    while True:
        choice = input("\nModel seçin (default: bge-m3): ").strip().lower() or "bge-m3"
        if choice in MODEL_CONFIGS:
            print(f"✅ Seçilen model: {MODEL_CONFIGS[choice]['model_name']}")
            return choice
        print("❌ Geçersiz model! Tekrar deneyin.")

def main():
    """Run the interactive console: pick a model, then index or search.

    Switching models builds a fresh ``Config`` and pipeline and swaps them in
    only when construction succeeded, so a failed switch leaves the running
    pipeline and its configuration untouched.
    """
    print("🏛️ YARGITAY DİNAMİK MODEL SİSTEMİ")

    def build_config(model_key: str) -> "Config":
        """Create the configuration for one model (collection named after it)."""
        return Config(
            CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
            TOKEN_SIZE=512,
            QDRANT_URL="http://localhost:6333",
            COLLECTION_NAME=f"{model_key}_chunks",
            # Stored vectors are capped at 512 dimensions.
            EMBEDDING_DIM=min(512, MODEL_CONFIGS[model_key]["embedding_dim"]),
            BATCH_SIZE=100
        )

    # Model selection
    selected_model = select_model()
    selected_config = MODEL_CONFIGS[selected_model]
    config = build_config(selected_model)

    try:
        pipeline = YargitayPipeline(config, selected_model)
    except Exception as e:
        print(f"❌ Pipeline oluşturma hatası: {e}")
        return

    while True:
        print("\n" + "="*60)
        print(f"🏛️ YARGITAY SEMANTİK SİSTEM - Model: {selected_config['model_name']}")
        print("="*60)
        print("1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)")
        print("2) İnteraktif arama")
        print("3) Koleksiyon bilgilerini göster")
        print("4) Model değiştir")
        print("5) Çıkış")
        choice = input("Seçiminiz (1-5): ").strip()

        if choice == "1":
            csv_path = input(f"CSV yolu (enter ile default: {config.CSV_FILE}): ").strip() or config.CSV_FILE
            ok = pipeline.full_pipeline(csv_path)
            print("✅ Tamamlandı" if ok else "❌ Hata çıktı")
        elif choice == "2":
            pipeline.interactive_search()
        elif choice == "3":
            info = pipeline.processor.get_collection_info()
            print(json.dumps(info, indent=2, ensure_ascii=False))
        elif choice == "4":
            # Build the replacement first; the active pipeline shares its
            # Config object, so that object must not be edited in place.
            candidate_model = select_model()
            candidate_config = build_config(candidate_model)
            try:
                candidate_pipeline = YargitayPipeline(candidate_config, candidate_model)
            except Exception as e:
                print(f"❌ Model değiştirme hatası: {e}")
                continue
            pipeline = candidate_pipeline
            config = candidate_config
            selected_model = candidate_model
            selected_config = MODEL_CONFIGS[candidate_model]
            print("✅ Model değiştirildi!")
        elif choice == "5":
            print("👋 Görüşürüz")
            break
        else:
            print("❌ Geçersiz seçim")


if __name__ == "__main__":
    main()

True
🏛️ YARGITAY DİNAMİK MODEL SİSTEMİ

🤖 MODEL SEÇİMİ

🌍 Çok Dilli Modeller:
  bge-m3: BGE-M3 - Çok dilli, dense+sparse embedding destekli
    └─ Boyut: 1024, Sparse: ✅(NONE), Max Token: 8192
  multilingual-e5: E5 Multilingual Large - Çok dilli dense embedding
    └─ Boyut: 1024, Sparse: ❌, Max Token: 512

🇹🇷 Türkçe Özel:
  turkish-bert: Turkish BERT - Türkçe özelleştirilmiş
    └─ Boyut: 768, Sparse: ❌, Max Token: 512
  distilbert-turkish: Hızlı Türkçe DistilBERT
    └─ Boyut: 768, Sparse: ✅(TFIDF), Max Token: 512

⚡ Hızlı & Genel:
  bge-large: BGE Large - Sadece dense embedding
    └─ Boyut: 1024, Sparse: ❌, Max Token: 512
  all-mpnet: All-MiniLM - Genel amaçlı, hızlı
    └─ Boyut: 768, Sparse: ❌, Max Token: 384

💡 Öneri:
  • Türkçe ağırlıklı: turkish-bert (TF-IDF sparse)
  • En iyi performans: bge-m3 (Native sparse)
  • Hızlı Türkçe: distilbert-turkish (TF-IDF sparse)
  • Çok dilli: multilingual-e5 (TF-IDF sparse)

🔍 Sparse Embedding Türleri:
  • NATIVE: Model'in kendi sparse siste

No sentence-transformers model found with name dbmdz/distilbert-base-turkish-cased. Creating a new one with mean pooling.


✅ Model yüklendi: dbmdz/distilbert-base-turkish-cased
✅ Hazır - Model: dbmdz/distilbert-base-turkish-cased | Cihaz: NVIDIA RTX A6000

🏛️ YARGITAY SEMANTİK SİSTEM - Model: dbmdz/distilbert-base-turkish-cased
1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)
2) İnteraktif arama
3) Koleksiyon bilgilerini göster
4) Model değiştir
5) Çıkış
🚀 Full pipeline başlıyor
✅ Dense embedding boyutu: 512
🔍 Sparse embedding: 7 terim (TFIDF)
❌ Model bağlantı hatası: 'ModelManager' object has no attribute 'fitted_tfidf'
❌ Hata çıktı

🏛️ YARGITAY SEMANTİK SİSTEM - Model: dbmdz/distilbert-base-turkish-cased
1) Tam pipeline çalıştır (CSV -> chunks -> embed -> qdrant)
2) İnteraktif arama
3) Koleksiyon bilgilerini göster
4) Model değiştir
5) Çıkış
