In [None]:
# main.py
# Dynamic model selection + SemChunk + Qdrant integration

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, HnswConfigDiff
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict, Optional, Tuple
import os
from qdrant_client import models
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch
from sentence_transformers import SentenceTransformer
from qdrant_client.models import NamedVector, NamedSparseVector, SparseVectorParams, SparseVector
from sklearn.feature_extraction.text import TfidfVectorizer
from qdrant_client.http.models import NamedVector, NamedSparseVector, SparseVector, SearchRequest

# Load environment variables from the project's .env file at import time and
# print the value returned by load_dotenv.
print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

def l2_normalize_tensor(t: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    """Scale ``t`` to unit L2 norm.

    A 1-D tensor is divided by its overall norm. Any other tensor is divided
    by the norm taken along dimension 1 (kept so it broadcasts back over the
    input). The norm is clamped from below at ``eps`` so an all-zero input
    does not divide by zero.
    """
    is_vector = t.dim() == 1
    length = torch.norm(t) if is_vector else torch.norm(t, dim=1, keepdim=True)
    return t / length.clamp(min=eps)

# Model configurations.
#
# Each entry describes one selectable embedding model:
#   model_name      - Hugging Face model id handed to the loader
#   model_type      - loader family: "bge" (BGEM3FlagModel) or
#                     "sentence_transformer" (SentenceTransformer)
#   embedding_dim   - native dense embedding size of the model
#   max_seq_length  - maximum input length shown in the selection menu
#   supports_sparse - whether a sparse vector is produced next to the dense one
#   supports_dense  - whether a dense vector is produced
#   sparse_type     - label of the sparse method; present exactly when
#                     supports_sparse is True ("native" or "tfidf")
#   description     - text shown in the selection menu
MODEL_CONFIGS = {
    "bge-m3": {
        "model_name": "BAAI/bge-m3",
        "model_type": "bge",
        "embedding_dim": 1024,
        "max_seq_length": 8192,
        "supports_sparse": True,
        "supports_dense": True,
        # Without this key the menu rendered the sparse method as "NONE".
        "sparse_type": "native",
        "description": "BGE-M3 - √áok dilli, dense+sparse embedding destekli"
    },
    "bge-large": {
        "model_name": "BAAI/bge-large-en-v1.5",
        "model_type": "sentence_transformer",
        "embedding_dim": 1024,
        "max_seq_length": 512,
        "supports_sparse": False,
        "supports_dense": True,
        "description": "BGE Large - Sadece dense embedding"
    },
    "multilingual-e5": {
        "model_name": "intfloat/multilingual-e5-large",
        "model_type": "sentence_transformer",
        "embedding_dim": 1024,
        "max_seq_length": 512,
        "supports_sparse": False,
        "supports_dense": True,
        "description": "E5 Multilingual Large - √áok dilli dense embedding"
    },
    "turkish-bert": {
        "model_name": "dbmdz/bert-base-turkish-cased",
        "model_type": "sentence_transformer",
        "embedding_dim": 768,
        "max_seq_length": 512,
        "supports_sparse": False,
        "supports_dense": True,
        "description": "Turkish BERT - T√ºrk√ße √∂zelle≈ütirilmi≈ü"
    },
    "distilbert-turkish": {
        "model_name": "dbmdz/distilbert-base-turkish-cased",
        "model_type": "sentence_transformer",
        "description": "Hƒ±zlƒ± T√ºrk√ße DistilBERT",
        "embedding_dim": 768,
        "max_seq_length": 512,
        "supports_sparse": True,
        "supports_dense": True,
        "sparse_type": "tfidf"
    },
    "all-mpnet": {
        "model_name": "sentence-transformers/all-mpnet-base-v2",
        "model_type": "sentence_transformer",
        "embedding_dim": 768,
        "max_seq_length": 384,
        "supports_sparse": False,
        "supports_dense": True,
        # The description previously named a different model family (MiniLM).
        "description": "All-MPNet - Genel ama√ßlƒ±, hƒ±zlƒ±"
    }
}

@dataclass
class Config:
    """Runtime settings shared by the model manager, the processor and the pipeline."""
    # Model id; YargitaySemanticProcessor overwrites it from MODEL_CONFIGS.
    BGE_MODEL_NAME: str = "BAAI/bge-m3"
    # Loader family ("bge" or "sentence_transformer"); also overwritten by the processor.
    MODEL_TYPE: str = "bge"
    # Forwarded as use_fp16 when a BGEM3FlagModel is created.
    USE_FP16: bool = True
    # NOTE: evaluated once, when the class body runs, not per instance.
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    # Chunk size handed to semchunk.chunkerify.
    TOKEN_SIZE: int = 512
    # tiktoken encoding used for chunking and token counting.
    ENCODING_NAME: str = "cl100k_base"
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "dynamic_model_chunks"
    # Dense vectors are zero-padded or truncated to this length; it is also the
    # vector size of the Qdrant collection.
    EMBEDDING_DIM: int = 512
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    # Number of texts encoded per model call.
    BATCH_SIZE: int = 100
    # Number of points sent per Qdrant upsert.
    DB_BATCH: int = 256
    # Capability flags; overwritten by the processor from the selected model's config.
    SUPPORTS_SPARSE: bool = True
    SUPPORTS_DENSE: bool = True

class ModelManager:
    """Load one embedding model described in ``MODEL_CONFIGS`` and encode texts.

    Dense vectors are always produced. Sparse vectors come from

    * BGE-M3's lexical weights when ``model_type == "bge"``, or
    * a stateless hashed term-frequency vectorizer for sentence-transformer
      models whose config sets ``supports_sparse``.

    The hashed vectorizer replaces a TF-IDF vectorizer that was re-fitted on
    every batch and on every query. Re-fitting gave each call its own
    vocabulary, so the same index meant different terms in different calls and
    stored vectors could not be matched against query vectors. Hashing maps a
    term to the same index in every call and every process.
    NOTE(review): hashing carries no IDF weighting; confirm this trade-off is
    acceptable for the models labelled "tfidf".
    """

    # Width of the hashed sparse feature space (sentence-transformer models).
    SPARSE_HASH_FEATURES = 2 ** 18

    def __init__(self, model_key: str, config: "Config"):
        """Look up ``model_key`` in ``MODEL_CONFIGS``; the model is not loaded yet.

        Raises:
            ValueError: if ``model_key`` is not a key of ``MODEL_CONFIGS``.
        """
        self.model_key = model_key
        self.config = config
        self.model_config = MODEL_CONFIGS.get(model_key)

        if not self.model_config:
            raise ValueError(f"Desteklenmeyen model: {model_key}")

        self.model = None
        self.vectorizer = None  # sparse vectorizer for sentence-transformer models

    def load_model(self):
        """Load the configured model; return True on success, False on any error."""
        try:
            model_name = self.model_config["model_name"]
            model_type = self.model_config["model_type"]

            print(f"üîÆ Model y√ºkleniyor: {model_name} (tip: {model_type})")

            if model_type == "bge":
                self.model = BGEM3FlagModel(
                    model_name,
                    use_fp16=self.config.USE_FP16,
                    device=self.config.DEVICE
                )
            elif model_type == "sentence_transformer":
                self.model = SentenceTransformer(model_name, device=self.config.DEVICE)
                if self.model_config["supports_sparse"]:
                    # Local import: scikit-learn is already a dependency of this
                    # file, only this class needs the hashing variant.
                    from sklearn.feature_extraction.text import HashingVectorizer
                    # alternate_sign=False keeps every weight non-negative;
                    # no fitting is needed, so indices are stable across calls.
                    self.vectorizer = HashingVectorizer(
                        n_features=self.SPARSE_HASH_FEATURES,
                        alternate_sign=False,
                        norm="l2"
                    )
            else:
                raise ValueError(f"Desteklenmeyen model tipi: {model_type}")

            print(f"‚úÖ Model y√ºklendi: {model_name}")
            return True

        except Exception as e:
            print(f"‚ùå Model y√ºkleme hatasƒ±: {e}")
            return False

    @staticmethod
    def _empty_sparse() -> Dict:
        """Return a fresh empty sparse vector."""
        return {"indices": [], "values": []}

    @staticmethod
    def _lexical_to_sparse(weights) -> Dict:
        """Convert one BGE-M3 ``{token_id: weight}`` mapping to indices/values lists."""
        if not weights:
            return {"indices": [], "values": []}
        indices, values = [], []
        for token_id, weight in weights.items():
            indices.append(int(token_id))  # ids may arrive as strings
            values.append(float(weight))
        return {"indices": indices, "values": values}

    @staticmethod
    def _fit_dimension(vec, target_dim: int) -> List[float]:
        """Return ``vec`` as a plain list, zero-padded or truncated to ``target_dim``."""
        if vec is None:
            return [0.0] * target_dim
        # numpy rows must become lists: `array + list` would not concatenate.
        values = vec.tolist() if hasattr(vec, "tolist") else list(vec)
        if len(values) < target_dim:
            return values + [0.0] * (target_dim - len(values))
        return values[:target_dim]

    def encode_texts(self, texts: List[str]) -> Tuple[List[List[float]], List[Dict]]:
        """Encode ``texts`` into dense and sparse vectors.

        Returns:
            ``(dense, sparse)`` with one entry per input text. Each dense
            entry is a list of ``config.EMBEDDING_DIM`` floats. Each sparse
            entry is ``{"indices": [...], "values": [...]}`` (both lists empty
            when no sparse vector is available).

        On any error the exception is printed and all-zero dense vectors with
        empty sparse vectors are returned instead of raising.
        """
        try:
            if self.model_config["model_type"] == "bge" and hasattr(self.model, 'encode'):
                want_sparse = bool(self.model_config["supports_sparse"])
                result = self.model.encode(
                    texts,
                    return_dense=True,
                    return_sparse=want_sparse
                )
                # encode() returns a dict; the sparse part is stored under
                # "lexical_weights" as one {token_id: weight} mapping per text.
                dense_embeddings = result["dense_vecs"]
                lexical = result.get("lexical_weights") if want_sparse else None
                if lexical:
                    sparse_embeddings = [self._lexical_to_sparse(w) for w in lexical]
                else:
                    sparse_embeddings = [self._empty_sparse() for _ in texts]
            else:
                dense_embeddings = self.model.encode(texts, convert_to_numpy=True)

                if self.vectorizer is not None:
                    matrix = self.vectorizer.transform(texts).tocsr()
                    sparse_embeddings = [
                        {
                            "indices": matrix.indices[start:end].tolist(),
                            "values": matrix.data[start:end].tolist()
                        }
                        for start, end in zip(matrix.indptr[:-1], matrix.indptr[1:])
                    ]
                else:
                    sparse_embeddings = [self._empty_sparse() for _ in texts]

            # Bring every dense vector to the collection's vector size.
            target_dim = self.config.EMBEDDING_DIM
            dense_clean = [self._fit_dimension(vec, target_dim) for vec in dense_embeddings]

            return dense_clean, sparse_embeddings

        except Exception as e:
            print(f"‚ùå Encoding hatasƒ±: {e}")
            # Best-effort fallback so callers always get one entry per text.
            return (
                [[0.0] * self.config.EMBEDDING_DIM for _ in texts],
                [self._empty_sparse() for _ in texts]
            )

    def get_model_info(self) -> Dict:
        """Return a copy of the model config plus ``loaded`` and ``current_embedding_dim``."""
        info = self.model_config.copy()
        info["loaded"] = self.model is not None
        info["current_embedding_dim"] = self.config.EMBEDDING_DIM
        return info

class YargitaySemanticProcessor:
    """Chunk texts from a CSV file, embed them and store/search them in Qdrant.

    The collection holds a named dense vector ("dense_vec") and, when the
    selected model supports it, a named sparse vector ("sparse_vec").
    """

    def __init__(self, config: "Config", model_key: str = "bge-m3"):
        """Build the chunker, load the model and connect to Qdrant.

        ``config`` is mutated: the model name, model type and capability flags
        are overwritten with the values of the selected model.

        Raises:
            RuntimeError: if the model cannot be loaded.
        """
        self.config = config
        self.model_manager = ModelManager(model_key, config)

        # Mirror the selected model's properties into the shared config.
        model_config = self.model_manager.model_config
        self.config.BGE_MODEL_NAME = model_config["model_name"]
        self.config.MODEL_TYPE = model_config["model_type"]
        self.config.SUPPORTS_SPARSE = model_config["supports_sparse"]
        self.config.SUPPORTS_DENSE = model_config["supports_dense"]

        # Tokenizer and chunker
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        if not self.model_manager.load_model():
            raise RuntimeError("Model y√ºklenemedi!")

        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"‚úÖ Hazƒ±r - Model: {model_config['model_name']} | Cihaz: {device_name}")

    def test_model_connection(self):
        """Encode one sample sentence; return the dense dimension or None on failure.

        An all-zero dense vector is treated as a failure, because that is what
        the model manager returns when encoding raised internally.
        """
        try:
            test_text = ["Yargƒ±tay 6. Hukuk Dairesi'nin ihtiyati tedbir kararƒ±"]
            dense_emb, sparse_emb = self.model_manager.encode_texts(test_text)

            if not any(dense_emb[0]):
                raise RuntimeError("model sifir vektor dondurdu (encode basarisiz)")

            sparse_method = self.model_manager.model_config.get("sparse_type", "none")
            sparse_count = len(sparse_emb[0]['indices'])

            print(f"‚úÖ Dense embedding boyutu: {len(dense_emb[0])}")
            print(f"üîç Sparse embedding: {sparse_count} terim ({sparse_method.upper()})")

            return len(dense_emb[0])
        except Exception as e:
            print(f"‚ùå Model baƒülantƒ± hatasƒ±: {e}")
            return None

    def create_qdrant_collection(self, recreate: bool = False):
        """Create the collection if it does not exist.

        Args:
            recreate: delete an existing collection of the same name first.

        Raises:
            Exception: whatever the Qdrant client raises while listing or
                creating collections (re-raised after being printed).
        """
        collection_name = self.config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"üóëÔ∏è Eski koleksiyon silindi: {collection_name}")
            except Exception as e:
                # Still best-effort, but no longer silent.
                print(f"‚ö†Ô∏è Koleksiyon silinemedi: {e}")

        try:
            existing = [c.name for c in self.qdrant_client.get_collections().collections]
            if collection_name not in existing:
                vectors_config = {
                    "dense_vec": models.VectorParams(
                        size=self.config.EMBEDDING_DIM,
                        distance=models.Distance.COSINE
                    ),
                }

                # Sparse config only when the model produces sparse vectors.
                sparse_config = {}
                if self.config.SUPPORTS_SPARSE:
                    sparse_config = {
                        "sparse_vec": models.SparseVectorParams(
                            index=models.SparseIndexParams(on_disk=False)
                        )
                    }

                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=vectors_config,
                    sparse_vectors_config=sparse_config if sparse_config else None
                )

                support_info = "Dense+Sparse" if sparse_config else "Dense only"
                print(f"‚úÖ Koleksiyon olu≈üturuldu: {collection_name} ({support_info})")
            else:
                print(f"‚ÑπÔ∏è Koleksiyon zaten var: {collection_name}")
        except Exception as e:
            print(f"‚ùå Koleksiyon olu≈üturma hatasƒ±: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: Optional[dict] = None) -> List[Dict]:
        """Split ``text`` into chunks and return one dict per non-blank chunk.

        Each dict holds ``chunk_id`` (position in the chunker output),
        ``text`` (stripped), ``token_count``, ``char_count`` and every key of
        ``metadata``. Returns ``[]`` for blank input or when chunking fails.
        """
        if not text or not text.strip():
            return []
        try:
            result = []
            for i, piece in enumerate(self.chunker(text)):
                stripped = piece.strip()
                if not stripped:
                    continue
                chunk = {
                    'chunk_id': i,
                    'text': stripped,
                    'token_count': len(self.encoding.encode(piece)),
                    'char_count': len(piece)
                }
                if metadata:
                    chunk.update(metadata)
                result.append(chunk)
            return result
        except Exception as e:
            print(f"‚ùå Chunking hatasƒ±: {e}")
            return []

    def create_embeddings(self, texts: List[str], batch_size: int = None):
        """Encode ``texts`` batch by batch; return ``(dense_list, sparse_list)``.

        A failing batch is replaced by zero dense vectors and empty sparse
        vectors so both lists stay aligned with ``texts``.
        """
        batch_size = batch_size or self.config.BATCH_SIZE
        all_embeddings_dense, all_embeddings_sparse = [], []
        total = len(texts)
        print(f"üîÆ {total} metin i≈üleniyor (model: {self.config.BGE_MODEL_NAME})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                dense, sparse = self.model_manager.encode_texts(batch_texts)
                all_embeddings_dense.extend(dense)
                all_embeddings_sparse.extend(sparse)

                print(f"  üìä Batch i≈ülendi: {i + len(batch_texts)}/{total}")

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"‚ùå Embedding hatasƒ± (batch {i//batch_size+1}): {e}")
                all_embeddings_dense.extend([[0.0]*self.config.EMBEDDING_DIM for _ in batch_texts])
                all_embeddings_sparse.extend([{"indices": [], "values": []} for _ in batch_texts])

        return all_embeddings_dense, all_embeddings_sparse

    @staticmethod
    def _payload_value(value):
        """Make one CSV cell payload-safe: missing -> '', numpy scalar -> Python scalar."""
        if value is None:
            return ''
        missing = pd.isna(value)
        if np.ndim(missing) == 0 and bool(missing):
            return ''  # NaN is not valid JSON and must not reach the payload
        if isinstance(value, np.generic):
            return value.item()
        return value

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """Read ``csv_path`` and return the chunks of every row that has text.

        The text column is the first of rawText, chunk_text, text, content,
        metin that exists. Returns ``[]`` when the file cannot be read or no
        text column is found.
        """
        print(f"üìÑ CSV okunuyor: {csv_path}")
        try:
            df = pd.read_csv(csv_path)
            print(f"üìä {len(df)} satƒ±r y√ºklendi")
        except Exception as e:
            print(f"‚ùå CSV okuma hatasƒ±: {e}")
            return []

        text_column = next((c for c in ['rawText', 'chunk_text', 'text', 'content', 'metin'] if c in df.columns), None)
        if not text_column:
            print("‚ùå Ana metin s√ºtunu bulunamadƒ±")
            return []

        clean = self._payload_value
        all_chunks = []
        # Progress is counted by row position, so it also works when the index
        # labels are not consecutive integers.
        for position, (idx, row) in enumerate(df.iterrows(), start=1):
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            meta = {
                'original_index': clean(idx),
                'esas_no': clean(row.get('esasNo', '')),
                'karar_no': clean(row.get('kararNo', '')),
                'daire': clean(row.get('location', '')),
                'tarih': clean(row.get('extractedDates', '')),
                'document_id': clean(row.get('_id', '')),
                'model_used': self.config.BGE_MODEL_NAME  # records which model produced the vectors
            }
            all_chunks.extend(self.semantic_chunk_text(str(text), meta))
            if position % 5 == 0:
                print(f"  ‚úÖ ƒ∞≈ülenen satƒ±r: {position}/{len(df)} (Toplam chunk: {len(all_chunks)})")

        print(f"üß© Toplam {len(all_chunks)} chunk olu≈üturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        """Embed ``chunks`` and upsert them in batches of ``config.DB_BATCH`` points.

        A failing batch is reported and skipped; the remaining batches are
        still uploaded.
        """
        if not chunks:
            print("‚ùå Y√ºklenecek chunk yok")
            return

        print(f"üöÄ {len(chunks)} chunk Qdrant'a y√ºkleniyor...")
        texts = [c['text'] for c in chunks]
        embeddings_dense, embeddings_sparse = self.create_embeddings(texts)

        points = []
        for c, d, s in zip(chunks, embeddings_dense, embeddings_sparse):
            vector_dict = {"dense_vec": d}

            # Attach the sparse vector only when there is something in it.
            if self.config.SUPPORTS_SPARSE and s["indices"]:
                vector_dict["sparse_vec"] = SparseVector(
                    indices=s["indices"],
                    values=s["values"]
                )

            points.append(PointStruct(
                id=str(uuid.uuid4()),
                vector=vector_dict,
                payload=c,
            ))

        batch = self.config.DB_BATCH
        failed = 0
        for i in range(0, len(points), batch):
            try:
                self.qdrant_client.upsert(collection_name=self.config.COLLECTION_NAME, points=points[i:i+batch])
                print(f"  ‚úÖ Batch y√ºklendi: {min(i+batch,len(points))}/{len(points)}")
            except Exception as e:
                failed += 1
                print(f"‚ùå Batch y√ºkleme hatasƒ±: {e}")

        if failed:
            print(f"‚ö†Ô∏è {failed} batch y√ºklenemedi")
        print("üéâ Y√ºkleme tamamlandƒ±!")

    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """Dense search; return ``[{"score", "payload"}, ...]`` or ``[]`` on error."""
        try:
            dense_emb, _ = self.model_manager.encode_texts([query])
            query_vector = dense_emb[0]

            qr = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=NamedVector(name="dense_vec", vector=query_vector),
                limit=limit,
                with_payload=True,
                score_threshold=score_threshold
            )

            results = [{"score": p.score, "payload": p.payload} for p in qr]
            print(f"üìä {len(results)} sonu√ß bulundu (Dense only)")
            return results

        except Exception as e:
            print(f"‚ùå Semantic search hatasƒ±: {e}")
            return []

    @staticmethod
    def _fuse_rrf(ranked_lists, limit: int, k: int = 60) -> List[Dict]:
        """Merge ranked hit lists by reciprocal-rank fusion.

        Each hit contributes ``1 / (k + rank)`` per list it appears in; hits
        sharing a point id are merged. Scores are divided by the best possible
        sum, so a point ranked first in every list scores 1.0. Returns at most
        ``limit`` ``{"score", "payload"}`` dicts, best first.
        """
        fused: Dict = {}
        for hits in ranked_lists:
            for rank, hit in enumerate(hits, start=1):
                entry = fused.setdefault(hit.id, {"score": 0.0, "payload": hit.payload})
                entry["score"] += 1.0 / (k + rank)
                if entry["payload"] is None:
                    entry["payload"] = hit.payload
        if not fused:
            return []
        best_possible = len(ranked_lists) / (k + 1)
        results = [
            {"score": entry["score"] / best_possible, "payload": entry["payload"]}
            for entry in fused.values()
        ]
        results.sort(key=lambda r: r["score"], reverse=True)
        return results[:limit]

    def search_hybrid(self, query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """Dense + sparse search fused into one ranking.

        Falls back to :meth:`search_semantic` when the model has no sparse
        support. When the query yields no sparse terms only the dense request
        is sent and its raw scores are returned. Otherwise both rankings are
        fused, because dense cosine scores and sparse dot products are not on
        the same scale; the returned ``score`` is then the fused rank score.
        Returns ``[]`` on error.
        """
        if not self.config.SUPPORTS_SPARSE:
            print("‚ö†Ô∏è Bu model sparse embedding desteklemiyor, dense search yapƒ±lƒ±yor...")
            return self.search_semantic(query, limit, score_threshold)

        try:
            dense_emb, sparse_emb = self.model_manager.encode_texts([query])

            requests = [
                SearchRequest(
                    vector=NamedVector(name="dense_vec", vector=dense_emb[0]),
                    limit=limit,
                    with_payload=True,
                    score_threshold=score_threshold
                )
            ]

            if sparse_emb[0]["indices"]:
                requests.append(
                    SearchRequest(
                        vector=NamedSparseVector(
                            name="sparse_vec",
                            vector=SparseVector(
                                indices=sparse_emb[0]["indices"],
                                values=sparse_emb[0]["values"]
                            )
                        ),
                        limit=limit,
                        # Without this flag sparse hits carry no payload.
                        with_payload=True,
                        score_threshold=score_threshold
                    )
                )

            qr = self.qdrant_client.search_batch(
                collection_name=self.config.COLLECTION_NAME,
                requests=requests,
            )

            if len(qr) == 1:
                results = [{"score": p.score, "payload": p.payload} for p in qr[0]]
            else:
                results = self._fuse_rrf(qr, limit)

            print(f"üìä {len(results)} sonu√ß bulundu (Hybrid)")
            return results

        except Exception as e:
            print(f"‚ùå Hybrid search hatasƒ±: {e}")
            return []

    def get_collection_info(self):
        """Return collection statistics plus model info, or ``{"error": ...}`` on failure."""
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            model_info = self.model_manager.get_model_info()
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                # Read defensively: the field may be missing on the response model.
                "vectors_count": getattr(info, "vectors_count", None),
                "status": getattr(info.status, "value", info.status),
                "model_info": model_info,
                "embedding_dim": self.config.EMBEDDING_DIM,
                "supports_sparse": self.config.SUPPORTS_SPARSE
            }
        except Exception as e:
            return {"error": str(e)}

class YargitayPipeline:
    """Console front end: runs the ingest pipeline and the interactive search."""

    def __init__(self, config: "Config", model_key: str = "bge-m3"):
        """Create the processor for ``model_key``; errors from its constructor propagate."""
        self.processor = YargitaySemanticProcessor(config, model_key)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        """Run model check -> recreate collection -> chunk CSV -> upload.

        Args:
            csv_path: input file; defaults to ``config.CSV_FILE``.

        Returns:
            True when chunks were produced and the upload step ran, False when
            the model check failed or no chunks were found.
        """
        csv_path = csv_path or self.config.CSV_FILE
        print("üöÄ Full pipeline ba≈ülƒ±yor")
        emb_dim = self.processor.test_model_connection()
        if not emb_dim:
            return False
        self.processor.create_qdrant_collection(recreate=True)
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("‚ùå Chunk bulunamadƒ±")
            return False
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        print("\nüìä Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    @staticmethod
    def _parse_limit(raw: str, default: int = 5) -> int:
        """Parse the result-count answer; blank, non-numeric or non-positive -> ``default``."""
        try:
            value = int(raw or default)
        except ValueError:
            return default
        return value if value > 0 else default

    def interactive_search(self):
        """Menu loop for dense, hybrid and side-by-side searches.

        Leaves the loop on menu option 4 or when the query is q/quit/exit.
        """
        print("\nüîé ƒ∞nteraktif arama ba≈ülatƒ±ldƒ±")
        model_info = self.processor.model_manager.get_model_info()
        sparse_ok = model_info['supports_sparse']
        print(f"üì± Aktif model: {model_info['model_name']}")
        print(f"üîß √ñzellikler: Dense‚úÖ, Sparse{'‚úÖ' if sparse_ok else '‚ùå'}")

        while True:
            print(f"\n{'='*50}")
            print("üîç ARAMA SE√áENEKLERƒ∞")
            print(f"{'='*50}")
            print("1) Dense arama (Semantic)")
            if sparse_ok:
                print("2) Hybrid arama (Dense + Sparse)")
                print("3) Kar≈üƒ±la≈ütƒ±rmalƒ± arama (Her iki y√∂ntem)")
            else:
                print("2) ‚ùå Hybrid arama (Bu model desteklemiyor)")
                print("3) ‚ùå Kar≈üƒ±la≈ütƒ±rma (Sparse desteklenmiyor)")
            print("4) Ana men√º")

            ch = input("Se√ßiminiz (1-4): ").strip()
            if ch == "4":
                break
            if ch not in {"1", "2", "3"}:
                print("‚ùå Ge√ßersiz se√ßim")
                continue
            # Refuse unsupported modes before asking for a query.
            if ch in {"2", "3"} and not sparse_ok:
                print("‚ö†Ô∏è Bu √∂zellik se√ßilen model tarafƒ±ndan desteklenmiyor.")
                continue

            q = input("üîç Arama metni (√ßƒ±kmak i√ßin 'q'): ").strip()
            if q.lower() in {'q', 'quit', 'exit'}:
                break
            if not q:
                continue

            limit = self._parse_limit(input("Ka√ß sonu√ß? (default 5): ").strip())

            if ch == "1":
                print("\nüéØ Dense Semantic Search...")
                results = self.processor.search_semantic(q, limit=limit)
                self._display_results(results, "Dense")

            elif ch == "2":
                print("\nüîÄ Hybrid Search...")
                results = self.processor.search_hybrid(q, limit=limit)
                self._display_results(results, "Hybrid")

            else:
                print("\nüìä Kar≈üƒ±la≈ütƒ±rmalƒ± Arama...")
                print("üéØ Dense sonu√ßlar:")
                dense_results = self.processor.search_semantic(q, limit=limit)
                self._display_results(dense_results, "Dense", show_comparison=True)

                print(f"\n{'='*60}")
                print("üîÄ Hybrid sonu√ßlar:")
                hybrid_results = self.processor.search_hybrid(q, limit=limit)
                self._display_results(hybrid_results, "Hybrid", show_comparison=True)

    def _display_results(self, results: List[Dict], search_type: str, show_comparison: bool = False):
        """Print ``results``: score, metadata and the first 200 characters of each text.

        Args:
            results: ``{"score", "payload"}`` dicts as returned by the searches.
            search_type: label used in the header (and per hit when comparing).
            show_comparison: also print ``search_type`` under every hit.
        """
        if not results:
            print("‚ùå Sonu√ß bulunamadƒ±")
            return

        print(f"\nüìã {len(results)} {search_type} sonu√ß:")
        for i, r in enumerate(results, 1):
            p = r.get("payload") or {}
            score = r.get("score", 0.0)

            # Traffic-light marker for the score.
            if score > 0.8:
                score_icon = "üü¢"
            elif score > 0.6:
                score_icon = "üü°"
            else:
                score_icon = "üî¥"

            print(f"\n{i}. {score_icon} Skor: {score:.4f}")
            print(f"   üìÑ Model: {p.get('model_used','N/A')}")
            print(f"   üèõÔ∏è Daire: {p.get('daire','N/A')} | üìÖ Tarih: {p.get('tarih','N/A')}")
            print(f"   üìã Esas: {p.get('esas_no','N/A')} | üî¢ Karar: {p.get('karar_no','N/A')}")

            text = p.get('text', '')
            text_preview = text[:200] + "..." if len(text) > 200 else text
            print(f"   üìù Metin: {text_preview}")

            if show_comparison:
                print(f"   üè∑Ô∏è Tip: {search_type}")
            print("-"*60)

def select_model() -> str:
    """Print the model catalogue and ask until a valid model key is entered.

    Every capability shown (menu and recommendations) is derived from
    ``MODEL_CONFIGS``, so the text cannot contradict the configuration.

    Returns:
        A key of ``MODEL_CONFIGS``; a blank answer selects "bge-m3".
    """
    def sparse_kind(cfg) -> str:
        # BGE models produce their own sparse weights, so "native" is the
        # fallback label for them when the config does not name a method.
        fallback = "native" if cfg["model_type"] == "bge" else "none"
        return cfg.get("sparse_type", fallback).upper()

    def capability(cfg) -> str:
        return f"{sparse_kind(cfg)} sparse" if cfg["supports_sparse"] else "Dense only"

    print("\n" + "="*60)
    print("ü§ñ MODEL SE√áƒ∞Mƒ∞")
    print("="*60)

    # Menu grouping of the model keys.
    categories = {
        "üåç √áok Dilli Modeller": ["bge-m3", "multilingual-e5"],
        "üáπüá∑ T√ºrk√ße √ñzel": ["turkish-bert", "distilbert-turkish"],
        "‚ö° Hƒ±zlƒ± & Genel": ["bge-large", "all-mpnet"]
    }

    for category, model_keys in categories.items():
        print(f"\n{category}:")
        for model_key in model_keys:
            cfg = MODEL_CONFIGS[model_key]
            sparse_icon = f"‚úÖ({sparse_kind(cfg)})" if cfg['supports_sparse'] else "‚ùå"
            print(f"  {model_key}: {cfg['description']}")
            print(f"    ‚îî‚îÄ Boyut: {cfg['embedding_dim']}, Sparse: {sparse_icon}, Max Token: {cfg['max_seq_length']}")

    recommendations = [
        ("T√ºrk√ße aƒüƒ±rlƒ±klƒ±", "turkish-bert"),
        ("En iyi performans", "bge-m3"),
        ("Hƒ±zlƒ± T√ºrk√ße", "distilbert-turkish"),
        ("√áok dilli", "multilingual-e5"),
    ]
    print(f"\nüí° √ñneri:")
    for label, model_key in recommendations:
        print(f"  ‚Ä¢ {label}: {model_key} ({capability(MODEL_CONFIGS[model_key])})")

    tfidf_models = [
        key for key, cfg in MODEL_CONFIGS.items()
        if cfg["supports_sparse"] and sparse_kind(cfg) == "TFIDF"
    ]
    print(f"\nüîç Sparse Embedding T√ºrleri:")
    print("  ‚Ä¢ NATIVE: Model'in kendi sparse sistemi (sadece BGE-M3)")
    print(f"  ‚Ä¢ TFIDF: TF-IDF tabanlƒ± sparse embedding ({', '.join(tfidf_models) or '-'})")

    print("\nMevcut modeller:", ", ".join(MODEL_CONFIGS.keys()))

    while True:
        choice = input("\nModel se√ßin (default: bge-m3): ").strip().lower() or "bge-m3"
        if choice in MODEL_CONFIGS:
            print(f"‚úÖ Se√ßilen model: {MODEL_CONFIGS[choice]['model_name']}")
            return choice
        print("‚ùå Ge√ßersiz model! Tekrar deneyin.")

def _build_config(model_key: str):
    """Return a fresh ``Config`` for ``model_key`` with a per-model collection name."""
    selected = MODEL_CONFIGS[model_key]
    return Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME=f"{model_key}_chunks",
        # NOTE(review): caps the stored vector at 512 dims, so 768/1024-dim
        # embeddings are truncated; confirm this is intended.
        EMBEDDING_DIM=min(512, selected["embedding_dim"]),
        BATCH_SIZE=100
    )


def main():
    """Console entry point: pick a model, then loop over the main menu.

    Switching models builds a separate config and pipeline and swaps them in
    only when construction succeeds. After a failed switch the previous
    model, config and collection stay active and consistent.
    """
    print("üèõÔ∏è YARGITAY Dƒ∞NAMƒ∞K MODEL Sƒ∞STEMƒ∞")

    selected_model = select_model()
    selected_config = MODEL_CONFIGS[selected_model]
    config = _build_config(selected_model)

    try:
        pipeline = YargitayPipeline(config, selected_model)
    except Exception as e:
        print(f"‚ùå Pipeline olu≈üturma hatasƒ±: {e}")
        return

    while True:
        print("\n" + "="*60)
        print(f"üèõÔ∏è YARGITAY SEMANTƒ∞K Sƒ∞STEM - Model: {selected_config['model_name']}")
        print("="*60)
        print("1) Tam pipeline √ßalƒ±≈ütƒ±r (CSV -> chunks -> embed -> qdrant)")
        print("2) ƒ∞nteraktif arama")
        print("3) Koleksiyon bilgilerini g√∂ster")
        print("4) Model deƒüi≈ütir")
        print("5) √áƒ±kƒ±≈ü")
        choice = input("Se√ßiminiz (1-5): ").strip()

        if choice == "1":
            csv_path = input(f"CSV yolu (enter ile default: {config.CSV_FILE}): ").strip() or config.CSV_FILE
            ok = pipeline.full_pipeline(csv_path)
            print("‚úÖ Tamamlandƒ±" if ok else "‚ùå Hata √ßƒ±ktƒ±")
        elif choice == "2":
            pipeline.interactive_search()
        elif choice == "3":
            info = pipeline.processor.get_collection_info()
            print(json.dumps(info, indent=2, ensure_ascii=False))
        elif choice == "4":
            new_model = select_model()
            # A new Config object: the active pipeline keeps its own settings
            # untouched while the replacement is being built.
            new_config = _build_config(new_model)
            try:
                new_pipeline = YargitayPipeline(new_config, new_model)
            except Exception as e:
                print(f"‚ùå Model deƒüi≈ütirme hatasƒ±: {e}")
            else:
                pipeline, config = new_pipeline, new_config
                selected_model = new_model
                selected_config = MODEL_CONFIGS[new_model]
                print("‚úÖ Model deƒüi≈ütirildi!")
        elif choice == "5":
            print("üëã G√∂r√º≈ü√ºr√ºz")
            break
        else:
            print("‚ùå Ge√ßersiz se√ßim")


if __name__ == "__main__":
    main()

True
üèõÔ∏è YARGITAY Dƒ∞NAMƒ∞K MODEL Sƒ∞STEMƒ∞

ü§ñ MODEL SE√áƒ∞Mƒ∞

üåç √áok Dilli Modeller:
  bge-m3: BGE-M3 - √áok dilli, dense+sparse embedding destekli
    ‚îî‚îÄ Boyut: 1024, Sparse: ‚úÖ(NONE), Max Token: 8192
  multilingual-e5: E5 Multilingual Large - √áok dilli dense embedding
    ‚îî‚îÄ Boyut: 1024, Sparse: ‚ùå, Max Token: 512

üáπüá∑ T√ºrk√ße √ñzel:
  turkish-bert: Turkish BERT - T√ºrk√ße √∂zelle≈ütirilmi≈ü
    ‚îî‚îÄ Boyut: 768, Sparse: ‚ùå, Max Token: 512
  distilbert-turkish: Hƒ±zlƒ± T√ºrk√ße DistilBERT
    ‚îî‚îÄ Boyut: 768, Sparse: ‚úÖ(TFIDF), Max Token: 512

‚ö° Hƒ±zlƒ± & Genel:
  bge-large: BGE Large - Sadece dense embedding
    ‚îî‚îÄ Boyut: 1024, Sparse: ‚ùå, Max Token: 512
  all-mpnet: All-MiniLM - Genel ama√ßlƒ±, hƒ±zlƒ±
    ‚îî‚îÄ Boyut: 768, Sparse: ‚ùå, Max Token: 384

üí° √ñneri:
  ‚Ä¢ T√ºrk√ße aƒüƒ±rlƒ±klƒ±: turkish-bert (TF-IDF sparse)
  ‚Ä¢ En iyi performans: bge-m3 (Native sparse)
  ‚Ä¢ Hƒ±zlƒ± T√ºrk√ße: distilbert-turkish (TF-IDF sparse)
  ‚Ä¢ √áok d

No sentence-transformers model found with name dbmdz/distilbert-base-turkish-cased. Creating a new one with mean pooling.


‚úÖ Model y√ºklendi: dbmdz/distilbert-base-turkish-cased
‚úÖ Hazƒ±r - Model: dbmdz/distilbert-base-turkish-cased | Cihaz: NVIDIA RTX A6000

üèõÔ∏è YARGITAY SEMANTƒ∞K Sƒ∞STEM - Model: dbmdz/distilbert-base-turkish-cased
1) Tam pipeline √ßalƒ±≈ütƒ±r (CSV -> chunks -> embed -> qdrant)
2) ƒ∞nteraktif arama
3) Koleksiyon bilgilerini g√∂ster
4) Model deƒüi≈ütir
5) √áƒ±kƒ±≈ü
üöÄ Full pipeline ba≈ülƒ±yor
‚úÖ Dense embedding boyutu: 512
üîç Sparse embedding: 7 terim (TFIDF)
‚ùå Model baƒülantƒ± hatasƒ±: 'ModelManager' object has no attribute 'fitted_tfidf'
‚ùå Hata √ßƒ±ktƒ±

üèõÔ∏è YARGITAY SEMANTƒ∞K Sƒ∞STEM - Model: dbmdz/distilbert-base-turkish-cased
1) Tam pipeline √ßalƒ±≈ütƒ±r (CSV -> chunks -> embed -> qdrant)
2) ƒ∞nteraktif arama
3) Koleksiyon bilgilerini g√∂ster
4) Model deƒüi≈ütir
5) √áƒ±kƒ±≈ü
