### cohere-multilingual-v3

In [2]:
# SemChunk + Cohere Multilingual + Qdrant integration
# Semantic chunking pipeline for Yargıtay (Turkish Court of Cassation) decisions

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
import cohere
import numpy as np
import uuid
from typing import List, Dict, Any
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv

# Load environment variables from the project's .env file; the return value of
# load_dotenv is printed as a quick visual check that the call ran.
print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# Cohere API key read from the environment (None when the variable is unset).
cohere_api_key=os.getenv("COHERE_API_KEY")
# Configuration
@dataclass
class Config:
    """Settings for the Cohere + SemChunk + Qdrant pipeline."""
    # Cohere settings
    COHERE_API_KEY: str = cohere_api_key  # Cohere API key (module-level value read from the environment)
    COHERE_MODEL: str = "embed-multilingual-v3.0"  # Cohere multilingual model
    
    # SemChunk settings
    TOKEN_SIZE: int = 384  # Chunk size (tokens)
    ENCODING_NAME: str = "cl100k_base"  # Tiktoken encoding
    
    # Qdrant settings
    QDRANT_URL: str = "http://localhost:6333"  # Local Qdrant
    COLLECTION_NAME: str = "yargitay_semantic_chunks"
    DIMENSION: int = 1024  # Vector size used when the collection is created
    
    # File settings
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/10data.csv"
    BATCH_SIZE: int = 10  # Number of points sent per Qdrant upsert call

class YargitaySemanticProcessor:
    """Yargƒ±tay kararlarƒ± i√ßin semantic chunking ve vector search"""
    
    def __init__(self, config: Config):
        """Store ``config`` and build the chunker plus the Cohere and Qdrant clients."""
        self.config = config

        # Chunker built from the tiktoken encoding and the configured token size.
        encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.encoding = encoding
        self.chunker = semchunk.chunkerify(encoding, config.TOKEN_SIZE)

        # Service clients.
        self.cohere_client = cohere.Client(config.COHERE_API_KEY)
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        # Report readiness only after every component was constructed.
        for status_line in (
            f"‚úÖ SemChunk chunker hazƒ±r (Token boyutu: {config.TOKEN_SIZE})",
            f"‚úÖ Cohere client hazƒ±r ({config.COHERE_MODEL})",
            f"‚úÖ Qdrant client hazƒ±r ({config.QDRANT_URL})",
        ):
            print(status_line)
    
    def test_cohere_connection(self):
        """Cohere baƒülantƒ±sƒ±nƒ± test et"""
        try:
            test_response = self.cohere_client.embed(
                texts=["Bu bir test metnidir"],
                model=self.config.COHERE_MODEL,
                input_type="search_document"
            )
            embedding_dim = len(test_response.embeddings[0])
            print(f"‚úÖ Cohere test ba≈üarƒ±lƒ± - Embedding boyutu: {embedding_dim}")
            return embedding_dim
        except Exception as e:
            print(f"‚ùå Cohere baƒülantƒ± hatasƒ±: {e}")
            return None
    
    def create_qdrant_collection(self, recreate: bool = False):
        """Qdrant koleksiyonu olu≈ütur"""
        collection_name = self.config.COLLECTION_NAME
        
        # Koleksiyon varsa ve recreate True ise sil
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"üóëÔ∏è Eski koleksiyon silindi: {collection_name}")
            except:
                pass
        
        # Koleksiyon yoksa olu≈ütur
        try:
            collections = self.qdrant_client.get_collections().collections
            collection_names = [c.name for c in collections]
            
            if collection_name not in collection_names:
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=VectorParams(
                        size=self.config.DIMENSION,
                        distance=Distance.COSINE
                    )
                )
                print(f"‚úÖ Koleksiyon olu≈üturuldu: {collection_name}")
            else:
                print(f"‚ÑπÔ∏è Koleksiyon zaten var: {collection_name}")
                
        except Exception as e:
            print(f"‚ùå Koleksiyon olu≈üturma hatasƒ±: {e}")
            raise
    
    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Metni semantic olarak chunk'lara b√∂l"""
        if not text or text.strip() == "":
            return []
        
        try:
            # SemChunk ile metni b√∂l
            chunks = self.chunker(text)
            
            result_chunks = []
            for i, chunk_text in enumerate(chunks):
                if chunk_text.strip():  # Bo≈ü chunk'larƒ± atla
                    chunk_data = {
                        'chunk_id': i,
                        'text': chunk_text.strip(),
                        'token_count': len(self.encoding.encode(chunk_text)),
                        'char_count': len(chunk_text),
                    }
                    
                    # Metadata ekle
                    if metadata:
                        chunk_data.update(metadata)
                    
                    result_chunks.append(chunk_data)
            
            return result_chunks
            
        except Exception as e:
            print(f"‚ùå Chunking hatasƒ±: {e}")
            return []
    
    def create_embeddings(self, texts: List[str], batch_size: int = 10) -> List[List[float]]:
        """Metinleri Cohere ile embedding'e √ßevir"""
        all_embeddings = []
        
        # Cohere API limitleri i√ßin batch processing
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            
            try:
                response = self.cohere_client.embed(
                    texts=batch_texts,
                    model=self.config.COHERE_MODEL,
                    input_type="search_document"  # Dokuman indexleme i√ßin
                )
                
                batch_embeddings = response.embeddings
                all_embeddings.extend(batch_embeddings)
                
                print(f"  üìä Embedding olu≈üturuldu: {i+len(batch_texts)}/{len(texts)}")
                
            except Exception as e:
                print(f"‚ùå Embedding hatasƒ± (batch {i//batch_size + 1}): {e}")
                # Hata durumunda bo≈ü embedding ekle
                all_embeddings.extend([[0.0] * self.config.EMBEDDING_DIM] * len(batch_texts))
        
        return all_embeddings
    
    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """CSV dosyasƒ±nƒ± i≈üle ve chunk'larƒ± olu≈ütur"""
        print(f"üìÑ CSV dosyasƒ± okunuyor: {csv_path}")
        
        try:
            df = pd.read_csv(csv_path)
            print(f"üìä {len(df)} satƒ±r veri y√ºklendi")
        except Exception as e:
            print(f"‚ùå CSV okuma hatasƒ±: {e}")
            return []
        
        # Gerekli s√ºtunlarƒ± kontrol et
        required_columns = ['rawText']  # Ana metin s√ºtunu
        optional_columns = ['esasNo', 'kararNo', 'location', 'extractedDates']
        
        if 'rawText' not in df.columns:
            print(f"‚ùå 'rawText' s√ºtunu bulunamadƒ±. Mevcut s√ºtunlar: {df.columns.tolist()}")
            return []
        
        all_chunks = []
        
        print("üîÑ Semantic chunking ba≈ülƒ±yor...")
        for idx, row in df.iterrows():
            # Ana metni al
            text = row.get('rawText', '') or row.get('text', '')
            
            if not text or pd.isna(text):
                continue
            
            # Metadata hazƒ±rla
            metadata = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', '')
                #'karar_turu': row.get('karar_turu', ''),
            }
            
            # Semantic chunking yap
            chunks = self.semantic_chunk_text(str(text), metadata)
            all_chunks.extend(chunks)
            
            # Progress g√∂ster
            if (idx + 1) % 100 == 0:
                print(f"  ‚úÖ ƒ∞≈ülenen satƒ±r: {idx + 1}/{len(df)} (Toplam chunk: {len(all_chunks)})")
        
        print(f"üß© Toplam {len(all_chunks)} chunk olu≈üturuldu")
        return all_chunks
    
    def upload_to_qdrant(self, chunks: List[Dict]):
        """Chunk'larƒ± Qdrant'a y√ºkle"""
        if not chunks:
            print("‚ùå Y√ºklenecek chunk yok")
            return
        
        print(f"üöÄ {len(chunks)} chunk Qdrant'a y√ºkleniyor...")
        
        # Metinleri topla
        texts = [chunk['text'] for chunk in chunks]
        
        # Embedding'leri olu≈ütur
        print("üîÆ Embedding'ler olu≈üturuluyor...")
        embeddings = self.create_embeddings(texts)
        
        if len(embeddings) != len(chunks):
            print(f"‚ùå Embedding sayƒ±sƒ± uyumsuz: {len(embeddings)} vs {len(chunks)}")
            return
        
        # Qdrant point'leri hazƒ±rla
        points = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            point = PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload=chunk
            )
            points.append(point)
        
        # Batch halinde y√ºkle
        batch_size = self.config.BATCH_SIZE
        print(f"üì¶ {batch_size} batch size ile y√ºkleniyor...")
        
        for i in range(0, len(points), batch_size):
            batch = points[i:i + batch_size]
            
            try:
                self.qdrant_client.upsert(
                    collection_name=self.config.COLLECTION_NAME,
                    points=batch
                )
                print(f"  ‚úÖ Batch y√ºklendi: {min(i + batch_size, len(points))}/{len(points)}")
                
            except Exception as e:
                print(f"‚ùå Batch y√ºkleme hatasƒ±: {e}")
        
        print("üéâ Y√ºkleme tamamlandƒ±!")
    
    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = 0.7) -> List[Dict]:
        """Semantic arama yap"""
        print(f"üîç Arama: '{query}'")
        
        try:
            # Query i√ßin embedding olu≈ütur
            query_response = self.cohere_client.embed(
                texts=[query],
                model=self.config.COHERE_MODEL,
                input_type="search_query"  # Arama query'si i√ßin
            )
            query_embedding = query_response.embeddings[0]
            
            # Qdrant'ta ara
            search_results = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_embedding,
                limit=limit,
                score_threshold=score_threshold
            )
            
            # Sonu√ßlarƒ± formatla
            results = []
            for point in search_results:
                results.append({
                    'score': point.score,
                    'payload': point.payload
                })
            
            print(f"üìä {len(results)} sonu√ß bulundu")
            return results
            
        except Exception as e:
            print(f"‚ùå Arama hatasƒ±: {e}")
            return []
    
    def get_collection_info(self) -> dict:
        """Koleksiyon bilgilerini al"""
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                "vectors_count": info.vectors_count,
                "status": info.status
            }
        except Exception as e:
            return {"error": str(e)}

# Ana Pipeline Sƒ±nƒ±fƒ±
class YargitayPipeline:
    """Ana pipeline sƒ±nƒ±fƒ±"""
    
    def __init__(self, config: Config):
        """Build the processor for ``config`` and keep the config for later use."""
        processor = YargitaySemanticProcessor(config)
        self.processor = processor
        self.config = config
    
    def full_pipeline(self, csv_path: str = None):
        """Tam pipeline'ƒ± √ßalƒ±≈ütƒ±r"""
        csv_path = csv_path or self.config.CSV_FILE
        
        print("üöÄ Yargƒ±tay Semantic Pipeline Ba≈ülƒ±yor")
        print("=" * 50)
        
        # 1. Baƒülantƒ±larƒ± test et
        embedding_dim = self.processor.test_cohere_connection()
        if not embedding_dim:
            return False
        
        # 2. Koleksiyon olu≈ütur
        self.processor.create_qdrant_collection(recreate=True)
        
        # 3. CSV'yi i≈üle
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("‚ùå ƒ∞≈ülenecek chunk bulunamadƒ±")
            return False
        
        # 4. Qdrant'a y√ºkle
        self.processor.upload_to_qdrant(chunks)
        
        # 5. Bilgileri g√∂ster
        info = self.processor.get_collection_info()
        print("\nüìä Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        
        return True
    
    def interactive_search(self):
        """ƒ∞nteraktif arama aray√ºz√º"""
        print("\n" + "=" * 50)
        print("üèõÔ∏è YARGITAY SEMANTƒ∞K ARAMA Sƒ∞STEMƒ∞")
        print("=" * 50)
        
        while True:
            query = input("\nüîç Arama metni (√ßƒ±kmak i√ßin 'q'): ")
            if query.lower() in ['q', 'quit', 'exit']:
                print("üëã G√∂r√º≈ü√ºr√ºz!")
                break
            
            if not query.strip():
                continue
            
            try:
                limit = int(input("üìä Ka√ß sonu√ß? (varsayƒ±lan 5): ") or "5")
            except:
                limit = 5
            
            # Arama yap
            results = self.processor.search_semantic(query, limit=limit)
            
            if not results:
                print("‚ùå Sonu√ß bulunamadƒ±")
                continue
            
            print(f"\nüìã {len(results)} sonu√ß bulundu:")
            print("-" * 60)
            
            for i, result in enumerate(results, 1):
                payload = result['payload']
                print(f"\n{i}. üìÑ Benzerlik Skoru: {result['score']:.3f}")
                print(f"   ‚öñÔ∏è Esas No: {payload.get('esas_no', 'N/A')}")
                print(f"   üìã Karar No: {payload.get('karar_no', 'N/A')}")
                print(f"   üèõÔ∏è Daire: {payload.get('daire', 'N/A')}")
                print(f"   üìÖ Tarih: {payload.get('tarih', 'N/A')}")
                print(f"   üî§ Token: {payload.get('token_count', 'N/A')}")
                print(f"   üìù Metin √ñnizleme:")
                
                text = payload.get('text', '')
                preview = text[:300] + "..." if len(text) > 300 else text
                print(f"      {preview}")
                print("-" * 60)

# Kullanƒ±m √∂rneƒüi ve main fonksiyon
def main():
    """Build the pipeline and run the interactive console menu.

    Menu options: 1 runs the full pipeline, 2 starts interactive search,
    3 prints the collection summary, 4 exits. Returns early with a message
    when no Cohere API key is available.
    """

    # Without this check a missing key would be passed on as the literal
    # string "None" and only fail later as an opaque API error.
    if not cohere_api_key:
        print("COHERE_API_KEY tanımlı değil; .env dosyasını kontrol edin.")
        return

    # Configuration (put your own values here)
    config = Config(
        COHERE_API_KEY=cohere_api_key,  # Cohere API key
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",  # CSV file path
        TOKEN_SIZE=384,  # Chunk size
        QDRANT_URL="http://localhost:6333",  # Local Qdrant URL
        COLLECTION_NAME="cohere_semantic_chunks",
        DIMENSION=1024
    )

    pipeline = YargitayPipeline(config)

    # Menu loop
    while True:
        print("\n" + "=" * 50)
        print("üèõÔ∏è YARGITAY SEMANTƒ∞K CHUNK Sƒ∞STEMƒ∞")
        print("=" * 50)
        print("1. Tam pipeline √ßalƒ±≈ütƒ±r (CSV ‚Üí Semantic Chunks ‚Üí Qdrant)")
        print("2. ƒ∞nteraktif arama yap")
        print("3. Koleksiyon bilgilerini g√∂ster")
        print("4. √áƒ±kƒ±≈ü")

        choice = input("\nSe√ßiminiz (1-4): ")

        if choice == "1":
            csv_path = input(f"CSV dosya yolu (Enter: {config.CSV_FILE}): ").strip()
            if not csv_path:
                csv_path = config.CSV_FILE

            success = pipeline.full_pipeline(csv_path)
            if success:
                print("‚úÖ Pipeline ba≈üarƒ±yla tamamlandƒ±!")
            else:
                print("‚ùå Pipeline hatasƒ±!")

        elif choice == "2":
            pipeline.interactive_search()

        elif choice == "3":
            info = pipeline.processor.get_collection_info()
            print("\nüìä Koleksiyon Bilgileri:")
            print(json.dumps(info, indent=2, ensure_ascii=False))

        elif choice == "4":
            print("üëã G√∂r√º≈ü√ºr√ºz!")
            break

        else:
            print("‚ùå Ge√ßersiz se√ßim!")

# Start the interactive menu only when this module is the entry point.
if __name__ == "__main__":
    main()

True
‚úÖ SemChunk chunker hazƒ±r (Token boyutu: 384)
‚úÖ Cohere client hazƒ±r (embed-multilingual-v3.0)
‚úÖ Qdrant client hazƒ±r (http://localhost:6333)

üèõÔ∏è YARGITAY SEMANTƒ∞K CHUNK Sƒ∞STEMƒ∞
1. Tam pipeline √ßalƒ±≈ütƒ±r (CSV ‚Üí Semantic Chunks ‚Üí Qdrant)
2. ƒ∞nteraktif arama yap
3. Koleksiyon bilgilerini g√∂ster
4. √áƒ±kƒ±≈ü
üöÄ Yargƒ±tay Semantic Pipeline Ba≈ülƒ±yor
‚ùå Cohere baƒülantƒ± hatasƒ±: headers: {'access-control-expose-headers': 'X-Debug-Trace-ID', 'cache-control': 'no-cache, no-store, no-transform, must-revalidate, private, max-age=0', 'content-type': 'application/json', 'expires': 'Thu, 01 Jan 1970 00:00:00 GMT', 'pragma': 'no-cache', 'vary': 'Origin', 'x-accel-expires': '0', 'x-debug-trace-id': '9e4699f7af22b34b5a38e05ab96b6167', 'x-trial-endpoint-call-limit': '100', 'x-trial-endpoint-call-remaining': '99', 'date': 'Tue, 09 Sep 2025 11:03:45 GMT', 'content-length': '373', 'x-envoy-upstream-service-time': '12', 'server': 'envoy', 'via': '1.1 google', 'alt-svc': 'h

### bge-m3

In [None]:
# SemChunk + BGE-M3 + Qdrant integration
# Semantic chunking pipeline for Yargıtay (Turkish Court of Cassation) decisions

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict, Any
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch

# Load environment variables from the project's .env file; the return value of
# load_dotenv is printed as a quick visual check that the call ran.
print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# Configuration
@dataclass
class Config:
    """Settings for the BGE-M3 + SemChunk + Qdrant pipeline."""
    # BGE-M3 settings
    BGE_MODEL_NAME: str = "BAAI/bge-m3"  # BGE-M3 model
    USE_FP16: bool = True  # Memory optimisation
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"  # evaluated once, when the class is defined
    
    # SemChunk settings
    TOKEN_SIZE: int = 512  # Chunk size (tokens)
    ENCODING_NAME: str = "cl100k_base"  # Tiktoken encoding
    
    # Qdrant settings
    QDRANT_URL: str = "http://localhost:6333"  # Local Qdrant
    COLLECTION_NAME: str = "yargitay_bge_m3_chunks"
    EMBEDDING_DIM: int = 1024  # BGE-M3 dense embedding size
    
    # File settings
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    BATCH_SIZE: int = 32  # Batch size used for both embedding and Qdrant upserts

class YargitaySemanticProcessor:
    """Yargƒ±tay kararlarƒ± i√ßin semantic chunking ve vector search"""
    
    def __init__(self, config: Config):
        """Store ``config`` and build the chunker, the BGE-M3 model and the Qdrant client."""
        self.config = config

        # Report which kind of device torch can see.
        if not torch.cuda.is_available():
            print("üíª CPU kullanƒ±lƒ±yor")
        else:
            print(f"üöÄ GPU kullanƒ±lƒ±yor: {torch.cuda.get_device_name()}")

        # Chunker built from the tiktoken encoding and the configured token size.
        encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.encoding = encoding
        self.chunker = semchunk.chunkerify(encoding, config.TOKEN_SIZE)

        # Embedding model.
        print(f"üîÆ BGE-M3 modeli y√ºkleniyor... ({config.BGE_MODEL_NAME})")
        model_options = {"use_fp16": config.USE_FP16, "device": config.DEVICE}
        self.bge_model = BGEM3FlagModel(config.BGE_MODEL_NAME, **model_options)

        # Vector store client.
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        for status_line in (
            f"‚úÖ SemChunk chunker hazƒ±r (Token boyutu: {config.TOKEN_SIZE})",
            f"‚úÖ BGE-M3 model hazƒ±r ({config.BGE_MODEL_NAME})",
            f"‚úÖ Qdrant client hazƒ±r ({config.QDRANT_URL})",
        ):
            print(status_line)
    
    def test_bge_connection(self):
        """BGE-M3 modelini test et"""
        try:
            test_text = ["Yargƒ±tay 6. Hukuk Dairesi'nin ihtiyati tedbir kararƒ±"]
            embeddings = self.bge_model.encode(test_text)
            
            # BGE-M3'den dense embedding al
            dense_embedding = embeddings['dense_vecs'][0]
            embedding_dim = len(dense_embedding)
            
            print(f"‚úÖ BGE-M3 test ba≈üarƒ±lƒ± - Dense embedding boyutu: {embedding_dim}")
            print(f"üîç Sparse embedding mevcut: {'colbert_vecs' in embeddings}")
            return embedding_dim
        except Exception as e:
            print(f"‚ùå BGE-M3 baƒülantƒ± hatasƒ±: {e}")
            return None
    
    def create_qdrant_collection(self, recreate: bool = False):
        """Qdrant koleksiyonu olu≈ütur"""
        collection_name = self.config.COLLECTION_NAME
        
        # Koleksiyon varsa ve recreate True ise sil
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"üóëÔ∏è Eski koleksiyon silindi: {collection_name}")
            except:
                pass
        
        # Koleksiyon yoksa olu≈ütur
        try:
            collections = self.qdrant_client.get_collections().collections
            collection_names = [c.name for c in collections]
            
            if collection_name not in collection_names:
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=VectorParams(
                        size=self.config.EMBEDDING_DIM,
                        distance=Distance.COSINE
                    )
                )
                print(f"‚úÖ Koleksiyon olu≈üturuldu: {collection_name} (Boyut: {self.config.EMBEDDING_DIM})")
            else:
                print(f"‚ÑπÔ∏è Koleksiyon zaten var: {collection_name}")
                
        except Exception as e:
            print(f"‚ùå Koleksiyon olu≈üturma hatasƒ±: {e}")
            raise
    
    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Metni semantic olarak chunk'lara b√∂l"""
        if not text or text.strip() == "":
            return []
        
        try:
            # SemChunk ile metni b√∂l
            chunks = self.chunker(text)
            
            result_chunks = []
            for i, chunk_text in enumerate(chunks):
                if chunk_text.strip():  # Bo≈ü chunk'larƒ± atla
                    chunk_data = {
                        'chunk_id': i,
                        'text': chunk_text.strip(),
                        'token_count': len(self.encoding.encode(chunk_text)),
                        'char_count': len(chunk_text),
                    }
                    
                    # Metadata ekle
                    if metadata:
                        chunk_data.update(metadata)
                    
                    result_chunks.append(chunk_data)
            
            return result_chunks
            
        except Exception as e:
            print(f"‚ùå Chunking hatasƒ±: {e}")
            return []
    
    def create_embeddings_bge(self, texts: List[str], batch_size: int = None) -> List[List[float]]:
        """Metinleri BGE-M3 ile embedding'e √ßevir"""
        if batch_size is None:
            batch_size = self.config.BATCH_SIZE
            
        all_embeddings = []
        
        print(f"üîÆ BGE-M3 ile {len(texts)} metin i≈üleniyor...")
        
        # BGE-M3 i√ßin batch processing
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            
            try:
                # BGE-M3 ile embedding olu≈ütur
                embeddings_result = self.bge_model.encode(batch_texts)
                
                # Dense embedding'leri al (1024 boyut)
                dense_embeddings = embeddings_result['dense_vecs']
                
                # List formatƒ±na √ßevir
                for embedding in dense_embeddings:
                    all_embeddings.append(embedding.tolist())
                
                print(f"  üìä BGE-M3 Embedding: {i+len(batch_texts)}/{len(texts)}")
                
                # GPU memory temizliƒüi (gerekirse)
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                
            except Exception as e:
                print(f"‚ùå BGE-M3 Embedding hatasƒ± (batch {i//batch_size + 1}): {e}")
                # Hata durumunda sƒ±fƒ±r embedding ekle
                for _ in batch_texts:
                    all_embeddings.append([0.0] * self.config.EMBEDDING_DIM)
        
        return all_embeddings
    
    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """CSV dosyasƒ±nƒ± i≈üle ve chunk'larƒ± olu≈ütur"""
        print(f"üìÑ CSV dosyasƒ± okunuyor: {csv_path}")
        
        try:
            df = pd.read_csv(csv_path)
            print(f"üìä {len(df)} satƒ±r veri y√ºklendi")
            print(f"üìã Mevcut s√ºtunlar: {df.columns.tolist()}")
        except Exception as e:
            print(f"‚ùå CSV okuma hatasƒ±: {e}")
            return []
        
        # Ana metin s√ºtununu belirle (√∂ncelik sƒ±rasƒ±na g√∂re)
        text_columns = ['rawText', 'chunk_text', 'text', 'content', 'metin']
        text_column = None
        
        for col in text_columns:
            if col in df.columns:
                text_column = col
                print(f"‚úÖ Ana metin s√ºtunu bulundu: '{col}'")
                break
        
        if not text_column:
            print(f"‚ùå Ana metin s√ºtunu bulunamadƒ±. Kontrol edilen s√ºtunlar: {text_columns}")
            return []
        
        all_chunks = []
        
        print("üîÑ Semantic chunking ba≈ülƒ±yor...")
        for idx, row in df.iterrows():
            # Ana metni al
            text = row.get(text_column, '')
            
            if not text or pd.isna(text):
                print(f"‚ö†Ô∏è Satƒ±r {idx}: Bo≈ü metin atlandƒ±")
                continue
            
            # Metadata hazƒ±rla (CSV yapƒ±nƒ±za g√∂re g√ºncellenmi≈ü)
            metadata = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', ''),
                'document_id': row.get('_id', ''),
            }
            
            # Semantic chunking yap
            chunks = self.semantic_chunk_text(str(text), metadata)
            all_chunks.extend(chunks)
            
            # Progress g√∂ster
            if (idx + 1) % 5 == 0:  # Daha sƒ±k progress g√∂ster (az veri olduƒüu i√ßin)
                print(f"  ‚úÖ ƒ∞≈ülenen satƒ±r: {idx + 1}/{len(df)} (Toplam chunk: {len(all_chunks)})")
        
        print(f"üß© Toplam {len(all_chunks)} chunk olu≈üturuldu")
        return all_chunks
    
    def upload_to_qdrant(self, chunks: List[Dict]):
        """Chunk'larƒ± Qdrant'a y√ºkle"""
        if not chunks:
            print("‚ùå Y√ºklenecek chunk yok")
            return
        
        print(f"üöÄ {len(chunks)} chunk Qdrant'a y√ºkleniyor...")
        
        # Metinleri topla
        texts = [chunk['text'] for chunk in chunks]
        
        # BGE-M3 ile embedding'leri olu≈ütur
        print("üîÆ BGE-M3 embedding'ler olu≈üturuluyor...")
        embeddings = self.create_embeddings_bge(texts)
        
        if len(embeddings) != len(chunks):
            print(f"‚ùå Embedding sayƒ±sƒ± uyumsuz: {len(embeddings)} vs {len(chunks)}")
            return
        
        # Qdrant point'leri hazƒ±rla
        points = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            point = PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload=chunk
            )
            points.append(point)
        
        # Batch halinde y√ºkle
        batch_size = self.config.BATCH_SIZE
        print(f"üì¶ {batch_size} batch size ile y√ºkleniyor...")
        
        for i in range(0, len(points), batch_size):
            batch = points[i:i + batch_size]
            
            try:
                self.qdrant_client.upsert(
                    collection_name=self.config.COLLECTION_NAME,
                    points=batch
                )
                print(f"  ‚úÖ Batch y√ºklendi: {min(i + batch_size, len(points))}/{len(points)}")
                
            except Exception as e:
                print(f"‚ùå Batch y√ºkleme hatasƒ±: {e}")
        
        print("üéâ Y√ºkleme tamamlandƒ±!")
    
    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = 0.7) -> List[Dict]:
        """BGE-M3 ile semantic arama yap"""
        print(f"üîç Arama: '{query}'")
        
        try:
            # Query'yi BGE-M3 ile vekt√∂rize et
            query_embeddings = self.bge_model.encode([query])
            query_vector = query_embeddings['dense_vecs'][0].tolist()
            
            # Qdrant'ta ara (g√ºncel query_points metodu)
            search_results = self.qdrant_client.query_points(
                collection_name=self.config.COLLECTION_NAME,
                query=query_vector,
                limit=limit,
                score_threshold=score_threshold
            )
            
            # Sonu√ßlarƒ± formatla
            results = []
            for point in search_results.points:#burda muhtemel hata verir search_results olcak verirse
                results.append({
                    'score': point.score,
                    'payload': point.payload
                })
            
            print(f"üìä {len(results)} sonu√ß bulundu")
            return results
            
        except Exception as e:
            print(f"‚ùå Arama hatasƒ±: {e}")
            return []
    
    def advanced_search_with_filters(self, query: str, filters: Dict = None, limit: int = 10, score_threshold: float = 0.6) -> List[Dict]:
        """Filtreli arama yap"""
        print(f"üîç Filtreli arama: '{query}' - Filtreler: {filters}")
        
        try:
            # Query'yi BGE-M3 ile vekt√∂rize et
            query_embeddings = self.bge_model.encode([query])
            query_vector = query_embeddings['dense_vecs'][0].tolist()
            
            # Filter olu≈ütur
            query_filter = None
            if filters:
                from qdrant_client.models import Filter, FieldCondition, MatchValue
                conditions = []
                for key, value in filters.items():
                    conditions.append(FieldCondition(key=key, match=MatchValue(value=value)))
                query_filter = Filter(must=conditions)
            
            # Qdrant'ta filtreli arama yap
            search_results = self.qdrant_client.query_points(
                collection_name=self.config.COLLECTION_NAME,
                query=query_vector,
                query_filter=query_filter,
                limit=limit,
                score_threshold=score_threshold
            )
            
            # Sonu√ßlarƒ± formatla
            results = []
            for point in search_results.points:
                results.append({
                    'score': point.score,
                    'payload': point.payload
                })
            
            print(f"üìä {len(results)} filtreli sonu√ß bulundu")
            return results
            
        except Exception as e:
            print(f"‚ùå Filtreli arama hatasƒ±: {e}")
            return []
    
    def get_collection_info(self) -> dict:
        """Koleksiyon bilgilerini al"""
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                "vectors_count": info.vectors_count,
                "status": info.status,
                "embedding_model": "BGE-M3",
                "embedding_dim": self.config.EMBEDDING_DIM
            }
        except Exception as e:
            return {"error": str(e)}

# Ana Pipeline Sƒ±nƒ±fƒ±
class YargitayPipeline:
    """Ana pipeline sƒ±nƒ±fƒ±"""
    
    def __init__(self, config: Config):
        self.processor = YargitaySemanticProcessor(config)
        self.config = config
    
    def full_pipeline(self, csv_path: str = None):
        """Tam pipeline'ƒ± √ßalƒ±≈ütƒ±r"""
        csv_path = csv_path or self.config.CSV_FILE
        
        print("üöÄ Yargƒ±tay BGE-M3 Semantic Pipeline Ba≈ülƒ±yor")
        print("=" * 50)
        
        # 1. BGE-M3 modelini test et
        embedding_dim = self.processor.test_bge_connection()
        if not embedding_dim:
            return False
        
        # 2. Koleksiyon olu≈ütur
        self.processor.create_qdrant_collection(recreate=True)
        
        # 3. CSV'yi i≈üle
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("‚ùå ƒ∞≈ülenecek chunk bulunamadƒ±")
            return False
        
        # 4. Qdrant'a y√ºkle
        self.processor.upload_to_qdrant(chunks)
        
        # 5. Bilgileri g√∂ster
        info = self.processor.get_collection_info()
        print("\nüìä Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        
        return True
    
    def interactive_search(self):
        """ƒ∞nteraktif arama aray√ºz√º"""
        print("\n" + "=" * 50)
        print("üèõÔ∏è YARGITAY BGE-M3 SEMANTƒ∞K ARAMA Sƒ∞STEMƒ∞")
        print("=" * 50)
        
        while True:
            print("\nüîç Arama Se√ßenekleri:")
            print("1. Basit arama")
            print("2. Filtreli arama")
            print("3. Ana men√ºye d√∂n")
            
            search_choice = input("Se√ßiminiz (1-3): ")
            
            if search_choice == "3":
                break
            elif search_choice not in ["1", "2"]:
                print("‚ùå Ge√ßersiz se√ßim!")
                continue
            
            query = input("\nüîç Arama metni (√ßƒ±kmak i√ßin 'q'): ")
            if query.lower() in ['q', 'quit', 'exit']:
                break
            
            if not query.strip():
                continue
            
            try:
                limit = int(input("üìä Ka√ß sonu√ß? (varsayƒ±lan 5): ") or "5")
                #threshold = float(input("üéØ Minimum benzerlik skoru? (varsayƒ±lan 0.6): ") or "0.6")
            except:
                limit = 5
                #threshold = 0.6
            
            # Arama tipini belirle
            if search_choice == "1":
                results = self.processor.search_semantic(query, limit=limit)
            else:
                # Filtreli arama
                print("\nüîß Filtre Se√ßenekleri (bo≈ü bƒ±rakabilirsiniz):")
                daire_filter = input("Daire filtresi (√∂rn: '6. Hukuk Dairesi'): ").strip()
                
                filters = {}
                if daire_filter:
                    filters['daire'] = daire_filter
                
                results = self.processor.advanced_search_with_filters(
                    query, filters=filters if filters else None, 
                    limit=limit
                )
            
            if not results:
                print("‚ùå Sonu√ß bulunamadƒ±")
                continue
            
            print(f"\nüìã {len(results)} sonu√ß bulundu:")
            print("-" * 60)
            
            for i, result in enumerate(results, 1):
                payload = result['payload']
                print(f"\n{i}. üìÑ BGE-M3 Benzerlik Skoru: {result['score']:.3f}")
                print(f"   ‚öñÔ∏è Esas No: {payload.get('esas_no', 'N/A')}")
                print(f"   üìã Karar No: {payload.get('karar_no', 'N/A')}")
                print(f"   üèõÔ∏è Daire: {payload.get('daire', 'N/A')}")
                print(f"   üìÖ Tarih: {payload.get('tarih', 'N/A')}")
                print(f"   üî§ Token: {payload.get('token_count', 'N/A')}")
                print(f"   üìù Metin √ñnizleme:")
                
                text = payload.get('text', '')
                preview = text[:300] + "..." if len(text) > 300 else text
                print(f"      {preview}")
                print("-" * 60)

# Kullanƒ±m √∂rneƒüi ve main fonksiyon
def main():
    """Build the configuration, create the pipeline and serve the console menu.

    The menu is shown repeatedly until option 4 is chosen.
    """
    cfg = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,  # chunk size in tokens
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="bge_m3_chunks",
        EMBEDDING_DIM=1024,
        BATCH_SIZE=16,  # tune to the available GPU memory
        USE_FP16=True,
        DEVICE="cuda" if torch.cuda.is_available() else "cpu"
    )
    app = YargitayPipeline(cfg)

    separator = "=" * 50
    menu_lines = (
        "\n" + separator,
        "üèõÔ∏è YARGITAY BGE-M3 SEMANTƒ∞K CHUNK Sƒ∞STEMƒ∞",
        separator,
        "1. Tam pipeline √ßalƒ±≈ütƒ±r (CSV ‚Üí Semantic Chunks ‚Üí BGE-M3 ‚Üí Qdrant)",
        "2. ƒ∞nteraktif arama yap",
        "3. Koleksiyon bilgilerini g√∂ster",
        "4. √áƒ±kƒ±≈ü",
    )

    while True:
        for line in menu_lines:
            print(line)

        selection = input("\nSe√ßiminiz (1-4): ")

        if selection == "4":
            print("üëã G√∂r√º≈ü√ºr√ºz!")
            break

        if selection == "1":
            # An empty answer keeps the configured CSV path
            chosen_path = input(f"CSV dosya yolu (Enter: {cfg.CSV_FILE}): ").strip() or cfg.CSV_FILE
            if app.full_pipeline(chosen_path):
                print("‚úÖ BGE-M3 Pipeline ba≈üarƒ±yla tamamlandƒ±!")
            else:
                print("‚ùå Pipeline hatasƒ±!")
        elif selection == "2":
            app.interactive_search()
        elif selection == "3":
            summary = app.processor.get_collection_info()
            print("\nüìä Koleksiyon Bilgileri:")
            print(json.dumps(summary, indent=2, ensure_ascii=False))
        else:
            print("‚ùå Ge√ßersiz se√ßim!")

if __name__ == "__main__":
    # Verify that the BGE-M3 dependency is installed before starting the menu
    try:
        from FlagEmbedding import BGEM3FlagModel
        print("‚úÖ FlagEmbedding k√ºt√ºphanesi y√ºkl√º")
    except ImportError:
        print("‚ùå FlagEmbedding k√ºt√ºphanesi bulunamadƒ±!")
        print("Kurulum i√ßin: pip install FlagEmbedding")
        # `exit()` is an interactive helper injected by `site`/IPython and is
        # not guaranteed to exist; raise SystemExit directly instead.
        raise SystemExit(1)
    
    main()

In [2]:
# SemChunk + BGE-M3 + Qdrant Entegrasyon
# Yargƒ±tay Kararlarƒ± i√ßin Semantic Chunking Pipeline

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict, Any
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch

print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# Konfig√ºrasyon
@dataclass
class Config:
    """Settings for the SemChunk + BGE-M3 + Qdrant pipeline."""
    # BGE-M3 settings
    BGE_MODEL_NAME: str = "BAAI/bge-m3"  # BGE-M3 model name
    USE_FP16: bool = True  # memory optimisation
    # The default is computed once, when this class body is executed
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    
    # SemChunk settings
    TOKEN_SIZE: int = 512  # chunk size (tokens)
    ENCODING_NAME: str = "cl100k_base"  # tiktoken encoding
    
    # Qdrant settings
    QDRANT_URL: str = "http://localhost:6333"  # local Qdrant
    COLLECTION_NAME: str = "yargitay_bge_m3_chunks"
    EMBEDDING_DIM: int = 1024  # BGE-M3 dense embedding size
    
    # File settings
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    BATCH_SIZE: int = 32  # batch size tuned for BGE-M3

class YargitaySemanticProcessor:
    """Yargƒ±tay kararlarƒ± i√ßin semantic chunking ve vector search"""
    
    def __init__(self, config: Config):
        """Set up the chunker, the BGE-M3 model and the Qdrant client.

        Args:
            config: pipeline settings; the tokenizer encoding, chunk size,
                model name, precision flag, device and Qdrant URL are read
                from it.
        """
        self.config = config

        # Report which compute device is available
        device_message = (
            f"üöÄ GPU kullanƒ±lƒ±yor: {torch.cuda.get_device_name()}"
            if torch.cuda.is_available()
            else "üíª CPU kullanƒ±lƒ±yor"
        )
        print(device_message)

        # Tokenizer-backed semantic chunker
        tokenizer = tiktoken.get_encoding(config.ENCODING_NAME)
        self.encoding = tokenizer
        self.chunker = semchunk.chunkerify(tokenizer, config.TOKEN_SIZE)

        # Embedding model
        model_name = config.BGE_MODEL_NAME
        print(f"üîÆ BGE-M3 modeli y√ºkleniyor... ({model_name})")
        self.bge_model = BGEM3FlagModel(
            model_name,
            use_fp16=config.USE_FP16,
            device=config.DEVICE
        )

        # Vector store client
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        ready_messages = (
            f"‚úÖ SemChunk chunker hazƒ±r (Token boyutu: {config.TOKEN_SIZE})",
            f"‚úÖ BGE-M3 model hazƒ±r ({model_name})",
            f"‚úÖ Qdrant client hazƒ±r ({config.QDRANT_URL})",
        )
        for message in ready_messages:
            print(message)
    
    def test_bge_connection(self):
        """BGE-M3 modelini test et"""
        try:
            test_text = ["Yargƒ±tay 6. Hukuk Dairesi'nin ihtiyati tedbir kararƒ±"]
            embeddings = self.bge_model.encode(test_text)
            
            # BGE-M3'den dense embedding al
            dense_embedding = embeddings['dense_vecs'][0]
            embedding_dim = len(dense_embedding)
            
            print(f"‚úÖ BGE-M3 test ba≈üarƒ±lƒ± - Dense embedding boyutu: {embedding_dim}")
            print(f"üîç Sparse embedding mevcut: {'colbert_vecs' in embeddings}")
            return embedding_dim
        except Exception as e:
            print(f"‚ùå BGE-M3 baƒülantƒ± hatasƒ±: {e}")
            return None
    
    def create_qdrant_collection(self, recreate: bool = False):
        """Qdrant koleksiyonu olu≈ütur"""
        collection_name = self.config.COLLECTION_NAME
        
        # Koleksiyon varsa ve recreate True ise sil
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"üóëÔ∏è Eski koleksiyon silindi: {collection_name}")
            except:
                pass
        
        # Koleksiyon yoksa olu≈ütur
        try:
            collections = self.qdrant_client.get_collections().collections
            collection_names = [c.name for c in collections]
            
            if collection_name not in collection_names:
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=VectorParams(
                        size=self.config.EMBEDDING_DIM,
                        distance=Distance.COSINE
                    )
                )
                print(f"‚úÖ Koleksiyon olu≈üturuldu: {collection_name} (Boyut: {self.config.EMBEDDING_DIM})")
            else:
                print(f"‚ÑπÔ∏è Koleksiyon zaten var: {collection_name}")
                
        except Exception as e:
            print(f"‚ùå Koleksiyon olu≈üturma hatasƒ±: {e}")
            raise
    
    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Metni semantic olarak chunk'lara b√∂l"""
        if not text or text.strip() == "":
            return []
        
        try:
            # SemChunk ile metni b√∂l
            chunks = self.chunker(text)
            
            result_chunks = []
            for i, chunk_text in enumerate(chunks):
                if chunk_text.strip():  # Bo≈ü chunk'larƒ± atla
                    chunk_data = {
                        'chunk_id': i,
                        'text': chunk_text.strip(),
                        'token_count': len(self.encoding.encode(chunk_text)),
                        'char_count': len(chunk_text),
                    }
                    
                    # Metadata ekle
                    if metadata:
                        chunk_data.update(metadata)
                    
                    result_chunks.append(chunk_data)
            
            return result_chunks
            
        except Exception as e:
            print(f"‚ùå Chunking hatasƒ±: {e}")
            return []
    
    def create_embeddings_bge(self, texts: List[str], batch_size: int = None) -> List[List[float]]:
        """Metinleri BGE-M3 ile embedding'e √ßevir"""
        if batch_size is None:
            batch_size = self.config.BATCH_SIZE
            
        all_embeddings = []
        
        print(f"üîÆ BGE-M3 ile {len(texts)} metin i≈üleniyor...")
        
        # BGE-M3 i√ßin batch processing
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            
            try:
                # BGE-M3 ile embedding olu≈ütur
                embeddings_result = self.bge_model.encode(batch_texts)
                
                # Dense embedding'leri al (1024 boyut)
                dense_embeddings = embeddings_result['dense_vecs']
                
                # List formatƒ±na √ßevir
                for embedding in dense_embeddings:
                    all_embeddings.append(embedding.tolist())
                
                print(f"  üìä BGE-M3 Embedding: {i+len(batch_texts)}/{len(texts)}")
                
                # GPU memory temizliƒüi (gerekirse)
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                
            except Exception as e:
                print(f"‚ùå BGE-M3 Embedding hatasƒ± (batch {i//batch_size + 1}): {e}")
                # Hata durumunda sƒ±fƒ±r embedding ekle
                for _ in batch_texts:
                    all_embeddings.append([0.0] * self.config.EMBEDDING_DIM)
        
        return all_embeddings
    
    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """CSV dosyasƒ±nƒ± i≈üle ve chunk'larƒ± olu≈ütur"""
        print(f"üìÑ CSV dosyasƒ± okunuyor: {csv_path}")
        
        try:
            df = pd.read_csv(csv_path)
            print(f"üìä {len(df)} satƒ±r veri y√ºklendi")
            print(f"üìã Mevcut s√ºtunlar: {df.columns.tolist()}")
        except Exception as e:
            print(f"‚ùå CSV okuma hatasƒ±: {e}")
            return []
        
        # Ana metin s√ºtununu belirle (√∂ncelik sƒ±rasƒ±na g√∂re)
        text_columns = ['rawText', 'chunk_text', 'text', 'content', 'metin']
        text_column = None
        
        for col in text_columns:
            if col in df.columns:
                text_column = col
                print(f"‚úÖ Ana metin s√ºtunu bulundu: '{col}'")
                break
        
        if not text_column:
            print(f"‚ùå Ana metin s√ºtunu bulunamadƒ±. Kontrol edilen s√ºtunlar: {text_columns}")
            return []
        
        all_chunks = []
        
        print("üîÑ Semantic chunking ba≈ülƒ±yor...")
        for idx, row in df.iterrows():
            # Ana metni al
            text = row.get(text_column, '')
            
            if not text or pd.isna(text):
                print(f"‚ö†Ô∏è Satƒ±r {idx}: Bo≈ü metin atlandƒ±")
                continue
            
            # Metadata hazƒ±rla (CSV yapƒ±nƒ±za g√∂re g√ºncellenmi≈ü)
            metadata = {
                'original_index': idx,
                'esas_no': row.get('esasNo', '') or row.get('esas_no', ''),
                'karar_no': row.get('kararNo', '') or row.get('karar_no', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', '') or row.get('dates', ''),
                'esas_no_num': row.get('esasNo_num', ''),
                'esas_no_tip': row.get('esasNo_tip', ''),
                'karar_no_num': row.get('kararNo_num', ''),
                'karar_no_tip': row.get('kararNo_tip', ''),
                'document_id': row.get('_id', ''),
            }
            
            # Semantic chunking yap
            chunks = self.semantic_chunk_text(str(text), metadata)
            all_chunks.extend(chunks)
            
            # Progress g√∂ster
            if (idx + 1) % 5 == 0:  # Daha sƒ±k progress g√∂ster (az veri olduƒüu i√ßin)
                print(f"  ‚úÖ ƒ∞≈ülenen satƒ±r: {idx + 1}/{len(df)} (Toplam chunk: {len(all_chunks)})")
        
        print(f"üß© Toplam {len(all_chunks)} chunk olu≈üturuldu")
        return all_chunks
    
    def upload_to_qdrant(self, chunks: List[Dict]):
        """Chunk'larƒ± Qdrant'a y√ºkle"""
        if not chunks:
            print("‚ùå Y√ºklenecek chunk yok")
            return
        
        print(f"üöÄ {len(chunks)} chunk Qdrant'a y√ºkleniyor...")
        
        # Metinleri topla
        texts = [chunk['text'] for chunk in chunks]
        
        # BGE-M3 ile embedding'leri olu≈ütur
        print("üîÆ BGE-M3 embedding'ler olu≈üturuluyor...")
        embeddings = self.create_embeddings_bge(texts)
        
        if len(embeddings) != len(chunks):
            print(f"‚ùå Embedding sayƒ±sƒ± uyumsuz: {len(embeddings)} vs {len(chunks)}")
            return
        
        # Qdrant point'leri hazƒ±rla
        points = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            point = PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload=chunk
            )
            points.append(point)
        
        # Batch halinde y√ºkle
        batch_size = self.config.BATCH_SIZE
        print(f"üì¶ {batch_size} batch size ile y√ºkleniyor...")
        
        for i in range(0, len(points), batch_size):
            batch = points[i:i + batch_size]
            
            try:
                self.qdrant_client.upsert(
                    collection_name=self.config.COLLECTION_NAME,
                    points=batch
                )
                print(f"  ‚úÖ Batch y√ºklendi: {min(i + batch_size, len(points))}/{len(points)}")
                
            except Exception as e:
                print(f"‚ùå Batch y√ºkleme hatasƒ±: {e}")
        
        print("üéâ Y√ºkleme tamamlandƒ±!")
    
    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = 0.6) -> List[Dict]:
        """BGE-M3 ile semantic arama yap"""
        print(f"üîç Arama: '{query}'")
        
        try:
            # Query'yi BGE-M3 ile vekt√∂rize et
            query_embeddings = self.bge_model.encode([query])
            query_vector = query_embeddings['dense_vecs'][0].tolist()
            
            # Qdrant'ta ara (g√ºncel query_points metodu)
            search_results = self.qdrant_client.query_points(
                collection_name=self.config.COLLECTION_NAME,
                query=query_vector,
                limit=limit,
                score_threshold=score_threshold
            )
            
            # Sonu√ßlarƒ± formatla
            results = []
            for point in search_results.points:
                results.append({
                    'score': point.score,
                    'payload': point.payload
                })
            
            print(f"üìä {len(results)} sonu√ß bulundu")
            return results
            
        except Exception as e:
            print(f"‚ùå Arama hatasƒ±: {e}")
            return []
    
    def advanced_search_with_filters(self, query: str, filters: Dict = None, limit: int = 10, score_threshold: float = 0.6) -> List[Dict]:
        """Filtreli arama yap"""
        print(f"üîç Filtreli arama: '{query}' - Filtreler: {filters}")
        
        try:
            # Query'yi BGE-M3 ile vekt√∂rize et
            query_embeddings = self.bge_model.encode([query])
            query_vector = query_embeddings['dense_vecs'][0].tolist()
            
            # Filter olu≈ütur
            query_filter = None
            if filters:
                from qdrant_client.models import Filter, FieldCondition, MatchValue
                conditions = []
                for key, value in filters.items():
                    conditions.append(FieldCondition(key=key, match=MatchValue(value=value)))
                query_filter = Filter(must=conditions)
            
            # Qdrant'ta filtreli arama yap
            search_results = self.qdrant_client.query_points(
                collection_name=self.config.COLLECTION_NAME,
                query=query_vector,
                query_filter=query_filter,
                limit=limit,
                score_threshold=score_threshold
            )
            
            # Sonu√ßlarƒ± formatla
            results = []
            for point in search_results.points:
                results.append({
                    'score': point.score,
                    'payload': point.payload
                })
            
            print(f"üìä {len(results)} filtreli sonu√ß bulundu")
            return results
            
        except Exception as e:
            print(f"‚ùå Filtreli arama hatasƒ±: {e}")
            return []
    
    def get_collection_info(self) -> dict:
        """Koleksiyon bilgilerini al"""
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                "vectors_count": info.vectors_count,
                "status": info.status,
                "embedding_model": "BGE-M3",
                "embedding_dim": self.config.EMBEDDING_DIM
            }
        except Exception as e:
            return {"error": str(e)}

# Ana Pipeline Sƒ±nƒ±fƒ±
class YargitayPipeline:
    """Ana pipeline sƒ±nƒ±fƒ±"""
    
    def __init__(self, config: Config):
        self.processor = YargitaySemanticProcessor(config)
        self.config = config
    
    def full_pipeline(self, csv_path: str = None):
        """Tam pipeline'ƒ± √ßalƒ±≈ütƒ±r"""
        csv_path = csv_path or self.config.CSV_FILE
        
        print("üöÄ Yargƒ±tay BGE-M3 Semantic Pipeline Ba≈ülƒ±yor")
        print("=" * 50)
        
        # 1. BGE-M3 modelini test et
        embedding_dim = self.processor.test_bge_connection()
        if not embedding_dim:
            return False
        
        # 2. Koleksiyon olu≈ütur
        self.processor.create_qdrant_collection(recreate=True)
        
        # 3. CSV'yi i≈üle
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("‚ùå ƒ∞≈ülenecek chunk bulunamadƒ±")
            return False
        
        # 4. Qdrant'a y√ºkle
        self.processor.upload_to_qdrant(chunks)
        
        # 5. Bilgileri g√∂ster
        info = self.processor.get_collection_info()
        print("\nüìä Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        
        return True
    
    def interactive_search(self):
        """ƒ∞nteraktif arama aray√ºz√º"""
        print("\n" + "=" * 50)
        print("üèõÔ∏è YARGITAY BGE-M3 SEMANTƒ∞K ARAMA Sƒ∞STEMƒ∞")
        print("=" * 50)
        
        while True:
            print("\nüîç Arama Se√ßenekleri:")
            print("1. Basit arama")
            print("2. Filtreli arama")
            print("3. Ana men√ºye d√∂n")
            
            search_choice = input("Se√ßiminiz (1-3): ")
            
            if search_choice == "3":
                break
            elif search_choice not in ["1", "2"]:
                print("‚ùå Ge√ßersiz se√ßim!")
                continue
            
            query = input("\nüîç Arama metni (√ßƒ±kmak i√ßin 'q'): ")
            if query.lower() in ['q', 'quit', 'exit']:
                break
            
            if not query.strip():
                continue
            
            try:
                limit = int(input("üìä Ka√ß sonu√ß? (varsayƒ±lan 5): ") or "5")
                threshold = float(input("üéØ Minimum benzerlik skoru? (varsayƒ±lan 0.6): ") or "0.6")
            except:
                limit = 5
                threshold = 0.6
            
            # Arama tipini belirle
            if search_choice == "1":
                results = self.processor.search_semantic(query, limit=limit, score_threshold=threshold)
            else:
                # Filtreli arama
                print("\nüîß Filtre Se√ßenekleri (bo≈ü bƒ±rakabilirsiniz):")
                daire_filter = input("Daire filtresi (√∂rn: '6. Hukuk Dairesi'): ").strip()
                
                filters = {}
                if daire_filter:
                    filters['daire'] = daire_filter
                
                results = self.processor.advanced_search_with_filters(
                    query, filters=filters if filters else None, 
                    limit=limit, score_threshold=threshold
                )
            
            if not results:
                print("‚ùå Sonu√ß bulunamadƒ±")
                continue
            
            print(f"\nüìã {len(results)} sonu√ß bulundu:")
            print("-" * 60)
            
            for i, result in enumerate(results, 1):
                payload = result['payload']
                print(f"\n{i}. üìÑ BGE-M3 Benzerlik Skoru: {result['score']:.3f}")
                print(f"   ‚öñÔ∏è Esas No: {payload.get('esas_no', 'N/A')}")
                print(f"   üìã Karar No: {payload.get('karar_no', 'N/A')}")
                print(f"   üèõÔ∏è Daire: {payload.get('daire', 'N/A')}")
                print(f"   üìÖ Tarih: {payload.get('tarih', 'N/A')}")
                print(f"   üî§ Token: {payload.get('token_count', 'N/A')}")
                print(f"   üìù Metin √ñnizleme:")
                
                text = payload.get('text', '')
                preview = text[:300] + "..." if len(text) > 300 else text
                print(f"      {preview}")
                print("-" * 60)

# Kullanƒ±m √∂rneƒüi ve main fonksiyon
def main():
    """Build the configuration, create the pipeline and serve the console menu.

    The menu is shown repeatedly until option 4 is chosen.
    """
    cfg = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,  # chunk size in tokens
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="bge_m3_chunks",
        EMBEDDING_DIM=1024,
        BATCH_SIZE=16,  # tune to the available GPU memory
        USE_FP16=True,
        DEVICE="cuda" if torch.cuda.is_available() else "cpu"
    )
    app = YargitayPipeline(cfg)

    separator = "=" * 50
    menu_lines = (
        "\n" + separator,
        "üèõÔ∏è YARGITAY BGE-M3 SEMANTƒ∞K CHUNK Sƒ∞STEMƒ∞",
        separator,
        "1. Tam pipeline √ßalƒ±≈ütƒ±r (CSV ‚Üí Semantic Chunks ‚Üí BGE-M3 ‚Üí Qdrant)",
        "2. ƒ∞nteraktif arama yap",
        "3. Koleksiyon bilgilerini g√∂ster",
        "4. √áƒ±kƒ±≈ü",
    )

    while True:
        for line in menu_lines:
            print(line)

        selection = input("\nSe√ßiminiz (1-4): ")

        if selection == "4":
            print("üëã G√∂r√º≈ü√ºr√ºz!")
            break

        if selection == "1":
            # An empty answer keeps the configured CSV path
            chosen_path = input(f"CSV dosya yolu (Enter: {cfg.CSV_FILE}): ").strip() or cfg.CSV_FILE
            if app.full_pipeline(chosen_path):
                print("‚úÖ BGE-M3 Pipeline ba≈üarƒ±yla tamamlandƒ±!")
            else:
                print("‚ùå Pipeline hatasƒ±!")
        elif selection == "2":
            app.interactive_search()
        elif selection == "3":
            summary = app.processor.get_collection_info()
            print("\nüìä Koleksiyon Bilgileri:")
            print(json.dumps(summary, indent=2, ensure_ascii=False))
        else:
            print("‚ùå Ge√ßersiz se√ßim!")

if __name__ == "__main__":
    # Verify that the BGE-M3 dependency is installed before starting the menu
    try:
        from FlagEmbedding import BGEM3FlagModel
        print("‚úÖ FlagEmbedding k√ºt√ºphanesi y√ºkl√º")
    except ImportError:
        print("‚ùå FlagEmbedding k√ºt√ºphanesi bulunamadƒ±!")
        print("Kurulum i√ßin: pip install FlagEmbedding")
        # `exit()` is an interactive helper injected by `site`/IPython and is
        # not guaranteed to exist; raise SystemExit directly instead.
        raise SystemExit(1)
    
    main()

True
‚úÖ FlagEmbedding k√ºt√ºphanesi y√ºkl√º
üöÄ GPU kullanƒ±lƒ±yor: NVIDIA RTX A6000
üîÆ BGE-M3 modeli y√ºkleniyor... (BAAI/bge-m3)


Fetching 30 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:00<00:00, 50091.21it/s]


‚úÖ SemChunk chunker hazƒ±r (Token boyutu: 512)
‚úÖ BGE-M3 model hazƒ±r (BAAI/bge-m3)
‚úÖ Qdrant client hazƒ±r (http://localhost:6333)

üèõÔ∏è YARGITAY BGE-M3 SEMANTƒ∞K CHUNK Sƒ∞STEMƒ∞
1. Tam pipeline √ßalƒ±≈ütƒ±r (CSV ‚Üí Semantic Chunks ‚Üí BGE-M3 ‚Üí Qdrant)
2. ƒ∞nteraktif arama yap
3. Koleksiyon bilgilerini g√∂ster
4. √áƒ±kƒ±≈ü

üèõÔ∏è YARGITAY BGE-M3 SEMANTƒ∞K ARAMA Sƒ∞STEMƒ∞

üîç Arama Se√ßenekleri:
1. Basit arama
2. Filtreli arama
3. Ana men√ºye d√∂n
üîç Arama: 'ihtiyati tedbir tazminat'


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


üìä 5 sonu√ß bulundu

üìã 5 sonu√ß bulundu:
------------------------------------------------------------

1. üìÑ BGE-M3 Benzerlik Skoru: 0.668
   ‚öñÔ∏è Esas No: 2022/3281 E.
   üìã Karar No: 2024/117 K.
   üèõÔ∏è Daire: 6.HukukDairesisi
   üìÖ Tarih: 30.12.2011,26.06.2009,25.12.2009,04.01.2010,05.01.2010,12.01.2010,04.06.2013,25.12.2009,31.12.2009,04.01.2010,05.01.2010,08.01.2010,13.01.2010,05.01.2010,14.01.2010,05.01.2010,14.01.2010,11.01.2024
   üî§ Token: 413
   üìù Metin √ñnizleme:
      maddesi, ihtiyati tedbir kararƒ±nƒ±n haksƒ±z olduƒüunun belirlenmesi halinde tedbir kararƒ± y√ºz√ºnden uƒüranƒ±lan zararƒ±n tazminini d√ºzenlediƒüini, ihtiyati tedbir kararƒ±nƒ± icra ettiren tarafƒ±n yasal s√ºrede dava a√ßmamasƒ± halinde ihtiyati tedbirin haksƒ±z konulduƒüunun kabul√º gerektiƒüi, kaldƒ± ki s√ºresinde dava ...
------------------------------------------------------------

2. üìÑ BGE-M3 Benzerlik Skoru: 0.649
   ‚öñÔ∏è Esas No: 2022/3281 E.
   üìã Karar No: 2024/117 K.
   