In [None]:
# SemChunk + BGE-M3 + Qdrant Entegrasyon
# Yargıtay Kararları için Semantic Chunking Pipeline

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict, Any
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch
from torch import nn 

# Load environment variables from the project's .env file and print the
# boolean that load_dotenv returns.
print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

class EmbedReducer(nn.Module):
    """Linear projection that maps embeddings from input_dim to output_dim.

    Nothing in this file trains the layer, so its weights are whatever the
    initialiser produces. With the stock random initialisation every process
    would get a different projection, and vectors stored in one run could not
    be compared with query vectors produced in another. The weights are
    therefore drawn from a private, seeded generator by default.

    Args:
        input_dim: Size of the incoming embedding.
        output_dim: Size of the reduced embedding.
        seed: Integer seed for the weight initialisation. Pass None to keep
            torch's default (non-deterministic) initialisation.
    """

    def __init__(self, input_dim=1024, output_dim=512, seed=0):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

        if seed is not None:
            # A private generator keeps the global torch RNG state untouched.
            generator = torch.Generator().manual_seed(seed)
            # Same +/- 1/sqrt(fan_in) range nn.Linear uses by default.
            bound = input_dim ** -0.5 if input_dim > 0 else 0.0
            with torch.no_grad():
                self.linear.weight.copy_(
                    torch.empty(output_dim, input_dim).uniform_(-bound, bound, generator=generator)
                )
                self.linear.bias.copy_(
                    torch.empty(output_dim).uniform_(-bound, bound, generator=generator)
                )

    def forward(self, x):
        """Apply the projection to x and return the reduced tensor."""
        return self.linear(x)
    
# Konfigürasyon
@dataclass
class Config:
    """Settings for the chunking, embedding and Qdrant steps of the pipeline."""

    # BGE-M3 settings
    BGE_MODEL_NAME: str = "BAAI/bge-m3"  # model name handed to BGEM3FlagModel
    USE_FP16: bool = True  # memory optimisation, forwarded as use_fp16
    # Evaluated once, when this class body is executed.
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"

    # SemChunk settings
    TOKEN_SIZE: int = 512  # chunk size in tokens
    ENCODING_NAME: str = "cl100k_base"  # tiktoken encoding name

    # Qdrant settings
    QDRANT_URL: str = "http://localhost:6333"  # local Qdrant
    COLLECTION_NAME: str = "yargitay_bge_m3_chunks"
    # Size of the vectors stored in the collection. This is the reduced size
    # produced by EmbedReducer, not the 1024-wide vector fed into it.
    EMBEDDING_DIM: int = 512

    # File settings
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    BATCH_SIZE: int = 100  # number of texts per embedding batch
    # Points per Qdrant upsert. Annotated so it is a real dataclass field and
    # can be overridden through Config(db_batch=...).
    db_batch: int = 256
class YargitaySemanticProcessor:
    """Yargıtay kararları için semantic chunking ve vector search"""
    
    def __init__(self, config: Config):
        """Build the chunker, the embedding model, the reducer and the Qdrant client.

        Args:
            config: Pipeline settings; the model name, device, token size,
                embedding size and Qdrant URL are read from it.
        """
        self.config = config

        # Report whether torch can see a GPU.
        if torch.cuda.is_available():
            print(f"🚀 GPU kullanılıyor: {torch.cuda.get_device_name()}")
        else:
            print("💻 CPU kullanılıyor")

        # SemChunk chunker driven by the tiktoken encoding.
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # Load the BGE-M3 model.
        print(f"🔮 BGE-M3 modeli yükleniyor... ({config.BGE_MODEL_NAME})")
        self.bge_model = BGEM3FlagModel(
            config.BGE_MODEL_NAME,
            use_fp16=config.USE_FP16,
            device=config.DEVICE
        )

        # Put the reducer on the configured device (a hard-coded 'cuda:0'
        # fails on CPU-only hosts) and size its output from the config so it
        # always matches the collection's vector size.
        self.reducer = EmbedReducer(1024, config.EMBEDDING_DIM).to(config.DEVICE)

        # Qdrant client.
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        print(f"✅ SemChunk chunker hazır (Token boyutu: {config.TOKEN_SIZE})")
        print(f"✅ BGE-M3 model hazır ({config.BGE_MODEL_NAME})")
        print(f"✅ Qdrant client hazır ({config.QDRANT_URL})")
    
    def test_bge_connection(self):
        """BGE-M3 modelini test et"""
        try:
            test_text = ["Yargıtay 6. Hukuk Dairesi'nin ihtiyati tedbir kararı"]
            embeddings = self.bge_model.encode(test_text)
            
            # BGE-M3'den dense embedding al
            dense_embedding = embeddings['dense_vecs'][0]
            embedding_dim = len(dense_embedding)
            
            print(f"✅ BGE-M3 test başarılı - Dense embedding boyutu: {embedding_dim}")
            print(f"🔍 Sparse embedding mevcut: {'colbert_vecs' in embeddings}")
            return embedding_dim
        except Exception as e:
            print(f"❌ BGE-M3 bağlantı hatası: {e}")
            return None
    
    def create_qdrant_collection(self, recreate: bool = False):
        """Qdrant koleksiyonu oluştur"""
        collection_name = self.config.COLLECTION_NAME
        
        # Koleksiyon varsa ve recreate True ise sil
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except:
                pass
        
        # Koleksiyon yoksa oluştur
        try:
            collections = self.qdrant_client.get_collections().collections
            collection_names = [c.name for c in collections]
            
            if collection_name not in collection_names:
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=VectorParams(
                        size=self.config.EMBEDDING_DIM,
                        distance=Distance.COSINE
                    )
                )
                print(f"✅ Koleksiyon oluşturuldu: {collection_name} (Boyut: {self.config.EMBEDDING_DIM})")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {collection_name}")
                
        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise
    
    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Metni semantic olarak chunk'lara böl"""
        if not text or text.strip() == "":
            return []
        
        try:
            # SemChunk ile metni böl
            chunks = self.chunker(text)
            
            result_chunks = []
            for i, chunk_text in enumerate(chunks):
                if chunk_text.strip():  # Boş chunk'ları atla
                    chunk_data = {
                        'chunk_id': i,
                        'text': chunk_text.strip(),
                        'token_count': len(self.encoding.encode(chunk_text)),
                        'char_count': len(chunk_text),
                    }
                    
                    # Metadata ekle
                    if metadata:
                        chunk_data.update(metadata)
                    
                    result_chunks.append(chunk_data)
            
            return result_chunks
            
        except Exception as e:
            print(f"❌ Chunking hatası: {e}")
            return []
    
    def create_embeddings_bge(self, texts: List[str], batch_size: int = 100, target_dim: int = 512) -> List[List[float]]:
        
        if batch_size is None:
            batch_size = self.config.BATCH_SIZE
            
        all_embeddings = []
        total = len(texts)
        print(f"🔮 BGE-M3 ile {len(texts)} metin işleniyor...")
        
        # BGE-M3 için batch processing
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            
            try:
                # BGE-M3 ile embedding oluştur
                embeddings_result = self.bge_model.encode(batch_texts)
                
                # dense_vecs key kontrolü
                if isinstance(embeddings_result, dict) and 'dense_vecs' in embeddings_result:
                    dense_embeddings = embeddings_result['dense_vecs']
                else:
                    dense_embeddings = embeddings_result  # direkt tensor veya list ise
                
                # Tensor'a çevir ve GPU'ya taşı
                if not isinstance(dense_embeddings, torch.Tensor):
                    dense_embeddings = torch.tensor(dense_embeddings, device=self.config.DEVICE, dtype=torch.float32)
                else:
                    dense_embeddings = dense_embeddings.to(self.config.DEVICE)
                
                # Reducer ile boyut küçültme
                reduced_vector = self.reducer(dense_embeddings)
                
                # Listeye çevir
                for embedding in reduced_vector:
                    all_embeddings.append(embedding.detach().cpu().tolist())
                
                print(f"  📊 BGE-M3 Embedding: {i+len(batch_texts)}/{len(texts)}")
                
                # GPU memory temizliği (gerekirse)
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                
            except Exception as e:
                print(f"❌ BGE-M3 Embedding hatası (batch {i//batch_size + 1}): {e}")
                # Hata durumunda sıfır embedding ekle
                for _ in batch_texts:
                    all_embeddings.append([0.0] * self.config.EMBEDDING_DIM)
        
        return all_embeddings

    
    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """CSV dosyasını işle ve chunk'ları oluştur"""
        print(f"📄 CSV dosyası okunuyor: {csv_path}")
        
        try:
            df = pd.read_csv(csv_path)
            print(f"📊 {len(df)} satır veri yüklendi")
            print(f"📋 Mevcut sütunlar: {df.columns.tolist()}")
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return []
        
        # Ana metin sütununu belirle (öncelik sırasına göre)
        text_columns = ['rawText', 'chunk_text', 'text', 'content', 'metin']
        text_column = None
        
        for col in text_columns:
            if col in df.columns:
                text_column = col
                print(f"✅ Ana metin sütunu bulundu: '{col}'")
                break
        
        if not text_column:
            print(f"❌ Ana metin sütunu bulunamadı. Kontrol edilen sütunlar: {text_columns}")
            return []
        
        all_chunks = []
        
        print("🔄 Semantic chunking başlıyor...")
        for idx, row in df.iterrows():
            # Ana metni al
            text = row.get(text_column, '')
            
            if not text or pd.isna(text):
                print(f"⚠️ Satır {idx}: Boş metin atlandı")
                continue
            
            # Metadata hazırla (CSV yapınıza göre güncellenmiş)
            metadata = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', ''),
                'document_id': row.get('_id', ''),
            }
            
            # Semantic chunking yap
            chunks = self.semantic_chunk_text(str(text), metadata)
            all_chunks.extend(chunks)
            
            # Progress göster
            if (idx + 1) % 5 == 0:  # Daha sık progress göster (az veri olduğu için)
                print(f"  ✅ İşlenen satır: {idx + 1}/{len(df)} (Toplam chunk: {len(all_chunks)})")
        
        print(f"🧩 Toplam {len(all_chunks)} chunk oluşturuldu")
        return all_chunks
    
    def upload_to_qdrant(self, chunks: List[Dict]):
        """Chunk'ları Qdrant'a yükle"""
        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return
        
        print(f"🚀 {len(chunks)} chunk Qdrant'a yükleniyor...")
        
        # Metinleri topla
        texts = [chunk['text'] for chunk in chunks]
        
        # BGE-M3 ile embedding'leri oluştur
        print("🔮 BGE-M3 embedding'ler oluşturuluyor...")
        embeddings = self.create_embeddings_bge(texts)
        
        if len(embeddings) != len(chunks):
            print(f"❌ Embedding sayısı uyumsuz: {len(embeddings)} vs {len(chunks)}")
            return
        
        # Qdrant point'leri hazırla
        points = []
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            point = PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload=chunk
            )
            points.append(point)
        
        # Batch halinde yükle
        batch_size = 256
        print(f"📦 {batch_size} batch size ile yükleniyor...")
        
        for i in range(0, len(points), batch_size):
            batch = points[i:i + batch_size]
            
            try:
                self.qdrant_client.upsert(
                    collection_name=self.config.COLLECTION_NAME,
                    points=batch
                )
                print(f"  ✅ Batch yüklendi: {min(i + batch_size, len(points))}/{len(points)}")
                
            except Exception as e:
                print(f"❌ Batch yükleme hatası: {e}")
        
        print("🎉 Yükleme tamamlandı!")
    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        """Düzeltilmiş search metodu - 512 dimension uyumlu"""

        try:
            # Query embedding çıkar (BGE-M3 dense embedding, 1024 boyut)
            query_embedding_result = self.bge_model.encode([query])
            
            # dense_vecs key kontrolü
            if isinstance(query_embedding_result, dict) and 'dense_vecs' in query_embedding_result:
                query_embedding = query_embedding_result['dense_vecs']
            else:
                query_embedding = query_embedding_result
            
            # Tensor'a çevir ve GPU'ya taşı
            if not isinstance(query_embedding, torch.Tensor):
                query_embedding = torch.tensor(query_embedding, device=self.config.DEVICE, dtype=torch.float32)
            else:
                query_embedding = query_embedding.to(self.config.DEVICE)
            
            query_embedding = query_embedding.clone().detach()
            
            # Boyutu 512'ye düşür (reducer)
            with torch.no_grad():
                reduced_query_embedding = self.reducer(query_embedding)
            
            # CPU'ya taşı ve listeye çevir
            query_vector = reduced_query_embedding[0].cpu().numpy().tolist()
            print(f"🔍 Query vector boyutu: {len(query_vector)} (hedef: {self.config.EMBEDDING_DIM})")
            
            # Qdrant araması
            search_results = self.qdrant_client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vector,
                limit=limit,
                score_threshold=score_threshold
            )
            
            # Sonuçları formatla
            results = [{'score': p.score, 'payload': p.payload} for p in search_results]
            print(f"📊 {len(results)} sonuç bulundu")
            return results
        
        except Exception as e:
            print(f"❌ Arama hatası: {e}")
            return []

    # def search_semantic(self, query: str, limit: int = 10, score_threshold: float = 0.7) -> List[Dict]:
    #     """BGE-M3 ile semantic arama yap"""
    #     print(f"🔍 Arama: '{query}'")
        
    #     try:
    #         # Query'yi BGE-M3 ile vektörize et
    #         query_embeddings = self.bge_model.encode([query]).to('cuda:0')
    #         query_vector = query_embeddings['dense_vecs'][0].tolist()
            
    #         # Qdrant'ta ara (güncel query_points metodu)
    #         search_results = self.qdrant_client.query_points(
    #             collection_name=self.config.COLLECTION_NAME,
    #             query=query_vector,
    #             limit=limit,
    #             score_threshold=score_threshold
    #         )
            
    #         # Sonuçları formatla
    #         results = []
    #         for point in search_results.points:#burda muhtemel hata verir search_results olcak verirse
    #             results.append({
    #                 'score': point.score,
    #                 'payload': point.payload
    #             })
            
    #         print(f"📊 {len(results)} sonuç bulundu")
    #         return results
            
    #     except Exception as e:
    #         print(f"❌ Arama hatası: {e}")
    #         return []
    
    def advanced_search_with_filters(self, query: str, filters: Dict = None, limit: int = 10, score_threshold: float = 0.6) -> List[Dict]:
        """Filtreli arama yap"""
        print(f"🔍 Filtreli arama: '{query}' - Filtreler: {filters}")
        
        try:
            # Query'yi BGE-M3 ile vektörize et
            query_embeddings = self.bge_model.encode([query])
            query_vector = query_embeddings['dense_vecs'][0].tolist()
            
            # Filter oluştur
            query_filter = None
            if filters:
                from qdrant_client.models import Filter, FieldCondition, MatchValue
                conditions = []
                for key, value in filters.items():
                    conditions.append(FieldCondition(key=key, match=MatchValue(value=value)))
                query_filter = Filter(must=conditions)
            
            # Qdrant'ta filtreli arama yap
            search_results = self.qdrant_client.query_points(
                collection_name=self.config.COLLECTION_NAME,
                query=query_vector,
                query_filter=query_filter,
                limit=limit,
                score_threshold=score_threshold
            )
            
            # Sonuçları formatla
            results = []
            for point in search_results.points:
                results.append({
                    'score': point.score,
                    'payload': point.payload
                })
            
            print(f"📊 {len(results)} filtreli sonuç bulundu")
            return results
            
        except Exception as e:
            print(f"❌ Filtreli arama hatası: {e}")
            return []
    
    def get_collection_info(self) -> dict:
        """Koleksiyon bilgilerini al"""
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                "vectors_count": info.vectors_count,
                "status": info.status,
                "embedding_model": "BGE-M3",
                "embedding_dim": self.config.EMBEDDING_DIM
            }
        except Exception as e:
            return {"error": str(e)}

# Ana Pipeline Sınıfı
class YargitayPipeline:
    """Ana pipeline sınıfı"""
    
    def __init__(self, config: Config):
        """Store the configuration and build the processor that uses it."""
        self.config = config
        self.processor = YargitaySemanticProcessor(self.config)
    
    def full_pipeline(self, csv_path: str = None):
        """Tam pipeline'ı çalıştır"""
        csv_path = csv_path or self.config.CSV_FILE
        
        print("🚀 Yargıtay BGE-M3 Semantic Pipeline Başlıyor")
        print("=" * 50)
        
        # 1. BGE-M3 modelini test et
        embedding_dim = self.processor.test_bge_connection()
        if not embedding_dim:
            return False
        
        # 2. Koleksiyon oluştur
        self.processor.create_qdrant_collection(recreate=True)
        
        # 3. CSV'yi işle
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("❌ İşlenecek chunk bulunamadı")
            return False
        
        # 4. Qdrant'a yükle
        self.processor.upload_to_qdrant(chunks)
        
        # 5. Bilgileri göster
        info = self.processor.get_collection_info()
        print("\n📊 Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))
        
        return True
    
    def interactive_search(self):
        """Run the interactive search prompt until the user leaves it.

        Each round asks for a search type (plain or filtered), the query text
        and the number of results, runs the search through the processor and
        prints the hits. Choosing "3" or entering 'q'/'quit'/'exit' as the
        query returns to the caller.
        """
        print("\n" + "=" * 50)
        print("🏛️ YARGITAY BGE-M3 SEMANTİK ARAMA SİSTEMİ")
        print("=" * 50)

        while True:
            print("\n🔍 Arama Seçenekleri:")
            print("1. Basit arama")
            print("2. Filtreli arama")
            print("3. Ana menüye dön")

            search_choice = input("Seçiminiz (1-3): ")

            if search_choice == "3":
                break
            elif search_choice not in ["1", "2"]:
                print("❌ Geçersiz seçim!")
                continue

            query = input("\n🔍 Arama metni (çıkmak için 'q'): ")
            if query.lower() in ['q', 'quit', 'exit']:
                break

            if not query.strip():
                continue

            # Only a malformed number falls back to the default. A bare except
            # here would also swallow Ctrl-C at the prompt.
            raw_limit = input("📊 Kaç sonuç? (varsayılan 5): ")
            try:
                limit = int(raw_limit or "5")
            except ValueError:
                limit = 5
            if limit < 1:
                limit = 5
            # No score threshold is asked for; the search methods use their
            # own defaults.

            if search_choice == "1":
                results = self.processor.search_semantic(query, limit=limit)
            else:
                # Filtered search
                print("\n🔧 Filtre Seçenekleri (boş bırakabilirsiniz):")
                daire_filter = input("Daire filtresi (örn: '6. Hukuk Dairesi'): ").strip()

                filters = {}
                if daire_filter:
                    filters['daire'] = daire_filter

                results = self.processor.advanced_search_with_filters(
                    query, filters=filters if filters else None,
                    limit=limit
                )

            if not results:
                print("❌ Sonuç bulunamadı")
                continue

            print(f"\n📋 {len(results)} sonuç bulundu:")
            print("-" * 60)

            for i, result in enumerate(results, 1):
                payload = result['payload']
                print(f"\n{i}. 📄 BGE-M3 Benzerlik Skoru: {result['score']:.3f}")
                print(f"   ⚖️ Esas No: {payload.get('esas_no', 'N/A')}")
                print(f"   📋 Karar No: {payload.get('karar_no', 'N/A')}")
                print(f"   🏛️ Daire: {payload.get('daire', 'N/A')}")
                print(f"   📅 Tarih: {payload.get('tarih', 'N/A')}")
                print(f"   🔤 Token: {payload.get('token_count', 'N/A')}")
                print(f"   📝 Metin Önizleme:")

                # Show at most the first 300 characters of the chunk.
                text = payload.get('text', '')
                preview = text[:300] + "..." if len(text) > 300 else text
                print(f"      {preview}")
                print("-" * 60)

# Kullanım örneği ve main fonksiyon
def main():
    """Build the pipeline and serve the top-level text menu until exit is chosen."""

    # Settings for this run.
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,  # chunk size
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="bge_m3_chunks",
        EMBEDDING_DIM=512,
        BATCH_SIZE=100,  # tune to the available GPU memory
        USE_FP16=True,
        DEVICE="cuda" if torch.cuda.is_available() else "cpu"
    )

    pipeline = YargitayPipeline(config)

    separator = "=" * 50
    menu_lines = (
        "1. Tam pipeline çalıştır (CSV → Semantic Chunks → BGE-M3 → Qdrant)",
        "2. İnteraktif arama yap",
        "3. Koleksiyon bilgilerini göster",
        "4. Çıkış",
    )

    while True:
        print("\n" + separator)
        print("🏛️ YARGITAY BGE-M3 SEMANTİK CHUNK SİSTEMİ")
        print(separator)
        for line in menu_lines:
            print(line)

        choice = input("\nSeçiminiz (1-4): ")

        if choice == "4":
            print("👋 Görüşürüz!")
            return

        if choice == "1":
            # An empty answer keeps the configured CSV path.
            csv_path = input(f"CSV dosya yolu (Enter: {config.CSV_FILE}): ").strip()
            if not csv_path:
                csv_path = config.CSV_FILE

            if pipeline.full_pipeline(csv_path):
                print("✅ BGE-M3 Pipeline başarıyla tamamlandı!")
            else:
                print("❌ Pipeline hatası!")
        elif choice == "2":
            pipeline.interactive_search()
        elif choice == "3":
            info = pipeline.processor.get_collection_info()
            print("\n📊 Koleksiyon Bilgileri:")
            print(json.dumps(info, indent=2, ensure_ascii=False))
        else:
            print("❌ Geçersiz seçim!")

if __name__ == "__main__":
    # Check that FlagEmbedding is importable before starting the menu.
    try:
        from FlagEmbedding import BGEM3FlagModel
        print("✅ FlagEmbedding kütüphanesi yüklü")
    except ImportError:
        print("❌ FlagEmbedding kütüphanesi bulunamadı!")
        print("Kurulum için: pip install FlagEmbedding")
        # exit() is injected by the site module and is not guaranteed to
        # exist; SystemExit works in every interpreter setup.
        raise SystemExit(1)

    main()

True
✅ FlagEmbedding kütüphanesi yüklü
🚀 GPU kullanılıyor: NVIDIA RTX A6000
🔮 BGE-M3 modeli yükleniyor... (BAAI/bge-m3)


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 243383.21it/s]


✅ SemChunk chunker hazır (Token boyutu: 512)
✅ BGE-M3 model hazır (BAAI/bge-m3)
✅ Qdrant client hazır (http://localhost:6333)

🏛️ YARGITAY BGE-M3 SEMANTİK CHUNK SİSTEMİ
1. Tam pipeline çalıştır (CSV → Semantic Chunks → BGE-M3 → Qdrant)
2. İnteraktif arama yap
3. Koleksiyon bilgilerini göster
4. Çıkış

🏛️ YARGITAY BGE-M3 SEMANTİK ARAMA SİSTEMİ

🔍 Arama Seçenekleri:
1. Basit arama
2. Filtreli arama
3. Ana menüye dön


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


🔍 Query vector boyutu: 512 (hedef: 512)
📊 0 sonuç bulundu
❌ Sonuç bulunamadı

🔍 Arama Seçenekleri:
1. Basit arama
2. Filtreli arama
3. Ana menüye dön


  search_results = self.qdrant_client.search(



🏛️ YARGITAY BGE-M3 SEMANTİK CHUNK SİSTEMİ
1. Tam pipeline çalıştır (CSV → Semantic Chunks → BGE-M3 → Qdrant)
2. İnteraktif arama yap
3. Koleksiyon bilgilerini göster
4. Çıkış
👋 Görüşürüz!


In [1]:
# SemChunk + BGE-M3 + Qdrant Entegrasyon (512 boyut embedding)
import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict, Any
from dataclasses import dataclass
import json
import torch
from torch import nn
from dotenv import load_dotenv
import os

# Load environment variables from the project's .env file and print the
# boolean that load_dotenv returns.
print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# Embed Reducer (1024 -> 512)
class EmbedReducer(nn.Module):
    """Linear projection that maps embeddings from input_dim to output_dim.

    Nothing in this file trains the layer, so its weights are whatever the
    initialiser produces. With the stock random initialisation every process
    would get a different projection, and vectors stored in one run could not
    be compared with query vectors produced in another. The weights are
    therefore drawn from a private, seeded generator by default.

    Args:
        input_dim: Size of the incoming embedding.
        output_dim: Size of the reduced embedding.
        seed: Integer seed for the weight initialisation. Pass None to keep
            torch's default (non-deterministic) initialisation.
    """

    def __init__(self, input_dim=1024, output_dim=512, seed=0):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

        if seed is not None:
            # A private generator keeps the global torch RNG state untouched.
            generator = torch.Generator().manual_seed(seed)
            # Same +/- 1/sqrt(fan_in) range nn.Linear uses by default.
            bound = input_dim ** -0.5 if input_dim > 0 else 0.0
            with torch.no_grad():
                self.linear.weight.copy_(
                    torch.empty(output_dim, input_dim).uniform_(-bound, bound, generator=generator)
                )
                self.linear.bias.copy_(
                    torch.empty(output_dim).uniform_(-bound, bound, generator=generator)
                )

    def forward(self, x):
        """Apply the projection to x and return the reduced tensor."""
        return self.linear(x)

@dataclass
class Config:
    # Pipeline settings. Every field has a default, so Config() needs no arguments.
    BGE_MODEL_NAME: str = "BAAI/bge-m3"  # model name handed to BGEM3FlagModel
    USE_FP16: bool = True  # forwarded as use_fp16 when the model is loaded
    # Evaluated once, when this class body is executed.
    DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
    TOKEN_SIZE: int = 512  # chunk size handed to semchunk.chunkerify
    ENCODING_NAME: str = "cl100k_base"  # tiktoken encoding name
    QDRANT_URL: str = "http://localhost:6333"  # URL handed to QdrantClient
    COLLECTION_NAME: str = "yargitay_bge_m3_chunks"
    EMBEDDING_DIM: int = 512  # vector size used when the collection is created
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv"
    BATCH_SIZE: int = 100  # number of texts per embedding batch

class YargitaySemanticProcessor:
    def __init__(self, config: Config):
        """Build the chunker, the embedding model, the reducer and the Qdrant client.

        Args:
            config: Pipeline settings; the model name, device, token size,
                embedding size and Qdrant URL are read from it.
        """
        self.config = config
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)
        self.bge_model = BGEM3FlagModel(config.BGE_MODEL_NAME, use_fp16=config.USE_FP16, device=config.DEVICE)
        # Size the reducer output from the config so it always matches the
        # vector size the collection is created with.
        self.reducer = EmbedReducer(1024, config.EMBEDDING_DIM).to(config.DEVICE)
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "CPU"
        print(f"✅ Model ve sistem hazır - Kullanılan cihaz: {device_name}")

    def test_bge_connection(self):
        try:
            test_text = ["Yargıtay 6. Hukuk Dairesi'nin ihtiyati tedbir kararı"]
            embeddings = self.bge_model.encode(test_text)
            dense_embedding = embeddings['dense_vecs'][0]
            print(f"✅ BGE-M3 test başarılı - Dense embedding boyutu: {len(dense_embedding)}")
            return len(dense_embedding)
        except Exception as e:
            print(f"❌ BGE-M3 bağlantı hatası: {e}")
            return None

    def create_qdrant_collection(self, recreate: bool = False):
        if recreate:
            try:
                self.qdrant_client.delete_collection(self.config.COLLECTION_NAME)
                print(f"🗑️ Eski koleksiyon silindi")
            except:
                pass

        existing = [c.name for c in self.qdrant_client.get_collections().collections]
        if self.config.COLLECTION_NAME not in existing:
            self.qdrant_client.create_collection(
                collection_name=self.config.COLLECTION_NAME,
                vectors_config=VectorParams(size=self.config.EMBEDDING_DIM, distance=Distance.COSINE)
            )
            print(f"✅ Koleksiyon oluşturuldu: {self.config.COLLECTION_NAME}")
        else:
            print(f"ℹ️ Koleksiyon zaten var")

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        if not text.strip():
            return []
        chunks = self.chunker(text)
        result_chunks = []
        for i, chunk_text in enumerate(chunks):
            if chunk_text.strip():
                chunk_data = {
                    'chunk_id': i,
                    'text': chunk_text.strip(),
                    'token_count': len(self.encoding.encode(chunk_text)),
                    'char_count': len(chunk_text)
                }
                if metadata:
                    chunk_data.update(metadata)
                result_chunks.append(chunk_data)
        return result_chunks

    def create_embeddings_bge(self, texts: List[str]) -> List[List[float]]:
        all_embeddings = []
        for i in range(0, len(texts), self.config.BATCH_SIZE):
            batch = texts[i:i + self.config.BATCH_SIZE]
            try:
                emb_result = self.bge_model.encode(batch)
                dense_embeddings = emb_result['dense_vecs']
                tensor_emb = torch.tensor(dense_embeddings, device=self.config.DEVICE, dtype=torch.float32)
                reduced = self.reducer(tensor_emb)
                all_embeddings.extend([v.detach().cpu().tolist() for v in reduced])
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"❌ Batch embedding hatası: {e}")
                all_embeddings.extend([[0.0]*self.config.EMBEDDING_DIM for _ in batch])
        return all_embeddings

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        df = pd.read_csv(csv_path)
        text_column = next((c for c in ['rawText','chunk_text','text','content','metin'] if c in df.columns), None)
        if not text_column:
            print("❌ Ana metin sütunu bulunamadı")
            return []
        all_chunks = []
        for idx, row in df.iterrows():
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            metadata = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', ''),
                'document_id': row.get('_id', ''),
            }
            all_chunks.extend(self.semantic_chunk_text(str(text), metadata))
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return
        embeddings = self.create_embeddings_bge([c['text'] for c in chunks])
        points = [PointStruct(id=str(uuid.uuid4()), vector=emb, payload=chunk)
                  for chunk, emb in zip(chunks, embeddings)]
        for i in range(0, len(points), 256):
            self.qdrant_client.upsert(collection_name=self.config.COLLECTION_NAME, points=points[i:i+256])
        print("🎉 Yükleme tamamlandı!")

    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = 0.7):
        query_emb_result = self.bge_model.encode([query])
        query_tensor = torch.tensor(query_emb_result['dense_vecs'], device=self.config.DEVICE, dtype=torch.float32)
        with torch.no_grad():
            reduced_query = self.reducer(query_tensor)
        query_vector = reduced_query[0].cpu().tolist()
        results = self.qdrant_client.query_points(
            collection_name=self.config.COLLECTION_NAME,
            query=query_vector,
            limit=limit,
            score_threshold=score_threshold
        )
        return [{'score': p.score, 'payload': p.payload} for p in results.points]

    def advanced_search_with_filters(self, query: str, filters: Dict = None, limit: int = 10, score_threshold: float = 0.6):
        query_emb_result = self.bge_model.encode([query])
        query_vector = query_emb_result['dense_vecs'][0].tolist()
        query_filter = None
        if filters:
            from qdrant_client.models import Filter, FieldCondition, MatchValue
            conditions = [FieldCondition(key=k, match=MatchValue(value=v)) for k, v in filters.items()]
            query_filter = Filter(must=conditions)
        results = self.qdrant_client.query_points(
            collection_name=self.config.COLLECTION_NAME,
            query=query_vector,
            query_filter=query_filter,
            limit=limit,
            score_threshold=score_threshold
        )
        return [{'score': p.score, 'payload': p.payload} for p in results.points]

    def get_collection_info(self):
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {"collection_name": self.config.COLLECTION_NAME, "points_count": info.points_count,
                    "vectors_count": info.vectors_count, "status": info.status, "embedding_model": "BGE-M3",
                    "embedding_dim": self.config.EMBEDDING_DIM}
        except Exception as e:
            return {"error": str(e)}

# Pipeline ve main fonksiyon aynı mantıkla entegre edildi
# interactive_search ve full_pipeline fonksiyonları reducer uyumlu


  from .autonotebook import tqdm as notebook_tqdm


True


In [2]:
class YargitayPipeline:
    """Orchestrates the ingest pipeline and the interactive search prompt.

    Wraps a ``YargitaySemanticProcessor`` built from the given configuration
    and exposes two entry points: ``full_pipeline`` (CSV -> chunks -> Qdrant)
    and ``interactive_search`` (a console search loop).
    """

    # Number of results used when the user gives no (or an unusable) limit.
    _DEFAULT_LIMIT = 5
    # Maximum number of characters of chunk text shown per result.
    _PREVIEW_CHARS = 300
    # Inputs at the query prompt that leave the search loop.
    _QUIT_WORDS = ('q', 'quit', 'exit')

    def __init__(self, config: "Config"):
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        """Run the complete ingest pipeline.

        Steps: check the embedding model, recreate the Qdrant collection,
        chunk the CSV, upload the chunks and print the collection summary.

        Args:
            csv_path: CSV file to process; falls back to ``config.CSV_FILE``
                when omitted or empty.

        Returns:
            ``True`` on success; ``False`` if the model check fails or the CSV
            yields no chunks.
        """
        csv_path = csv_path or self.config.CSV_FILE

        print("🚀 Yargıtay BGE-M3 Semantic Pipeline Başlıyor")
        print("=" * 50)

        # 1. Verify the BGE-M3 model; a falsy result means it is unusable.
        if not self.processor.test_bge_connection():
            return False

        # 2. (Re)create the collection - existing data is replaced.
        self.processor.create_qdrant_collection(recreate=True)

        # 3. Turn the CSV into chunks.
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("❌ İşlenecek chunk bulunamadı")
            return False

        # 4. Upload the chunks to Qdrant.
        self.processor.upload_to_qdrant(chunks)

        # 5. Show the resulting collection summary.
        info = self.processor.get_collection_info()
        print("\n📊 Koleksiyon Bilgileri:")
        print(json.dumps(info, indent=2, ensure_ascii=False))

        return True

    def interactive_search(self):
        """Run the console search loop until the user chooses to leave.

        Each round asks for the search type (simple / filtered), the query
        text and the number of results, runs the search through the processor
        and prints the hits. Returns ``None``.
        """
        print("\n" + "=" * 50)
        print("🏛️ YARGITAY BGE-M3 SEMANTİK ARAMA SİSTEMİ")
        print("=" * 50)

        while True:
            print("\n🔍 Arama Seçenekleri:")
            print("1. Basit arama")
            print("2. Filtreli arama")
            print("3. Ana menüye dön")

            search_choice = input("Seçiminiz (1-3): ").strip()
            if search_choice == "3":
                break
            if search_choice not in ("1", "2"):
                print("❌ Geçersiz seçim!")
                continue

            # Strip first so inputs such as " q " still leave the loop.
            query = input("\n🔍 Arama metni (çıkmak için 'q'): ").strip()
            if query.lower() in self._QUIT_WORDS:
                break
            if not query:
                continue

            limit = self._prompt_limit()
            results = self._run_search(search_choice, query, limit)

            if not results:
                print("❌ Sonuç bulunamadı")
                continue

            self._print_results(results)

    def _prompt_limit(self) -> int:
        """Ask for the result count; fall back to the default on bad input."""
        raw = input(f"📊 Kaç sonuç? (varsayılan {self._DEFAULT_LIMIT}): ").strip()
        if not raw:
            return self._DEFAULT_LIMIT
        try:
            limit = int(raw)
        except ValueError:
            # Only a parse failure is expected here; anything else (e.g.
            # KeyboardInterrupt) must propagate to the caller.
            return self._DEFAULT_LIMIT
        # A zero or negative limit cannot produce results; use the default.
        return limit if limit > 0 else self._DEFAULT_LIMIT

    def _run_search(self, search_choice: str, query: str, limit: int):
        """Dispatch to the simple ("1") or filtered ("2") processor search."""
        if search_choice == "1":
            return self.processor.search_semantic(query, limit=limit)

        print("\n🔧 Filtre Seçenekleri (boş bırakabilirsiniz):")
        daire_filter = input("Daire filtresi (örn: '6. Hukuk Dairesi'): ").strip()

        filters = {}
        if daire_filter:
            filters['daire'] = daire_filter

        return self.processor.advanced_search_with_filters(
            query,
            filters=filters or None,
            limit=limit,
        )

    def _print_results(self, results) -> None:
        """Print one formatted entry per hit, with a truncated text preview."""
        print(f"\n📋 {len(results)} sonuç bulundu:")
        print("-" * 60)

        for i, result in enumerate(results, 1):
            # Guard against hits that carry no payload at all.
            payload = result['payload'] or {}
            print(f"\n{i}. 📄 BGE-M3 Benzerlik Skoru: {result['score']:.3f}")
            print(f"   ⚖️ Esas No: {payload.get('esas_no', 'N/A')}")
            print(f"   📋 Karar No: {payload.get('karar_no', 'N/A')}")
            print(f"   🏛️ Daire: {payload.get('daire', 'N/A')}")
            print(f"   📅 Tarih: {payload.get('tarih', 'N/A')}")
            print(f"   🔤 Token: {payload.get('token_count', 'N/A')}")
            print("   📝 Metin Önizleme:")

            text = payload.get('text', '')
            if len(text) > self._PREVIEW_CHARS:
                preview = text[:self._PREVIEW_CHARS] + "..."
            else:
                preview = text
            print(f"      {preview}")
            print("-" * 60)

# Kullanım örneği ve main fonksiyon

In [3]:
# Kullanım örneği ve main fonksiyon
def main():
    """Build the configuration and drive the top-level console menu.

    The menu offers: running the full ingest pipeline, starting the
    interactive search, printing the collection summary, and quitting.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Runtime configuration for this session.
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv",
        TOKEN_SIZE=512,  # chunk size in tokens
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="bge_m3_chunks",
        EMBEDDING_DIM=512,
        BATCH_SIZE=100,  # tune to the available GPU memory
        USE_FP16=True,
        DEVICE=device,
    )

    pipeline = YargitayPipeline(config)

    separator = "=" * 50
    menu_lines = (
        "\n" + separator,
        "🏛️ YARGITAY BGE-M3 SEMANTİK CHUNK SİSTEMİ",
        separator,
        "1. Tam pipeline çalıştır (CSV → Semantic Chunks → BGE-M3 → Qdrant)",
        "2. İnteraktif arama yap",
        "3. Koleksiyon bilgilerini göster",
        "4. Çıkış",
    )

    while True:
        for line in menu_lines:
            print(line)

        choice = input("\nSeçiminiz (1-4): ")

        if choice == "4":
            print("👋 Görüşürüz!")
            break

        if choice == "1":
            # An empty answer keeps the configured default CSV path.
            entered_path = input(f"CSV dosya yolu (Enter: {config.CSV_FILE}): ").strip()
            csv_path = entered_path if entered_path else config.CSV_FILE

            if pipeline.full_pipeline(csv_path):
                print("✅ BGE-M3 Pipeline başarıyla tamamlandı!")
            else:
                print("❌ Pipeline hatası!")
        elif choice == "2":
            pipeline.interactive_search()
        elif choice == "3":
            collection_info = pipeline.processor.get_collection_info()
            print("\n📊 Koleksiyon Bilgileri:")
            print(json.dumps(collection_info, indent=2, ensure_ascii=False))
        else:
            print("❌ Geçersiz seçim!")

if __name__ == "__main__":
    # Check that the FlagEmbedding dependency is importable before starting.
    try:
        from FlagEmbedding import BGEM3FlagModel
    except ImportError:
        print("❌ FlagEmbedding kütüphanesi bulunamadı!")
        print("Kurulum için: pip install FlagEmbedding")
        # `exit()` is a helper injected by the `site` module for interactive
        # sessions and is not guaranteed to exist; raise SystemExit directly.
        raise SystemExit(1) from None
    else:
        print("✅ FlagEmbedding kütüphanesi yüklü")

    main()

✅ FlagEmbedding kütüphanesi yüklü


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 102717.65it/s]


✅ Model ve sistem hazır - Kullanılan cihaz: NVIDIA RTX A6000

🏛️ YARGITAY BGE-M3 SEMANTİK CHUNK SİSTEMİ
1. Tam pipeline çalıştır (CSV → Semantic Chunks → BGE-M3 → Qdrant)
2. İnteraktif arama yap
3. Koleksiyon bilgilerini göster
4. Çıkış

🏛️ YARGITAY BGE-M3 SEMANTİK ARAMA SİSTEMİ

🔍 Arama Seçenekleri:
1. Basit arama
2. Filtreli arama
3. Ana menüye dön


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


❌ Sonuç bulunamadı

🔍 Arama Seçenekleri:
1. Basit arama
2. Filtreli arama
3. Ana menüye dön
❌ Geçersiz seçim!

🔍 Arama Seçenekleri:
1. Basit arama
2. Filtreli arama
3. Ana menüye dön

🏛️ YARGITAY BGE-M3 SEMANTİK CHUNK SİSTEMİ
1. Tam pipeline çalıştır (CSV → Semantic Chunks → BGE-M3 → Qdrant)
2. İnteraktif arama yap
3. Koleksiyon bilgilerini göster
4. Çıkış
👋 Görüşürüz!
