In [None]:
# SemChunk + Google EmbeddingGemma + Qdrant Entegrasyon
# Yargıtay Kararları için Semantic Chunking Pipeline

from qdrant_client.models import VectorParams, Distance, PointStruct
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from sklearn.decomposition import PCA
from dataclasses import dataclass
from dotenv import load_dotenv
from typing import List, Dict
import numpy as np
import pandas as pd
import tiktoken
import semchunk
import uuid
import json
import time
import os
import torch
from torch import nn 

# Load environment variables from the project's .env file and echo the
# boolean that load_dotenv returns.
print(load_dotenv(dotenv_path="/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# Hugging Face access token taken from the environment (None when unset).
hf_token = os.environ.get("HF_API_KEY")


class EmbedReducer(nn.Module):
    """Single linear layer mapping vectors of size ``input_dim`` to ``output_dim``."""

    def __init__(self, input_dim=768, output_dim=512):
        super().__init__()
        # The attribute stays named ``linear`` so state_dict keys do not change.
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return ``x`` after the linear projection."""
        projected = self.linear(x)
        return projected


# Konfigürasyon
@dataclass
class Config:
    """Settings for the chunking, embedding and Qdrant upload pipeline."""

    # Google EmbeddingGemma settings
    MODEL_NAME: str = "google/embeddinggemma-300m"
    # Value of the module-level ``hf_token`` at class-definition time
    # (None when HF_API_KEY is not set in the environment).
    HF_TOKEN: str = hf_token

    # SemChunk settings: chunk size passed to semchunk.chunkerify and the
    # tiktoken encoding name used to build the chunker.
    TOKEN_SIZE: int = 512
    ENCODING_NAME: str = "cl100k_base"

    # Qdrant settings
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "gemma_semantic_chunks"
    DIMENSION: int = 512  # vector size used when the collection is created

    # File settings
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/yargitay_cleaned_2025-08-21.csv"
    # NOTE(review): BATCH_SIZE is not read by the code visible in this cell;
    # create_embeddings uses its own default of 100 — confirm intent.
    BATCH_SIZE: int = 100
    # Number of points sent per Qdrant upsert call.
    DB_BATCH_SIZE: int = 256
 
    
class YargitaySemanticProcessor:
    """Semantic chunking and vector search for Yargıtay decisions.

    Splits decision texts into token-bounded chunks with SemChunk, embeds
    them with a SentenceTransformer model, projects the embeddings to
    ``config.DIMENSION`` with ``EmbedReducer`` and stores/searches the
    resulting vectors in Qdrant.
    """

    # Seed used to initialise the projection layer. The layer is never trained
    # or saved in this file, so without a fixed seed every process start would
    # draw a different random projection and a query could not be compared
    # with vectors indexed by an earlier run.
    # NOTE(review): this assumes torch's CPU initialisation is reproducible for
    # a given seed and torch version — confirm before upgrading torch.
    REDUCER_SEED = 0

    def __init__(self, config: "Config"):
        """Build the chunker, embedding model, projection layer and Qdrant client.

        Args:
            config: Pipeline settings (model name, token size, Qdrant URL, ...).
        """
        self.config = config

        # SemChunk chunker driven by the tiktoken encoding.
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # SentenceTransformer embedding model.
        self.model = SentenceTransformer(
            config.MODEL_NAME,
            token=config.HF_TOKEN
        )

        # Use the GPU when one is present, otherwise fall back to CPU instead
        # of failing on a hard-coded 'cuda:0'.
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

        # One shared, deterministically initialised reducer. Its output size
        # follows config.DIMENSION so it always matches the collection size.
        # fork_rng keeps the caller's global RNG state untouched.
        with torch.random.fork_rng(devices=[]):
            torch.manual_seed(self.REDUCER_SEED)
            reducer = EmbedReducer(768, config.DIMENSION)
        self.reducer = reducer.to(self.device).eval()
        # Inference only: no gradients are ever needed for the projection.
        self.reducer.requires_grad_(False)

        # Qdrant client
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        print(f"✅ SemChunk hazır (Token boyutu: {config.TOKEN_SIZE})")
        print(f"✅ Google EmbeddingGemma modeli hazır")
        print(f"✅ Qdrant client hazır ({config.QDRANT_URL})")

    def _project(self, embeddings):
        """Run ``embeddings`` through the shared reducer without autograd."""
        weight = next(self.reducer.parameters())
        with torch.no_grad():
            # Align device and dtype with the reducer, wherever the model
            # produced the embeddings.
            aligned = embeddings.detach().to(device=weight.device, dtype=weight.dtype)
            return self.reducer(aligned)

    def create_qdrant_collection(self, recreate: bool = False):
        """Create the configured collection if it does not exist yet.

        Args:
            recreate: When True, try to delete the collection first.

        Raises:
            Exception: Whatever the Qdrant client raises while listing or
                creating collections (re-raised after being printed).
        """
        collection_name = self.config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except Exception as e:
                # Deletion stays best-effort, but the reason is reported
                # instead of being swallowed silently.
                print(f"⚠️ Koleksiyon silinemedi ({collection_name}): {e}")

        try:
            collections = self.qdrant_client.get_collections().collections
            collection_names = [c.name for c in collections]

            if collection_name not in collection_names:
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=VectorParams(
                        size=self.config.DIMENSION,
                        distance=Distance.COSINE
                    )
                )
                print(f"✅ Koleksiyon oluşturuldu: {collection_name}")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {collection_name}")

        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Split ``text`` into chunks and attach ``metadata`` to each one.

        Args:
            text: Text to split; empty or whitespace-only text yields ``[]``.
            metadata: Optional extra keys merged into every chunk dict.

        Returns:
            One dict per non-blank chunk with ``chunk_id`` (position in the
            chunker output), ``text`` (stripped), ``token_count`` and
            ``char_count`` (both measured on the stripped text that is stored).
        """
        if not text or text.strip() == "":
            return []

        result_chunks = []
        for i, raw_chunk in enumerate(self.chunker(text)):
            chunk_text = raw_chunk.strip()
            if not chunk_text:
                continue
            chunk_data = {
                'chunk_id': i,
                'text': chunk_text,
                'token_count': len(self.encoding.encode(chunk_text)),
                'char_count': len(chunk_text),
            }
            if metadata:
                chunk_data.update(metadata)
            result_chunks.append(chunk_data)
        return result_chunks

    def create_embeddings(self, texts: List[str], batch_size: int = 100, target_dim: int = 512) -> List[List[float]]:
        """Embed ``texts`` in batches and project them with the shared reducer.

        Args:
            texts: Texts to embed.
            batch_size: Number of texts encoded per model call.
            target_dim: Unused; kept for backward compatibility. The output
                size is determined by the reducer.

        Returns:
            One list of floats per input text, in input order.
        """
        all_embeddings = []
        total = len(texts)

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            start_embed = time.time()

            # Encode the batch, then reduce its dimensionality.
            batch_embeddings = self.model.encode(batch_texts, show_progress_bar=True, convert_to_tensor=True)
            reduced_vector = self._project(batch_embeddings)
            print("*" * 40)
            print(f"Reduced vector shape: {reduced_vector.shape}")
            print("*" * 40)

            # Move to CPU and convert to plain Python lists.
            all_embeddings.extend(reduced_vector.cpu().tolist())

            end_embed = time.time()
            print(f"Batch embedding süresi: {end_embed - start_embed:.2f} saniye")
            print(f"  🔹 Embedding oluşturuldu: {i + len(batch_texts)}/{total}")

        return all_embeddings

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True for ``None`` and scalar NaN/NA cells."""
        return value is None or (pd.api.types.is_scalar(value) and bool(pd.isna(value)))

    @classmethod
    def _clean_cell(cls, value):
        """Replace a missing cell with ``''`` so payloads never carry NaN."""
        return '' if cls._is_missing(value) else value

    @classmethod
    def _row_text(cls, row) -> str:
        """Return the first non-empty text among ``rawText`` and ``text``.

        NaN is truthy, so a plain ``a or b`` would never fall back to ``text``
        when ``rawText`` is NaN; each candidate is checked explicitly instead.
        """
        for column in ('rawText', 'text'):
            value = row.get(column)
            if cls._is_missing(value):
                continue
            text = str(value)
            if text.strip():
                return text
        return ''

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """Read the CSV at ``csv_path`` and chunk every row that has text.

        Returns:
            All chunk dicts, or ``[]`` when the file cannot be read.
        """
        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return []

        all_chunks = []
        total_rows = len(df)
        print(f"📄 Toplam {total_rows} satır işlenecek")

        # ``position`` counts rows independently of the index labels.
        for position, (idx, row) in enumerate(df.iterrows(), start=1):
            text = self._row_text(row)
            if text:
                metadata = {
                    'original_index': idx,
                    'esas_no': self._clean_cell(row.get('esasNo', '')),
                    'karar_no': self._clean_cell(row.get('kararNo', '')),
                    'daire': self._clean_cell(row.get('location', '')),
                    'tarih': self._clean_cell(row.get('extractedDates', ''))
                }
                all_chunks.extend(self.semantic_chunk_text(text, metadata))

            if position % 50 == 0 or position == total_rows:
                print(f"  ✅ İşlenen satır: {position}/{total_rows} (Toplam chunk: {len(all_chunks)})")

        print(f"🧩 Toplam {len(all_chunks)} chunk oluşturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        """Embed ``chunks`` and upsert them into the configured collection.

        A failing batch is reported and skipped; the final summary states how
        many points were actually uploaded.
        """
        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return

        texts = [chunk['text'] for chunk in chunks]
        print("🔮 Embedding'ler oluşturuluyor...")
        embeddings = self.create_embeddings(texts, batch_size=self.config.BATCH_SIZE)

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload=chunk
            ) for chunk, embedding in zip(chunks, embeddings)
        ]

        batch_size = self.config.DB_BATCH_SIZE
        total_points = len(points)
        uploaded = 0
        print(f"🚀 {total_points} chunk Qdrant'a yükleniyor ({batch_size} batch size)")

        for i in range(0, total_points, batch_size):
            batch = points[i:i + batch_size]
            try:
                start_upload = time.time()

                self.qdrant_client.upsert(
                    collection_name=self.config.COLLECTION_NAME,
                    points=batch
                )

                end_upload = time.time()
                uploaded += len(batch)
                print(f"batch Qdrant yükleme süresi: {end_upload - start_upload:.2f} saniye")
                print(f"  ✅ Batch yüklendi: {min(i + batch_size, total_points)}/{total_points}")
            except Exception as e:
                print(f"❌ Batch yükleme hatası: {e}")

        if uploaded == total_points:
            print(f"🎉 {total_points} chunk Qdrant'a yüklendi!")
        else:
            # Do not claim success for batches that failed above.
            print(f"⚠️ {uploaded}/{total_points} chunk yüklendi, {total_points - uploaded} chunk yüklenemedi")

    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = 0.7) -> List[Dict]:
        """Search the collection for chunks similar to ``query``.

        The query is embedded and projected with the same reducer that is used
        for indexing.

        Returns:
            A list of ``{'score': ..., 'payload': ...}`` dicts.
        """
        query_embedding = self.model.encode([query], convert_to_tensor=True)
        reduced_query_embedding = self._project(query_embedding)
        query_vector = reduced_query_embedding[0].cpu().tolist()

        print(f"🔍 Query vector boyutu: {len(query_vector)} (hedef: {self.config.DIMENSION})")

        client = self.qdrant_client
        if hasattr(client, "query_points"):
            # ``search`` is deprecated in recent qdrant-client releases;
            # ``query_points`` is its replacement.
            search_results = client.query_points(
                collection_name=self.config.COLLECTION_NAME,
                query=query_vector,
                limit=limit,
                score_threshold=score_threshold
            ).points
        else:
            search_results = client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vector,
                limit=limit,
                score_threshold=score_threshold
            )

        return [{'score': p.score, 'payload': p.payload} for p in search_results]

    def get_collection_info(self) -> dict:
        """Return name, point count, vector count and status of the collection.

        Returns:
            The info dict, or ``{"error": message}`` when the lookup fails.
        """
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                # Read defensively so a response object without this field
                # does not turn the whole lookup into an error.
                "vectors_count": getattr(info, "vectors_count", None),
                "status": info.status
            }
        except Exception as e:
            return {"error": str(e)}


# Pipeline sınıfı
class YargitayPipeline:
    """Drives the end-to-end indexing run and the interactive search prompt."""

    def __init__(self, config: "Config"):
        """Create the processor for ``config`` and keep the config itself."""
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    @staticmethod
    def _preview(text, max_chars: int = 200) -> str:
        """Return ``text`` cut to ``max_chars``; "..." is added only when cut."""
        text = text or ''
        if len(text) > max_chars:
            return text[:max_chars] + "..."
        return text

    def full_pipeline(self, csv_path: str = None):
        """Chunk the CSV, rebuild the collection and upload all chunks.

        Args:
            csv_path: CSV to process; defaults to ``config.CSV_FILE``.

        Returns:
            False when the CSV produced no chunks (the existing collection is
            left untouched in that case), True otherwise.
        """
        csv_path = csv_path or self.config.CSV_FILE
        print("🚀 Yargıtay Semantic Pipeline Başlıyor")

        total_start = time.time()  # start of the total timing

        # Read and chunk the CSV first: the collection is only dropped once
        # there is data to replace it with, so a bad path cannot wipe an
        # existing index.
        chunk_start = time.time()
        chunks = self.processor.process_csv_file(csv_path)
        chunk_end = time.time()

        if not chunks:
            print("❌ İşlenecek chunk bulunamadı")
            return False

        # (Re)create the collection.
        self.processor.create_qdrant_collection(recreate=True)

        # Create embeddings and upload them to Qdrant.
        upload_start = time.time()
        self.processor.upload_to_qdrant(chunks)
        upload_end = time.time()

        total_end = time.time()  # end of the total timing

        # Summary statistics.
        info = self.processor.get_collection_info()
        print("\n📊 Pipeline Süreleri ve İstatistikler:")
        print(json.dumps({
            "collection_name": self.config.COLLECTION_NAME,
            "points_uploaded": info.get("points_count", 0),
            "chunk_creation_time_s": round(chunk_end - chunk_start, 2),
            "embedding_and_upload_time_s": round(upload_end - upload_start, 2),
            "total_pipeline_time_s": round(total_end - total_start, 2)
        }, indent=2, ensure_ascii=False))

        return True

    def interactive_search(self):
        """Prompt for queries on stdin and print the matching chunks.

        The loop ends when the user enters ``q``, ``quit`` or ``exit``.
        """
        print("🔍 İnteraktif Arama Başlatıldı")

        # Show the collection state before the first query.
        info = self.processor.get_collection_info()
        print(f"📊 Koleksiyon Durumu: {json.dumps(info, indent=2, ensure_ascii=False)}")

        while True:
            query = input("\n🔍 Arama metni (çıkmak için 'q'): ").strip()
            if query.lower() in ['q', 'quit', 'exit']:
                break

            if not query:
                print("❌ Boş sorgu, tekrar deneyin")
                continue

            limit_input = input("Kaç sonuç? (default 5): ").strip()
            try:
                limit = int(limit_input) if limit_input else 5
                limit = max(1, min(limit, 50))  # clamp to 1-50
            except ValueError:
                print("❌ Geçersiz sayı, varsayılan 5 kullanılıyor")
                limit = 5

            score_input = input("Minimum score? (default 0.7): ").strip()
            try:
                score_threshold = float(score_input) if score_input else 0.7
                score_threshold = max(0.0, min(score_threshold, 1.0))  # clamp to 0-1
            except ValueError:
                print("❌ Geçersiz score, varsayılan 0.7 kullanılıyor")
                score_threshold = 0.7

            print(f"\n{'='*60}")
            results = self.processor.search_semantic(query, limit=limit, score_threshold=score_threshold)

            if not results:
                print("❌ Hiç sonuç bulunamadı. Score threshold'u düşürmeyi deneyin.")
            else:
                print(f"\n📋 {len(results)} Sonuç Bulundu:")
                for i, r in enumerate(results, 1):
                    # A hit may carry no payload; treat that as an empty one.
                    payload = r['payload'] or {}
                    text_preview = self._preview(payload.get('text', ''))
                    print(f"\n{i}. 📊 Score: {r['score']:.4f}")
                    print(f"   📋 Esas No: {payload.get('esas_no', 'N/A')}")
                    print(f"   📋 Karar No: {payload.get('karar_no', 'N/A')}")
                    print(f"   📋 Daire: {payload.get('daire', 'N/A')}")
                    print(f"   📋 Tarih: {payload.get('tarih', 'N/A')}")
                    print(f"   📄 Metin: {text_preview}")
                    print(f"   {'─'*50}")

            print(f"{'='*60}")



# Main fonksiyon
def main():
    """Run the interactive console menu for the pipeline.

    Options 4 and 5 rely on optional processor methods
    (``debug_search_issue`` / ``fix_dimension_mismatch``). When the processor
    does not provide them, a message is printed instead of crashing with an
    AttributeError.
    """
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/yargitay_cleaned_2025-08-21.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="google_embeddinggemma_chunks",
        DIMENSION=512
    )

    pipeline = YargitayPipeline(config)

    while True:
        print("\n" + "="*60)
        print("1. Full pipeline çalıştır")
        print("2. Arama yap")
        print("3. Koleksiyon bilgisi")
        print("4. 🔧 Debug arama sorunu")
        print("5. 🗑️ Dimension uyumsuzluğunu düzelt")
        print("6. Çıkış")
        print("="*60)

        choice = input("Seçim: ").strip()

        if choice == "1":
            csv_path = input(f"CSV dosya yolu (Enter: {config.CSV_FILE}): ").strip() or config.CSV_FILE
            pipeline.full_pipeline(csv_path)

        elif choice == "2":
            pipeline.interactive_search()

        elif choice == "3":
            info = pipeline.processor.get_collection_info()
            print("\n📊 Koleksiyon Bilgileri:")
            print(json.dumps(info, indent=2, ensure_ascii=False))

        elif choice == "4":
            # Look the method up first so a processor without it does not
            # raise AttributeError after the user already typed a query.
            debug_search = getattr(pipeline.processor, "debug_search_issue", None)
            if callable(debug_search):
                query = input("Debug için test query (Enter: 'test'): ").strip() or "test"
                debug_search(query)
            else:
                print("❌ Debug araması bu sürümde mevcut değil")

        elif choice == "5":
            fix_dimension = getattr(pipeline.processor, "fix_dimension_mismatch", None)
            if not callable(fix_dimension):
                print("❌ Dimension düzeltme bu sürümde mevcut değil")
            elif fix_dimension():
                print("✅ Koleksiyon sıfırlandı. Şimdi '1' seçeneği ile veriyi yeniden yükleyin.")
            else:
                print("❌ İşlem iptal edildi")

        elif choice == "6":
            break
        else:
            print("❌ Geçersiz seçim")


# Start the interactive menu only when this code is executed as a script.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


True
✅ SemChunk hazır (Token boyutu: 512)
✅ Google EmbeddingGemma modeli hazır
✅ Qdrant client hazır (http://localhost:6333)

1. Full pipeline çalıştır
2. Arama yap
3. Koleksiyon bilgisi
4. Çıkış
🔍 İnteraktif Arama Başlatıldı
📊 Koleksiyon Durumu: {
  "collection_name": "google_embeddinggemma_chunks",
  "points_count": 0,
  "vectors_count": null,
  "status": "green"
}

🔍 Query vector boyutu: 512 (hedef: 512)
❌ Hiç sonuç bulunamadı. Score threshold'u düşürmeyi deneyin.


  search_results = self.qdrant_client.search(



🔍 Query vector boyutu: 512 (hedef: 512)
❌ Hiç sonuç bulunamadı. Score threshold'u düşürmeyi deneyin.

🔍 Query vector boyutu: 512 (hedef: 512)
❌ Hiç sonuç bulunamadı. Score threshold'u düşürmeyi deneyin.

1. Full pipeline çalıştır
2. Arama yap
3. Koleksiyon bilgisi
4. Çıkış


### AAAAAAAAAAAAAAAAAAA

In [None]:
# SemChunk + Google EmbeddingGemma + Qdrant Entegrasyon
# Yargıtay Kararları için Semantic Chunking Pipeline

from qdrant_client.models import VectorParams, Distance, PointStruct
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from sklearn.decomposition import PCA
from dataclasses import dataclass
from dotenv import load_dotenv
from typing import List, Dict
import numpy as np
import pandas as pd
import tiktoken
import semchunk
import uuid
import json
import time
import os
from torch import nn 

# Load environment variables from the project's .env file and echo the
# boolean that load_dotenv returns.
print(load_dotenv(dotenv_path="/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# Hugging Face access token taken from the environment (None when unset).
hf_token = os.environ.get("HF_API_KEY")


class EmbedReducer(nn.Module):
    """Single linear layer mapping vectors of size ``input_dim`` to ``output_dim``."""

    def __init__(self, input_dim=768, output_dim=512):
        super().__init__()
        # The attribute stays named ``linear`` so state_dict keys do not change.
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return ``x`` after the linear projection."""
        projected = self.linear(x)
        return projected


# Konfigürasyon
@dataclass
class Config:
    """Settings for the chunking, embedding and Qdrant upload pipeline."""

    # Google EmbeddingGemma settings
    MODEL_NAME: str = "google/embeddinggemma-300m"
    # Value of the module-level ``hf_token`` at class-definition time
    # (None when HF_API_KEY is not set in the environment).
    HF_TOKEN: str = hf_token

    # SemChunk settings: chunk size passed to semchunk.chunkerify and the
    # tiktoken encoding name used to build the chunker.
    TOKEN_SIZE: int = 512
    ENCODING_NAME: str = "cl100k_base"

    # Qdrant settings
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "gemma_semantic_chunks"
    DIMENSION: int = 512  # vector size used when the collection is created

    # File settings
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/yargitay_cleaned_2025-08-21.csv"
    BATCH_SIZE: int = 100
    # Annotated so it is a real dataclass field: without the annotation it was
    # a plain class attribute and ``Config(DB_BATCH_SIZE=...)`` raised
    # TypeError.
    DB_BATCH_SIZE: int = 256
 
    
    
class YargitaySemanticProcessor:
    """Semantic chunking and vector search for Yargıtay decisions.

    Splits decision texts into token-bounded chunks with SemChunk, embeds
    them with a SentenceTransformer model, projects the embeddings to
    ``config.DIMENSION`` with ``EmbedReducer`` and stores/searches the
    resulting vectors in Qdrant.
    """

    # Seed used to initialise the projection layer. The layer is never trained
    # or saved in this cell, so without a fixed seed every process start would
    # draw a different random projection and a query could not be compared
    # with vectors indexed by an earlier run.
    # NOTE(review): this assumes torch's CPU initialisation is reproducible for
    # a given seed and torch version — confirm before upgrading torch.
    REDUCER_SEED = 0

    def __init__(self, config: "Config"):
        """Build the chunker, embedding model, projection layer and Qdrant client.

        Args:
            config: Pipeline settings (model name, token size, Qdrant URL, ...).
        """
        import torch  # local import: this cell only imports torch.nn at module level

        self.config = config

        # SemChunk chunker driven by the tiktoken encoding.
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # SentenceTransformer embedding model.
        self.model = SentenceTransformer(
            config.MODEL_NAME,
            token=config.HF_TOKEN
        )

        # Use the GPU when one is present, otherwise fall back to CPU instead
        # of failing on a hard-coded 'cuda:0'.
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

        # One shared, deterministically initialised reducer. Its output size
        # follows config.DIMENSION so it always matches the collection size.
        # fork_rng keeps the caller's global RNG state untouched.
        with torch.random.fork_rng(devices=[]):
            torch.manual_seed(self.REDUCER_SEED)
            reducer = EmbedReducer(768, config.DIMENSION)
        self.reducer = reducer.to(self.device).eval()
        # Inference only: no gradients are ever needed for the projection.
        self.reducer.requires_grad_(False)

        # Qdrant client
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        print(f"✅ SemChunk hazır (Token boyutu: {config.TOKEN_SIZE})")
        print(f"✅ Google EmbeddingGemma modeli hazır")
        print(f"✅ Qdrant client hazır ({config.QDRANT_URL})")

    def _project(self, embeddings):
        """Run ``embeddings`` through the shared reducer without autograd."""
        import torch  # local import: this cell only imports torch.nn at module level

        weight = next(self.reducer.parameters())
        with torch.no_grad():
            # Align device and dtype with the reducer, wherever the model
            # produced the embeddings.
            aligned = embeddings.detach().to(device=weight.device, dtype=weight.dtype)
            return self.reducer(aligned)

    def create_qdrant_collection(self, recreate: bool = False):
        """Create the configured collection if it does not exist yet.

        Args:
            recreate: When True, try to delete the collection first.

        Raises:
            Exception: Whatever the Qdrant client raises while listing or
                creating collections (re-raised after being printed).
        """
        collection_name = self.config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except Exception as e:
                # Deletion stays best-effort, but the reason is reported
                # instead of being swallowed silently.
                print(f"⚠️ Koleksiyon silinemedi ({collection_name}): {e}")

        try:
            collections = self.qdrant_client.get_collections().collections
            collection_names = [c.name for c in collections]

            if collection_name not in collection_names:
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=VectorParams(
                        size=self.config.DIMENSION,
                        distance=Distance.COSINE
                    )
                )
                print(f"✅ Koleksiyon oluşturuldu: {collection_name}")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {collection_name}")

        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Split ``text`` into chunks and attach ``metadata`` to each one.

        Args:
            text: Text to split; empty or whitespace-only text yields ``[]``.
            metadata: Optional extra keys merged into every chunk dict.

        Returns:
            One dict per non-blank chunk with ``chunk_id`` (position in the
            chunker output), ``text`` (stripped), ``token_count`` and
            ``char_count`` (both measured on the stripped text that is stored).
        """
        if not text or text.strip() == "":
            return []

        result_chunks = []
        for i, raw_chunk in enumerate(self.chunker(text)):
            chunk_text = raw_chunk.strip()
            if not chunk_text:
                continue
            chunk_data = {
                'chunk_id': i,
                'text': chunk_text,
                'token_count': len(self.encoding.encode(chunk_text)),
                'char_count': len(chunk_text),
            }
            if metadata:
                chunk_data.update(metadata)
            result_chunks.append(chunk_data)
        return result_chunks

    def create_embeddings(self, texts: List[str], batch_size: int = 100, target_dim: int=512) -> List[List[float]]:
        """Embed ``texts`` in batches and project them with the shared reducer.

        Every batch goes through ``self.reducer`` — the same layer that
        ``search_semantic`` uses — so indexed vectors and query vectors live
        in the same space. (Creating a fresh random reducer per batch put
        every batch in its own, unrelated space.)

        Args:
            texts: Texts to embed.
            batch_size: Number of texts encoded per model call.
            target_dim: Unused; kept for backward compatibility. The output
                size is determined by the reducer.

        Returns:
            One list of floats per input text, in input order.
        """
        all_embeddings = []
        total = len(texts)
        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            start_embed = time.time()

            batch_embeddings = self.model.encode(batch_texts, show_progress_bar=True, convert_to_tensor=True)
            reduced_vector = self._project(batch_embeddings)
            print("*"*40)
            print(reduced_vector.shape)
            print("*"*40)
            all_embeddings.extend(reduced_vector.cpu().tolist())

            end_embed = time.time()
            print(f"Batch embedding süresi: {end_embed - start_embed:.2f} saniye")
            print(f"  🔹 Embedding oluşturuldu: {i + len(batch_texts)}/{total}")

        return all_embeddings

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True for ``None`` and scalar NaN/NA cells."""
        return value is None or (pd.api.types.is_scalar(value) and bool(pd.isna(value)))

    @classmethod
    def _clean_cell(cls, value):
        """Replace a missing cell with ``''`` so payloads never carry NaN."""
        return '' if cls._is_missing(value) else value

    @classmethod
    def _row_text(cls, row) -> str:
        """Return the first non-empty text among ``rawText`` and ``text``.

        NaN is truthy, so a plain ``a or b`` would never fall back to ``text``
        when ``rawText`` is NaN; each candidate is checked explicitly instead.
        """
        for column in ('rawText', 'text'):
            value = row.get(column)
            if cls._is_missing(value):
                continue
            text = str(value)
            if text.strip():
                return text
        return ''

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """Read the CSV at ``csv_path`` and chunk every row that has text.

        Returns:
            All chunk dicts, or ``[]`` when the file cannot be read.
        """
        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return []

        all_chunks = []
        total_rows = len(df)
        print(f"📄 Toplam {total_rows} satır işlenecek")

        # ``position`` counts rows independently of the index labels.
        for position, (idx, row) in enumerate(df.iterrows(), start=1):
            text = self._row_text(row)
            if text:
                metadata = {
                    'original_index': idx,
                    'esas_no': self._clean_cell(row.get('esasNo', '')),
                    'karar_no': self._clean_cell(row.get('kararNo', '')),
                    'daire': self._clean_cell(row.get('location', '')),
                    'tarih': self._clean_cell(row.get('extractedDates', ''))
                }
                all_chunks.extend(self.semantic_chunk_text(text, metadata))

            if position % 50 == 0 or position == total_rows:
                print(f"  ✅ İşlenen satır: {position}/{total_rows} (Toplam chunk: {len(all_chunks)})")

        print(f"🧩 Toplam {len(all_chunks)} chunk oluşturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        """Embed ``chunks`` and upsert them into the configured collection.

        A failing batch is reported and skipped; the final summary states how
        many points were actually uploaded.
        """
        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return

        texts = [chunk['text'] for chunk in chunks]
        print("🔮 Embedding'ler oluşturuluyor...")
        embeddings = self.create_embeddings(texts, batch_size=self.config.BATCH_SIZE)

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload=chunk
            ) for chunk, embedding in zip(chunks, embeddings)
        ]

        batch_size = self.config.DB_BATCH_SIZE
        total_points = len(points)
        uploaded = 0
        print(f"🚀 {total_points} chunk Qdrant'a yükleniyor ({batch_size} batch size)")

        for i in range(0, total_points, batch_size):
            batch = points[i:i + batch_size]
            try:
                start_upload = time.time()

                self.qdrant_client.upsert(
                    collection_name=self.config.COLLECTION_NAME,
                    points=batch
                )

                end_upload = time.time()
                uploaded += len(batch)
                print(f"batch Qdrant yükleme süresi: {end_upload - start_upload:.2f} saniye")

                print(f"  ✅ Batch yüklendi: {min(i + batch_size, total_points)}/{total_points}")
            except Exception as e:
                print(f"❌ Batch yükleme hatası: {e}")

        if uploaded == total_points:
            print(f"🎉 {total_points} chunk Qdrant'a yüklendi!")
        else:
            # Do not claim success for batches that failed above.
            print(f"⚠️ {uploaded}/{total_points} chunk yüklendi, {total_points - uploaded} chunk yüklenemedi")

    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = 0.7) -> List[Dict]:
        """Search the collection for chunks similar to ``query``.

        The query is embedded and projected with the same reducer that is used
        for indexing.

        Returns:
            A list of ``{'score': ..., 'payload': ...}`` dicts.
        """
        query_embedding = self.model.encode([query], convert_to_tensor=True)
        query_vector = self._project(query_embedding)[0].cpu().tolist()

        client = self.qdrant_client
        if hasattr(client, "query_points"):
            # ``search`` is deprecated in recent qdrant-client releases;
            # ``query_points`` is its replacement.
            search_results = client.query_points(
                collection_name=self.config.COLLECTION_NAME,
                query=query_vector,
                limit=limit,
                score_threshold=score_threshold
            ).points
        else:
            search_results = client.search(
                collection_name=self.config.COLLECTION_NAME,
                query_vector=query_vector,
                limit=limit,
                score_threshold=score_threshold
            )

        return [{'score': p.score, 'payload': p.payload} for p in search_results]

    def get_collection_info(self) -> dict:
        """Return name, point count, vector count and status of the collection.

        Returns:
            The info dict, or ``{"error": message}`` when the lookup fails.
        """
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                # Read defensively so a response object without this field
                # does not turn the whole lookup into an error.
                "vectors_count": getattr(info, "vectors_count", None),
                "status": info.status
            }
        except Exception as e:
            return {"error": str(e)}

# Pipeline sınıfı
class YargitayPipeline:
    """Drives the end-to-end indexing run and the interactive search prompt."""

    def __init__(self, config: "Config"):
        """Create the processor for ``config`` and keep the config itself."""
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    @staticmethod
    def _preview(text, max_chars: int = 300) -> str:
        """Return ``text`` cut to ``max_chars``; "..." is added only when cut."""
        text = text or ''
        if len(text) > max_chars:
            return text[:max_chars] + "..."
        return text

    def full_pipeline(self, csv_path: str = None):
        """Chunk the CSV, rebuild the collection and upload all chunks.

        Args:
            csv_path: CSV to process; defaults to ``config.CSV_FILE``.

        Returns:
            False when the CSV produced no chunks (the existing collection is
            left untouched in that case), True otherwise.
        """
        csv_path = csv_path or self.config.CSV_FILE
        print("🚀 Yargıtay Semantic Pipeline Başlıyor")

        total_start = time.time()  # start of the total timing

        # Read and chunk the CSV first: the collection is only dropped once
        # there is data to replace it with, so a bad path cannot wipe an
        # existing index.
        chunk_start = time.time()
        chunks = self.processor.process_csv_file(csv_path)
        chunk_end = time.time()

        if not chunks:
            print("❌ İşlenecek chunk bulunamadı")
            return False

        # (Re)create the collection.
        self.processor.create_qdrant_collection(recreate=True)

        # Create embeddings and upload them to Qdrant.
        upload_start = time.time()
        self.processor.upload_to_qdrant(chunks)
        upload_end = time.time()

        total_end = time.time()  # end of the total timing

        # Summary statistics.
        info = self.processor.get_collection_info()
        print("\n📊 Pipeline Süreleri ve İstatistikler:")
        print(json.dumps({
            "collection_name": self.config.COLLECTION_NAME,
            "points_uploaded": info.get("points_count", 0),
            "chunk_creation_time_s": round(chunk_end - chunk_start, 2),
            "embedding_and_upload_time_s": round(upload_end - upload_start, 2),
            "total_pipeline_time_s": round(total_end - total_start, 2)
        }, indent=2, ensure_ascii=False))

        return True

    def interactive_search(self):
        """Prompt for queries on stdin and print the matching chunks.

        The loop ends when the user enters ``q``, ``quit`` or ``exit``.
        """
        while True:
            query = input("🔍 Arama metni (çıkmak için 'q'): ").strip()
            if query.lower() in ['q', 'quit', 'exit']:
                break

            # Do not send an empty query to the embedding model.
            if not query:
                print("❌ Boş sorgu, tekrar deneyin")
                continue

            limit_input = input("Kaç sonuç? (default 5): ").strip()
            try:
                limit = int(limit_input) if limit_input else 5
            except ValueError:
                print("❌ Geçersiz sayı, varsayılan 5 kullanılıyor")
                limit = 5

            results = self.processor.search_semantic(query, limit=limit)
            if not results:
                print("❌ Hiç sonuç bulunamadı.")
                continue

            for i, r in enumerate(results, 1):
                # A hit may carry no payload; treat that as an empty one.
                payload = r['payload'] or {}
                text_preview = self._preview(payload.get('text', ''))
                print(f"\n{i}. Score: {r['score']:.3f}, Esas No: {payload.get('esas_no')}, Metin: {text_preview}")

# Main fonksiyon
def main():
    """Run the console menu: index the CSV, search, show collection info or quit."""
    settings = {
        "CSV_FILE": "/home/yapayzeka/ahsen_bulbul/data/yargitay_cleaned_2025-08-21.csv",
        "TOKEN_SIZE": 512,
        "QDRANT_URL": "http://localhost:6333",
        "COLLECTION_NAME": "google_embeddinggemma_chunks",
        "DIMENSION": 512,
    }
    config = Config(**settings)
    pipeline = YargitayPipeline(config)

    menu = "\n1. Full pipeline çalıştır\n2. Arama yap\n3. Koleksiyon bilgisi\n4. Çıkış"
    while True:
        print(menu)
        choice = input("Seçim: ")

        # "4" leaves the menu loop.
        if choice == "4":
            break

        if choice == "1":
            # An empty answer falls back to the configured CSV path.
            typed_path = input(f"CSV dosya yolu (Enter: {config.CSV_FILE}): ").strip()
            pipeline.full_pipeline(typed_path if typed_path else config.CSV_FILE)
            continue

        if choice == "2":
            pipeline.interactive_search()
            continue

        if choice == "3":
            collection_info = pipeline.processor.get_collection_info()
            print(json.dumps(collection_info, indent=2, ensure_ascii=False))
            continue

        print("❌ Geçersiz seçim")

# Start the interactive menu only when this code is executed as a script.
if __name__ == "__main__":
    main()


In [None]:
import torch.nn as nn

class EmbedReducer(nn.Module):
    """Single linear layer mapping vectors of size ``input_dim`` to ``output_dim``."""

    def __init__(self, input_dim=768, output_dim=512):
        super().__init__()
        # The attribute stays named ``linear`` so state_dict keys do not change.
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return ``x`` after the linear projection."""
        projected = self.linear(x)
        return projected

# Scratch experiment: embed some texts and push them through a freshly
# initialised EmbedReducer to inspect the reduced output.
# NOTE(review): `SentenceTransformer`, `device` and `texts` are not defined in
# this cell; presumably they come from earlier notebook state — confirm.

# Create the reducer on the first CUDA device.
reducer = EmbedReducer(768, 512).to('cuda:0')

# Embed the texts and move the result to the same CUDA device.
model = SentenceTransformer("google/embeddinggemma-300m", device=device)
dense_vector = model.encode(texts, convert_to_tensor=True).to('cuda:0')
# Take an independent copy of the embeddings and mark it as requiring gradients.
dense_vector = dense_vector.clone().detach().requires_grad_(True) 

print(reducer)

# Apply the linear projection.
reduced_vector = reducer(dense_vector)

print(reduced_vector.shape)
print(reduced_vector)


In [None]:
# SemChunk + Google EmbeddingGemma + Qdrant Entegrasyon
# Yargıtay Kararları için Semantic Chunking Pipeline

from qdrant_client.models import VectorParams, Distance, PointStruct
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from dataclasses import dataclass
from dotenv import load_dotenv
from typing import List, Dict
import numpy as np
import pandas as pd
import tiktoken
import semchunk
import uuid
import json
import time
import os
import torch
from torch import nn

# ------------------------ ENV ------------------------
# Load environment variables from the project's .env file and echo the
# boolean that load_dotenv returns.
print(load_dotenv(dotenv_path="/home/yapayzeka/ahsen_bulbul/qdrant/.env"))
# Hugging Face access token taken from the environment (None when unset).
hf_token = os.environ.get("HF_API_KEY")

# ------------------------ EmbedReducer ------------------------
class EmbedReducer(nn.Module):
    """Single linear layer mapping vectors of size ``input_dim`` to ``output_dim``."""

    def __init__(self, input_dim=768, output_dim=512):
        super().__init__()
        # The attribute stays named ``linear`` so state_dict keys do not change.
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return ``x`` after the linear projection."""
        projected = self.linear(x)
        return projected

# ------------------------ Config ------------------------
@dataclass
class Config:
    """Settings for the chunking, embedding and Qdrant pipeline."""

    # Embedding model name and the Hugging Face token used to load it
    # (``hf_token`` is None when HF_API_KEY is not set in the environment).
    MODEL_NAME: str = "google/embeddinggemma-300m"
    HF_TOKEN: str = hf_token
    # Chunk size passed to semchunk.chunkerify and the tiktoken encoding name.
    TOKEN_SIZE: int = 512
    ENCODING_NAME: str = "cl100k_base"
    # Qdrant endpoint, collection name and vector size; DIMENSION is also the
    # output size of the EmbedReducer built by the processor.
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "gemma_semantic_chunks"
    DIMENSION: int = 512
    # Input CSV path and batch sizes.
    # NOTE(review): BATCH_SIZE / DB_BATCH_SIZE are not read in the part of this
    # cell that is visible — presumably embedding and upsert batch sizes; confirm.
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/yargitay_cleaned_2025-08-21.csv"
    BATCH_SIZE: int = 100
    DB_BATCH_SIZE: int = 256

# ------------------------ Processor ------------------------
class YargitaySemanticProcessor:
    """Semantic chunking, embedding and vector search for Yargıtay decisions.

    Text is split with SemChunk, embedded with a SentenceTransformer model,
    projected to ``config.DIMENSION`` by a linear ``EmbedReducer`` and stored
    in / searched from a Qdrant collection.
    """

    # Output width of the embedding model that feeds the reducer.
    EMBEDDING_DIM = 768
    # Fixed seed for the reducer weights. The reducer is never trained or
    # saved, so without a fixed seed every process would project with
    # different random weights and vectors stored by one run could not be
    # matched by queries issued from another run.
    REDUCER_SEED = 0

    def __init__(self, config: "Config"):
        """Build the chunker, embedding model, reducer and Qdrant client.

        Args:
            config: Pipeline settings (model, chunking, Qdrant, batch sizes).
        """
        self.config = config
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # Use the first GPU when present, otherwise fall back to CPU instead
        # of crashing on a hard-coded 'cuda:0'.
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

        self.model = SentenceTransformer(
            config.MODEL_NAME,
            token=config.HF_TOKEN,
            device=self.device,
        )

        self.reducer = self._build_reducer(config.DIMENSION).to(self.device).eval()
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        print(f"✅ SemChunk hazır (Token boyutu: {config.TOKEN_SIZE})")
        print(f"✅ Google EmbeddingGemma modeli hazır")
        print(f"✅ Qdrant client hazır ({config.QDRANT_URL})")

    def _build_reducer(self, output_dim: int):
        """Create the reducer with weights that are identical on every run."""
        reducer = EmbedReducer(self.EMBEDDING_DIM, output_dim)
        # A private generator keeps the global RNG state untouched.
        generator = torch.Generator().manual_seed(self.REDUCER_SEED)
        # Same uniform range that nn.Linear uses for its default init.
        bound = self.EMBEDDING_DIM ** -0.5
        with torch.no_grad():
            for param in reducer.parameters():
                param.uniform_(-bound, bound, generator=generator)
        return reducer

    # ------------------------ Qdrant ------------------------
    def create_qdrant_collection(self, recreate: bool = False):
        """Ensure the configured collection exists.

        Args:
            recreate: When True, try to delete the collection first.

        Raises:
            Exception: Whatever the Qdrant client raises while listing or
                creating collections (re-raised after being printed).
        """
        name = self.config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(name)
                print(f"🗑️ Eski koleksiyon silindi: {name}")
            except Exception as e:
                # Deletion stays best-effort, but the reason is no longer
                # hidden and KeyboardInterrupt is no longer swallowed.
                print(f"⚠️ Eski koleksiyon silinemedi ({name}): {e}")
        try:
            collections = [c.name for c in self.qdrant_client.get_collections().collections]
            if name not in collections:
                self.qdrant_client.create_collection(
                    collection_name=name,
                    vectors_config=VectorParams(
                        size=self.config.DIMENSION,
                        distance=Distance.COSINE
                    )
                )
                print(f"✅ Koleksiyon oluşturuldu: {name}")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {name}")
        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise

    # ------------------------ Chunking ------------------------
    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Split ``text`` into chunks and describe each one.

        Args:
            text: Raw text to split.
            metadata: Optional extra keys merged into every chunk dict.

        Returns:
            One dict per non-blank chunk with ``chunk_id`` (position in the
            chunker output), ``text`` (stripped), ``token_count`` and
            ``char_count`` (both measured on the unstripped chunk), plus the
            metadata keys. Empty list for empty or blank input.
        """
        if not text or not text.strip():
            return []
        result = []
        for i, chunk in enumerate(self.chunker(text)):
            stripped = chunk.strip()
            if not stripped:
                continue
            data = {
                'chunk_id': i,
                'text': stripped,
                'token_count': len(self.encoding.encode(chunk)),
                'char_count': len(chunk),
            }
            if metadata:
                data.update(metadata)
            result.append(data)
        return result

    # ------------------------ Embedding ------------------------
    def create_embeddings(self, texts: List[str], batch_size: int = 100) -> List[List[float]]:
        """Embed ``texts`` and project them with the reducer.

        Args:
            texts: Texts to embed.
            batch_size: Number of texts encoded per model call.

        Returns:
            One reduced vector (list of floats) per input text, in order.
        """
        all_embeddings = []
        total = len(texts)
        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            start_time = time.time()

            batch_embeddings = self.model.encode(
                batch_texts, show_progress_bar=True, convert_to_tensor=True
            ).to(self.device)

            with torch.no_grad():
                reduced_vectors = self.reducer(batch_embeddings).cpu().tolist()

            all_embeddings.extend(reduced_vectors)

            end_time = time.time()
            print(f"Batch embedding süresi: {end_time - start_time:.2f}s | {i+len(batch_texts)}/{total}")

        return all_embeddings

    # ------------------------ CSV ------------------------
    @staticmethod
    def _is_missing(value) -> bool:
        """Return True for None and for pandas/NumPy missing markers (NaN, NaT)."""
        if value is None:
            return True
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Non-scalar values are not a missing marker.
            return False

    @classmethod
    def _first_text(cls, row, columns):
        """Return the first non-missing, non-blank value among ``columns`` as str, else None."""
        for column in columns:
            value = row.get(column)
            if cls._is_missing(value):
                continue
            value = str(value)
            if value.strip():
                return value
        return None

    @classmethod
    def _clean_value(cls, value):
        """Map missing markers to '' so the payload never carries NaN."""
        return '' if cls._is_missing(value) else value

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """Read a CSV of decisions and chunk every row that has text.

        The text is taken from ``rawText`` and, when that is missing or blank,
        from ``text``.

        Args:
            csv_path: Path of the CSV file.

        Returns:
            All chunk dicts in row order; empty list when the file cannot be read.
        """
        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return []

        all_chunks = []
        total_rows = len(df)
        print(f"📄 Toplam {total_rows} satır işlenecek")

        # `position` drives the progress output so it stays correct even when
        # the index labels are not 0..n-1.
        for position, (idx, row) in enumerate(df.iterrows(), start=1):
            # NaN is truthy, so `a or b` would never fall back to `text`
            # when `rawText` is NaN; the columns are checked explicitly.
            text = self._first_text(row, ('rawText', 'text'))
            if text is not None:
                metadata = {
                    'original_index': idx,
                    'esas_no': self._clean_value(row.get('esasNo', '')),
                    'karar_no': self._clean_value(row.get('kararNo', '')),
                    'daire': self._clean_value(row.get('location', '')),
                    'tarih': self._clean_value(row.get('extractedDates', ''))
                }
                all_chunks.extend(self.semantic_chunk_text(text, metadata))

            if position % 50 == 0 or position == total_rows:
                print(f"  ✅ İşlenen satır: {position}/{total_rows} | Toplam chunk: {len(all_chunks)}")

        print(f"🧩 Toplam {len(all_chunks)} chunk oluşturuldu")
        return all_chunks

    # ------------------------ Upload ------------------------
    def upload_to_qdrant(self, chunks: List[Dict]):
        """Embed ``chunks`` and upsert them into the configured collection.

        Each chunk dict becomes the payload of one point with a random UUID.
        A failed batch is reported and skipped; the final message states how
        many points were actually sent successfully.

        Args:
            chunks: Chunk dicts containing at least a ``text`` key.
        """
        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return

        texts = [c['text'] for c in chunks]
        print("🔮 Embedding'ler oluşturuluyor...")
        embeddings = self.create_embeddings(texts, batch_size=self.config.BATCH_SIZE)

        points = [
            PointStruct(id=str(uuid.uuid4()), vector=e, payload=c)
            for c, e in zip(chunks, embeddings)
        ]

        batch_size = self.config.DB_BATCH_SIZE
        total_points = len(points)
        print(f"🚀 {total_points} chunk Qdrant'a yükleniyor ({batch_size} batch size)")

        uploaded = 0
        for i in range(0, total_points, batch_size):
            batch = points[i:i+batch_size]
            try:
                start_time = time.time()
                self.qdrant_client.upsert(
                    collection_name=self.config.COLLECTION_NAME,
                    points=batch
                )
                end_time = time.time()
                uploaded += len(batch)
                print(f"Batch yükleme süresi: {end_time - start_time:.2f}s | {min(i+batch_size, total_points)}/{total_points}")
            except Exception as e:
                print(f"❌ Batch yükleme hatası: {e}")

        if uploaded == total_points:
            print(f"🎉 {total_points} chunk Qdrant'a yüklendi!")
        else:
            print(f"⚠️ {uploaded}/{total_points} chunk Qdrant'a yüklendi ({total_points - uploaded} başarısız)")

    # ------------------------ Search ------------------------
    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = 0.7) -> List[Dict]:
        """Search the collection for chunks similar to ``query``.

        Args:
            query: Free-text query.
            limit: Maximum number of hits.
            score_threshold: Score threshold forwarded to Qdrant.

        Returns:
            List of ``{'score': ..., 'payload': ...}`` dicts.
        """
        # Embed the query and project it exactly like the stored chunks.
        query_emb = self.model.encode([query], convert_to_tensor=True).to(self.device)
        with torch.no_grad():
            query_emb = self.reducer(query_emb)
        query_emb = query_emb[0].cpu().tolist()

        # NOTE(review): newer qdrant-client releases appear to favour
        # `query_points` over `search` — confirm against the installed version.
        search_results = self.qdrant_client.search(
            collection_name=self.config.COLLECTION_NAME,
            query_vector=query_emb,
            limit=limit,
            score_threshold=score_threshold,
            with_payload=True
        )

        return [{'score': r.score, 'payload': r.payload} for r in search_results]

    # ------------------------ Collection Info ------------------------
    def get_collection_info(self) -> dict:
        """Return name, point/vector counts and status of the collection.

        Returns:
            The info dict, or ``{"error": <message>}`` when the lookup fails.
        """
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                # Tolerate client versions whose info object has no
                # `vectors_count` rather than failing the whole lookup.
                "vectors_count": getattr(info, "vectors_count", None),
                "status": info.status
            }
        except Exception as e:
            return {"error": str(e)}

# ------------------------ Pipeline ------------------------
class YargitayPipeline:
    """Orchestrates the CSV -> chunks -> Qdrant run and the interactive search loop."""

    # Number of characters of chunk text shown per search hit.
    PREVIEW_CHARS = 300
    # Number of hits used when the user gives no (or an invalid) limit.
    DEFAULT_LIMIT = 5

    def __init__(self, config: "Config"):
        """Create the processor for ``config`` and keep the config for later use."""
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        """Recreate the collection, chunk the CSV, upload and print timings.

        Args:
            csv_path: CSV to process; falls back to ``config.CSV_FILE``.

        Returns:
            False when the CSV produced no chunks, True otherwise.
        """
        csv_path = csv_path or self.config.CSV_FILE
        print("🚀 Yargıtay Semantic Pipeline Başlıyor")

        start_total = time.time()
        self.processor.create_qdrant_collection(recreate=True)

        start_chunk = time.time()
        chunks = self.processor.process_csv_file(csv_path)
        end_chunk = time.time()

        if not chunks:
            print("❌ İşlenecek chunk yok")
            return False

        start_upload = time.time()
        self.processor.upload_to_qdrant(chunks)
        end_upload = time.time()

        end_total = time.time()
        info = self.processor.get_collection_info()
        if "error" in info:
            # Otherwise a failed lookup would silently show up as 0 points.
            print(f"⚠️ Koleksiyon bilgisi alınamadı: {info['error']}")

        print("\n📊 Pipeline Süreleri ve İstatistikler:")
        print(json.dumps({
            "collection_name": self.config.COLLECTION_NAME,
            "points_uploaded": info.get("points_count", 0),
            "chunk_creation_time_s": round(end_chunk - start_chunk, 2),
            "embedding_and_upload_time_s": round(end_upload - start_upload, 2),
            "total_pipeline_time_s": round(end_total - start_total, 2)
        }, indent=2, ensure_ascii=False))

        return True

    # ------------------------ Interactive Search ------------------------
    @staticmethod
    def _format_result(rank, result, preview_chars=300):
        """Build the display line for one search hit.

        The ellipsis is appended only when the text was actually truncated.
        """
        payload = result['payload'] or {}
        text = payload.get('text', '')
        preview = text[:preview_chars]
        if len(text) > preview_chars:
            preview += "..."
        return f"\n{rank}. Score: {result['score']:.3f}, Esas No: {payload.get('esas_no')}, Metin: {preview}"

    def interactive_search(self):
        """Prompt for queries until the user enters 'q', 'quit' or 'exit'."""
        while True:
            query = input("🔍 Arama metni (çıkmak için 'q'): ").strip()
            if query.lower() in ['q', 'quit', 'exit']:
                break
            if not query:
                # Do not embed and search an empty string.
                print("❌ Boş sorgu, lütfen bir arama metni girin")
                continue

            limit_input = input("Kaç sonuç? (default 5): ").strip()
            try:
                limit = int(limit_input) if limit_input else self.DEFAULT_LIMIT
                if limit < 1:
                    raise ValueError(limit_input)
            except ValueError:
                print("❌ Geçersiz sayı, varsayılan 5 kullanılıyor")
                limit = self.DEFAULT_LIMIT

            results = self.processor.search_semantic(query, limit=limit)
            if not results:
                # The score threshold can filter out every hit; say so
                # instead of printing nothing.
                print("ℹ️ Sonuç bulunamadı")
                continue
            for i, r in enumerate(results, 1):
                print(self._format_result(i, r, self.PREVIEW_CHARS))

# ------------------------ Main ------------------------
def main():
    """Run the interactive console menu for the pipeline.

    Builds the configuration and the pipeline once, then repeatedly shows the
    menu and dispatches the chosen action until option 4 is selected.
    """
    config = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/yargitay_cleaned_2025-08-21.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="google_embeddinggemma_chunks",
        DIMENSION=512
    )
    pipeline = YargitayPipeline(config)

    def run_full_pipeline():
        # An empty answer keeps the configured CSV path.
        entered = input(f"CSV dosya yolu (Enter: {config.CSV_FILE}): ").strip()
        pipeline.full_pipeline(entered or config.CSV_FILE)

    def show_collection_info():
        collection_info = pipeline.processor.get_collection_info()
        print(json.dumps(collection_info, indent=2, ensure_ascii=False))

    # Menu option -> action; option "4" (exit) is handled in the loop itself.
    actions = {
        "1": run_full_pipeline,
        "2": pipeline.interactive_search,
        "3": show_collection_info,
    }

    while True:
        print("\n1. Full pipeline çalıştır\n2. Arama yap\n3. Koleksiyon bilgisi\n4. Çıkış")
        choice = input("Seçim: ").strip()
        if choice == "4":
            break
        action = actions.get(choice)
        if action is None:
            print("❌ Geçersiz seçim")
            continue
        action()

# Start the interactive menu only when this code runs as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()
