# In [None]:
# SemChunk + Google EmbeddingGemma + Qdrant Entegrasyon
# Yargıtay Kararları için Semantic Chunking Pipeline

import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import uuid
from typing import List, Dict
import os
from dataclasses import dataclass
import json
from dotenv import load_dotenv

# Load environment variables from the project's .env file and echo
# load_dotenv's return value so the outcome is visible at startup.
print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

# Hugging Face access token taken from HF_API_KEY; os.getenv yields None
# when the variable is not set.
hf_token = os.getenv("HF_API_KEY")

# Configuration
@dataclass
class Config:
    """Settings shared by the chunking, embedding and Qdrant steps."""

    # Google EmbeddingGemma settings
    MODEL_NAME: str = "google/embeddinggemma-300m"
    # Defaults to the module-level hf_token, i.e. the os.getenv("HF_API_KEY")
    # result, so it is None when the variable is not set.
    HF_TOKEN: str = hf_token

    # SemChunk settings
    TOKEN_SIZE: int = 512
    ENCODING_NAME: str = "cl100k_base"

    # Qdrant settings
    QDRANT_URL: str = "http://localhost:6333"
    COLLECTION_NAME: str = "gemma_semantic_chunks"
    DIMENSION: int = 768  # embedding size of embeddinggemma-300m

    # File settings
    CSV_FILE: str = "/home/yapayzeka/ahsen_bulbul/data/yargitay_cleaned_2025-08-21.csv"
    BATCH_SIZE: int = 256

class YargitaySemanticProcessor:
    """Semantic chunking and vector search for Yargıtay decisions.

    Combines three components created in ``__init__``: a SemChunk chunker
    driven by a tiktoken encoding, a SentenceTransformer embedding model and
    a Qdrant client.
    """

    def __init__(self, config: "Config"):
        """Create the chunker, the embedding model and the Qdrant client.

        Args:
            config: Pipeline settings. ENCODING_NAME, TOKEN_SIZE, MODEL_NAME,
                HF_TOKEN and QDRANT_URL are read here; the instance keeps a
                reference for the remaining settings.
        """
        self.config = config

        # SemChunk chunker; chunk size is counted in tokens of this encoding.
        # NOTE(review): the encoding is tiktoken's, not the embedding model's
        # own tokenizer - confirm the chunk size suits the model's input limit.
        self.encoding = tiktoken.get_encoding(config.ENCODING_NAME)
        self.chunker = semchunk.chunkerify(self.encoding, config.TOKEN_SIZE)

        # SentenceTransformer model
        self.model = SentenceTransformer(
            config.MODEL_NAME,
            token=config.HF_TOKEN
        )

        # Qdrant client
        self.qdrant_client = QdrantClient(url=config.QDRANT_URL)

        print(f"✅ SemChunk hazır (Token boyutu: {config.TOKEN_SIZE})")
        print(f"✅ Google EmbeddingGemma modeli hazır")
        print(f"✅ Qdrant client hazır ({config.QDRANT_URL})")

    def create_qdrant_collection(self, recreate: bool = False):
        """Make sure the configured collection exists.

        Args:
            recreate: When True, try to delete the collection first. The
                deletion is best-effort: a failure is reported and creation
                is still attempted.

        Raises:
            Exception: Whatever the Qdrant client raises while listing or
                creating collections is printed and re-raised.
        """
        collection_name = self.config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except Exception as e:
                # Keep going, but do not hide why the old data is still there.
                print(f"⚠️ Eski koleksiyon silinemedi: {e}")

        try:
            collections = self.qdrant_client.get_collections().collections
            collection_names = [c.name for c in collections]

            if collection_name not in collection_names:
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=VectorParams(
                        size=self.config.DIMENSION,
                        distance=Distance.COSINE
                    )
                )
                print(f"✅ Koleksiyon oluşturuldu: {collection_name}")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {collection_name}")

        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Split *text* with the chunker and describe every non-empty chunk.

        Args:
            text: Text to split. Empty or whitespace-only text yields ``[]``.
            metadata: Optional key/value pairs merged into every chunk dict.

        Returns:
            One dict per non-blank chunk with ``chunk_id`` (position in the
            chunker output), ``text`` (stripped), ``token_count`` and
            ``char_count``, plus the metadata entries.
        """
        if not text or not text.strip():
            return []

        result_chunks = []
        for i, chunk_text in enumerate(self.chunker(text)):
            cleaned = chunk_text.strip()
            if not cleaned:
                continue
            # Counts are taken from the stripped text so they describe exactly
            # what is stored in 'text'.
            chunk_data = {
                'chunk_id': i,
                'text': cleaned,
                'token_count': len(self.encoding.encode(cleaned)),
                'char_count': len(cleaned),
            }
            if metadata:
                chunk_data.update(metadata)
            result_chunks.append(chunk_data)
        return result_chunks

    def create_embeddings(self, texts: List[str], batch_size: int = 256) -> List[List[float]]:
        """Encode *texts* with the model, ``batch_size`` texts per call.

        Args:
            texts: Texts to embed; an empty list yields ``[]``.
            batch_size: Number of texts handed to ``model.encode`` at once.

        Returns:
            One embedding (list of floats) per input text, in input order.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            # A negative step would make range() empty and silently return [].
            raise ValueError("batch_size must be at least 1")

        all_embeddings = []
        total = len(texts)
        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            batch_embeddings = self.model.encode(batch_texts, show_progress_bar=True)
            all_embeddings.extend(batch_embeddings.tolist())
            print(f"  🔹 Embedding oluşturuldu: {i + len(batch_texts)}/{total}")
        return all_embeddings

    @staticmethod
    def _clean_value(value):
        """Return *value* in a form that is safe to put in a JSON payload."""
        if pd.api.types.is_scalar(value) and pd.isna(value):
            # Missing cells (None / NaN / NaT) become '' - the same value that
            # is used when the column does not exist at all.
            return ''
        if isinstance(value, np.generic):
            return value.item()
        return value

    @staticmethod
    def _first_text(row) -> str:
        """Return the first usable text among the 'rawText' and 'text' cells."""
        for column in ('rawText', 'text'):
            value = row.get(column, '')
            # NaN is truthy, so missing cells must be tested explicitly before
            # the plain falsy check.
            if not pd.api.types.is_scalar(value) or pd.isna(value) or not value:
                continue
            value = str(value)
            if value.strip():
                return value
        return ''

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """Read a CSV of decisions and chunk the text of every row.

        The text is taken from the 'rawText' column, falling back to 'text'
        when 'rawText' is missing or empty. Rows without text are skipped.

        Args:
            csv_path: Path of the CSV file.

        Returns:
            All chunk dicts, each carrying the row metadata
            (``original_index``, ``esas_no``, ``karar_no``, ``daire``,
            ``tarih``). ``[]`` when the file cannot be read.
        """
        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return []

        all_chunks = []
        total_rows = len(df)
        print(f"📄 Toplam {total_rows} satır işlenecek")

        # Progress is counted by position so it does not depend on the index
        # labels and is also reported when the row itself is skipped.
        for position, (idx, row) in enumerate(df.iterrows(), start=1):
            text = self._first_text(row)
            if text:
                metadata = {
                    'original_index': idx,
                    'esas_no': self._clean_value(row.get('esasNo', '')),
                    'karar_no': self._clean_value(row.get('kararNo', '')),
                    'daire': self._clean_value(row.get('location', '')),
                    'tarih': self._clean_value(row.get('extractedDates', ''))
                }
                all_chunks.extend(self.semantic_chunk_text(text, metadata))

            if position % 50 == 0 or position == total_rows:
                print(f"  ✅ İşlenen satır: {position}/{total_rows} (Toplam chunk: {len(all_chunks)})")

        print(f"🧩 Toplam {len(all_chunks)} chunk oluşturuldu")
        return all_chunks

    def upload_to_qdrant(self, chunks: List[Dict]):
        """Embed *chunks* and upsert them into the configured collection.

        Each chunk dict becomes the payload of one point with a random UUID
        id. Points are sent in batches of ``config.BATCH_SIZE``; a failing
        batch is reported and the remaining batches are still attempted.

        Args:
            chunks: Chunk dicts containing at least a ``text`` entry.
        """
        if not chunks:
            print("❌ Yüklenecek chunk yok")
            return

        texts = [chunk['text'] for chunk in chunks]
        print("🔮 Embedding'ler oluşturuluyor...")
        embeddings = self.create_embeddings(texts)

        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload=chunk
            ) for chunk, embedding in zip(chunks, embeddings)
        ]

        batch_size = self.config.BATCH_SIZE
        total_points = len(points)
        print(f"🚀 {total_points} chunk Qdrant'a yükleniyor ({batch_size} batch size)")

        uploaded = 0
        failed_batches = 0
        for i in range(0, total_points, batch_size):
            batch = points[i:i + batch_size]
            try:
                self.qdrant_client.upsert(
                    collection_name=self.config.COLLECTION_NAME,
                    points=batch
                )
            except Exception as e:
                failed_batches += 1
                print(f"❌ Batch yükleme hatası: {e}")
            else:
                uploaded += len(batch)
                print(f"  ✅ Batch yüklendi: {min(i + batch_size, total_points)}/{total_points}")

        # Only claim success when every batch actually went through.
        if failed_batches:
            print(f"⚠️ {uploaded}/{total_points} chunk yüklendi, {failed_batches} batch başarısız oldu")
        else:
            print(f"🎉 {total_points} chunk Qdrant'a yüklendi!")

    def search_semantic(self, query: str, limit: int = 10, score_threshold: float = 0.7) -> List[Dict]:
        """Run a semantic search for *query*.

        Args:
            query: Free-text query; it is embedded with the same model.
            limit: Maximum number of hits requested from Qdrant.
            score_threshold: Score threshold forwarded to Qdrant.

        Returns:
            A list of ``{'score': ..., 'payload': ...}`` dicts, one per hit.
        """
        query_embedding = self.model.encode([query])[0]
        search_results = self.qdrant_client.search(
            collection_name=self.config.COLLECTION_NAME,
            query_vector=query_embedding,
            limit=limit,
            score_threshold=score_threshold
        )
        return [{'score': p.score, 'payload': p.payload} for p in search_results]

    def get_collection_info(self) -> dict:
        """Return basic facts about the configured collection.

        Returns:
            A dict with ``collection_name``, ``points_count``,
            ``vectors_count`` and ``status``, or ``{"error": message}`` when
            the lookup fails.
        """
        try:
            info = self.qdrant_client.get_collection(self.config.COLLECTION_NAME)
            return {
                "collection_name": self.config.COLLECTION_NAME,
                "points_count": info.points_count,
                # Not every qdrant-client release exposes vectors_count; report
                # None instead of turning the whole call into an error.
                "vectors_count": getattr(info, "vectors_count", None),
                "status": info.status
            }
        except Exception as e:
            return {"error": str(e)}

# Pipeline class
class YargitayPipeline:
    """Runs the end-to-end indexing flow and an interactive search prompt."""

    def __init__(self, config: "Config"):
        """Create the processor that does the actual work.

        Args:
            config: Pipeline settings; also kept for the default CSV path.
        """
        self.processor = YargitaySemanticProcessor(config)
        self.config = config

    def full_pipeline(self, csv_path: str = None):
        """Chunk a CSV, rebuild the collection and upload the chunks.

        Args:
            csv_path: CSV to index; ``config.CSV_FILE`` when omitted or empty.

        Returns:
            True when chunks were produced and handed to the uploader, False
            when the CSV produced no chunks.
        """
        csv_path = csv_path or self.config.CSV_FILE
        print("🚀 Yargıtay Semantic Pipeline Başlıyor")

        # Chunk first: the existing collection is only dropped once there is
        # something to replace it with, so an unreadable or empty CSV cannot
        # wipe previously indexed data.
        chunks = self.processor.process_csv_file(csv_path)
        if not chunks:
            print("❌ İşlenecek chunk bulunamadı")
            return False

        self.processor.create_qdrant_collection(recreate=True)
        self.processor.upload_to_qdrant(chunks)
        info = self.processor.get_collection_info()
        print(json.dumps(info, indent=2, ensure_ascii=False))
        return True

    def interactive_search(self):
        """Prompt for queries in a loop until the user enters q/quit/exit."""
        default_limit = 5
        while True:
            query = input("🔍 Arama metni (çıkmak için 'q'): ").strip()
            if query.lower() in ('q', 'quit', 'exit'):
                break
            if not query:
                # Nothing to search for; ask again.
                continue

            raw_limit = input("Kaç sonuç? (default 5): ").strip()
            try:
                limit = int(raw_limit) if raw_limit else default_limit
            except ValueError:
                # A typo must not end the whole session.
                print(f"❌ Geçersiz sayı, {default_limit} kullanılıyor")
                limit = default_limit
            if limit < 1:
                limit = default_limit

            results = self.processor.search_semantic(query, limit=limit)
            if not results:
                print("ℹ️ Sonuç bulunamadı")
                continue

            for i, r in enumerate(results, 1):
                payload = r['payload']
                text_preview = payload.get('text', '')[:300] + "..."
                print(f"\n{i}. Score: {r['score']:.3f}, Esas No: {payload.get('esas_no')}, Metin: {text_preview}")

# Main function
def main():
    """Build the pipeline and serve a small text menu until the user exits."""
    settings = Config(
        CSV_FILE="/home/yapayzeka/ahsen_bulbul/data/yargitay_cleaned_2025-08-21.csv",
        TOKEN_SIZE=512,
        QDRANT_URL="http://localhost:6333",
        COLLECTION_NAME="google_embeddinggemma_chunks",
        DIMENSION=768
    )
    runner = YargitayPipeline(settings)
    menu_text = "\n1. Full pipeline çalıştır\n2. Arama yap\n3. Koleksiyon bilgisi\n4. Çıkış"

    while True:
        print(menu_text)
        selection = input("Seçim: ")

        if selection == "4":
            break

        if selection == "1":
            # An empty answer keeps the configured CSV path.
            typed_path = input(f"CSV dosya yolu (Enter: {settings.CSV_FILE}): ").strip()
            runner.full_pipeline(typed_path or settings.CSV_FILE)
            continue

        if selection == "2":
            runner.interactive_search()
            continue

        if selection == "3":
            collection_info = runner.processor.get_collection_info()
            print(json.dumps(collection_info, indent=2, ensure_ascii=False))
            continue

        print("❌ Geçersiz seçim")

# Start the interactive menu only when the file is executed as a script.
if __name__ == "__main__":
    main()
