In [5]:
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall


In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
from qdrant_client.http import models
import uuid
from tqdm import tqdm
import json
from typing import List, Dict, Any
import re
from dotenv import load_dotenv
import os

load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

qdrant_url = os.getenv("QDRANT_URL")
api_key = os.getenv("QDRANT_API_KEY")
collection_name = os.getenv("QDRANT_COLLECTION")

In [7]:
class LegalDocumentVectorDB:
    def __init__(self, qdrant_url: str, api_key: str, collection_name: str = "hukuki kararlar"):
        """
        Hukuki belgeler için vector database sınıfı
        
        Args:
            qdrant_url: Qdrant sunucu URL'i
            api_key: Qdrant API anahtarı  
            collection_name: Collection adı
        """
        self.client = QdrantClient(
            url=qdrant_url,
            api_key=api_key,
            timeout=60
        )
        self.collection_name = collection_name
        
        # Türkçe için optimize edilmiş multilingual model
        print("Sentence Transformer modeli yükleniyor...")
        self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
        print("Model başarıyla yüklendi!")
        
        # Vector boyutunu al
        self.vector_size = self.model.get_sentence_embedding_dimension()
        print(f"Vector boyutu: {self.vector_size}")

    def create_collection(self, recreate: bool = False):
        """Collection oluştur"""
        try:
            if recreate:
                self.client.delete_collection(collection_name=self.collection_name)
                print(f"Eski collection '{self.collection_name}' silindi.")
        except:
            pass
        
        try:
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=self.vector_size, 
                    distance=Distance.COSINE
                )
            )
            print(f"Collection '{self.collection_name}' oluşturuldu.")
        except Exception as e:
            if "already exists" in str(e).lower():
                print(f"Collection '{self.collection_name}' zaten mevcut.")
            else:
                raise e

    def clean_text(self, text: str) -> str:
        """Metni temizle"""
        if pd.isna(text):
            return ""
        
        # Encoding sorunlarını düzelt
        text = str(text)
        replacements = {
            'Ã¤': 'ä', 'Ã¶': 'ö', 'Ã¼': 'ü', 'ÃŸ': 'ß',
            'Ã‡': 'Ç', 'Ä±': 'ı', 'Ä°': 'İ', 'ÅŸ': 'ş',
            'Ä\x9f': 'ğ', 'Ã§': 'ç', 'Ã¶': 'ö', 'Ã¼': 'ü'
        }
        
        for old, new in replacements.items():
            text = text.replace(old, new)
        
        # Fazla boşlukları temizle
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()
        
        return text

    def process_csv(self, csv_path: str) -> pd.DataFrame:
        """CSV dosyasını işle"""
        print(f"CSV dosyası okunuyor: {csv_path}")
        
        # Dosya varlığını kontrol et
        import os
        if not os.path.exists(csv_path):
            print(f"❌ HATA: Dosya bulunamadı: {csv_path}")
            return None
        
        try:
            # Encoding denemesi
            try:
                df = pd.read_csv(csv_path, encoding='utf-8')
                print("✅ UTF-8 encoding ile başarıyla okundu")
            except UnicodeDecodeError:
                df = pd.read_csv(csv_path, encoding='latin-1')
                print("✅ Latin-1 encoding ile başarıyla okundu")
        except Exception as e:
            print(f"❌ CSV okuma hatası: {e}")
            return None
        
        print(f"Toplam satır sayısı: {len(df)}")
        print(f"Sütunlar: {df.columns.tolist()}")
        
        # Boş chunk_text'leri filtrele
        initial_count = len(df)
        df = df.dropna(subset=['chunk_text'])
        df = df[df['chunk_text'].str.strip() != '']
        final_count = len(df)
        
        print(f"Boş metin filtrelemesi: {initial_count} -> {final_count}")
        
        # Metinleri temizle
        df['chunk_text_clean'] = df['chunk_text'].apply(self.clean_text)
        
        return df

    def create_embeddings_batch(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
        """Metinleri batch halinde embedding'e çevir"""
        embeddings = []
        
        for i in tqdm(range(0, len(texts), batch_size), desc="Embedding oluşturuluyor"):
            batch = texts[i:i+batch_size]
            batch_embeddings = self.model.encode(
                batch, 
                convert_to_numpy=True,
                show_progress_bar=False,
                normalize_embeddings=True  # Cosine similarity için normalize et
            )
            embeddings.extend(batch_embeddings.tolist())
            print(embeddings)
        
        return embeddings

    def upload_to_qdrant(self, df: pd.DataFrame, batch_size: int = 100):
        """DataFrame'i Qdrant'a yükle"""
        print("Qdrant'a yükleme başlıyor...")
        
        # Embeddings oluştur
        texts = df['chunk_text_clean'].tolist()
        embeddings = self.create_embeddings_batch(texts, batch_size=32)
        
        # Points oluştur
        points = []
        for idx, (_, row) in enumerate(df.iterrows()):
            payload = {
                "document_id": str(row['_id']),
                "location": str(row['location']),
                "esas_no": str(row['esasNo']),
                "karar_no": str(row['kararNo']),
                "dates": str(row['extractedDates']),
                "daire": str(row['daire']),
                "mahkeme": str(row['mahkeme']),
                "karar_turu": str(row['karar_turu']),
                "chunk_id": str(row['chunk_id']),
                "chunk_index": int(row['chunk_index']),
                "total_chunks": int(row['total_chunks']),
                "chunk_text": str(row['chunk_text_clean']),
                "chunk_length": int(row['chunk_length'])
            }
            
            point = PointStruct(
                id=str(uuid.uuid4()),
                vector=embeddings[idx],
                payload=payload
            )
            points.append(point)
        
        # Batch halinde yükle
        for i in tqdm(range(0, len(points), batch_size), desc="Qdrant'a yükleniyor"):
            batch_points = points[i:i+batch_size]
            
            try:
                self.client.upsert(
                    collection_name=self.collection_name,
                    points=batch_points
                )
            except Exception as e:
                print(f"Yükleme hatası (batch {i//batch_size + 1}): {e}")
                continue
        
        print(f"Toplam {len(points)} doküman başarıyla yüklendi!")

    def search(self, query: str, limit: int = 5, score_threshold: float = 0.5) -> List[Dict[str, Any]]:
        """Semantik arama yap"""
        print(f"Arama yapılıyor: '{query}'")
        
        # Query embedding'i oluştur
        query_embedding = self.model.encode(
            [query], 
            convert_to_numpy=True,
            normalize_embeddings=True
        )[0].tolist()
        
        # Arama yap (query_points kullan)
        search_results = self.client.query_points(
            collection_name=self.collection_name,
            query=query_embedding,
            limit=limit,
            score_threshold=score_threshold
        )
        
        results = []
        for result in search_results.points:
            results.append({
                "score": result.score,
                "payload": result.payload
            })
        
        return results

    def advanced_search(self, query: str, filters: Dict = None, limit: int = 5) -> List[Dict[str, Any]]:
        """Gelişmiş filtreleme ile arama"""
        query_embedding = self.model.encode(
            [query], 
            convert_to_numpy=True,
            normalize_embeddings=True
        )[0].tolist()
        
        # Filter oluştur
        filter_conditions = None
        if filters:
            conditions = []
            
            if 'daire' in filters:
                conditions.append(models.FieldCondition(
                    key="daire",
                    match=models.MatchValue(value=filters['daire'])
                ))
            
            if 'karar_turu' in filters:
                conditions.append(models.FieldCondition(
                    key="karar_turu", 
                    match=models.MatchValue(value=filters['karar_turu'])
                ))
            
            if 'year' in filters:
                conditions.append(models.FieldCondition(
                    key="dates",
                    match=models.MatchText(text=str(filters['year']))
                ))
            
            if conditions:
                filter_conditions = models.Filter(must=conditions)
        
        # query_points kullan
        search_results = self.client.query_points(
            collection_name=self.collection_name,
            query=query_embedding,
            query_filter=filter_conditions,
            limit=limit
        )
        
        results = []
        for result in search_results.points:
            results.append({
                "score": result.score,
                "payload": result.payload
            })
        
        return results

    def get_collection_info(self):
        """Collection bilgilerini getir"""
        try:
            info = self.client.get_collection(collection_name=self.collection_name)
            result = {
                "status": str(info.status),
                "vectors_count": info.vectors_count if hasattr(info, 'vectors_count') else 0,
            }
            
            # Mevcut attributeları kontrol et ve ekle
            if hasattr(info, 'segments_count'):
                result["segments_count"] = info.segments_count
            if hasattr(info, 'indexed_vectors_count'):
                result["indexed_vectors_count"] = info.indexed_vectors_count
            if hasattr(info, 'points_count'):
                result["points_count"] = info.points_count
                
            return result
        except Exception as e:
            return {"error": str(e)}

In [8]:
class RAGEvaluator:
    def __init__(self, csv_path: str):
        """
        CSV'den verileri yükleyen ve RAG değerlendirmesi yapan sınıf.
        """
        self.csv_path = csv_path
        self.df = None
        self.dataset = None

    def load_csv(self):
        """CSV dosyasını oku ve gerekli sütunları hazırla"""
        print(f"CSV okunuyor: {self.csv_path}")
        self.df = pd.read_csv(self.csv_path)
        print(f"Toplam satır: {len(self.df)}")

        # Context sütunu oluştur
        self.df["contexts"] = self.df.apply(
            lambda row: [
                f"Daire: {row['daire']}",
                f"Mahkeme: {row['mahkeme']}",
                f"Karar Türü: {row['karar_turu']}"
            ],
            axis=1
        )

        # question ve ground_truth sütunları oluştur
        self.df["question"] = self.df["chunk_text"]  # Burayı gerçek soru ile değiştirilebilir
        self.df["ground_truth"] = self.df["esasNo"].astype(str)  # veya kararNo

    def prepare_dataset(self):
        """Pandas DataFrame'i HuggingFace Dataset formatına çevir"""
        if self.df is None:
            raise ValueError("CSV yüklenmemiş. Önce load_csv() çağır.")
        
        self.dataset = Dataset.from_pandas(
            self.df[["question", "chunk_text", "contexts", "ground_truth"]].rename(
                columns={"chunk_text": "answer"}
            )
        )
        print("Dataset hazır.")

    def evaluate_rag(self):
        """RAG değerlendirmesini çalıştır"""
        if self.dataset is None:
            raise ValueError("Dataset hazırlanmadı. Önce prepare_dataset() çağır.")
        
        print("RAG değerlendirmesi başlatılıyor...")
        results = evaluate(
            self.dataset,
            metrics=[faithfulness, answer_relevancy, context_precision, context_recall]
        )
        return results


def main():
    CSV_FILE = "/home/yapayzeka/ahsen_bulbul/model/langchain/recursive/yargitay_chunks.csv"

    # Qdrant veri tabanını hazırla (önceden yazdığın LegalDocumentVectorDB)
    db = LegalDocumentVectorDB(
        qdrant_url=qdrant_url,
        api_key=api_key,
        collection_name="hukuki_kararlar"
    )
    db.create_collection(recreate=True)
    df = db.process_csv(CSV_FILE)
    if df is not None:
        db.upload_to_qdrant(df, batch_size=50)

    # Collection bilgilerini göster
    info = db.get_collection_info()
    print("\n=== Qdrant Collection Bilgileri ===")
    print(json.dumps(info, indent=2, ensure_ascii=False))

    # Örnek aramalar
    print("\n=== Örnek Aramalar ===")
    results = db.search("ihtiyati tedbir tazminat", limit=3, score_threshold=0.6)
    for i, result in enumerate(results, 1):
        print(f"\n{i}. Sonuç (Skor: {result['score']:.3f})")
        print(f"   Daire: {result['payload']['daire']}")
        print(f"   Esas No: {result['payload']['esas_no']}")
        print(f"   Karar No: {result['payload']['karar_no']}")
        print(f"   Metin: {result['payload']['chunk_text'][:200]}...")

    # RAG Değerlendirmesi
    evaluator = RAGEvaluator(CSV_FILE)
    evaluator.load_csv()
    evaluator.prepare_dataset()
    rag_results = evaluator.evaluate_rag()

    print("\n=== RAG Değerlendirme Sonuçları ===")
    for metric, score in rag_results.items():
        print(f"{metric}: {score:.4f}")


if __name__ == "__main__":
    main()

Sentence Transformer modeli yükleniyor...
Model başarıyla yüklendi!
Vector boyutu: 384
Eski collection 'hukuki_kararlar' silindi.
Collection 'hukuki_kararlar' oluşturuldu.
CSV dosyası okunuyor: /home/yapayzeka/ahsen_bulbul/model/langchain/recursive/yargitay_chunks.csv
✅ UTF-8 encoding ile başarıyla okundu
Toplam satır sayısı: 137
Sütunlar: ['_id', 'location', 'esasNo', 'kararNo', 'extractedDates', 'esasNo_num', 'esasNo_tip', 'kararNo_num', 'kararNo_tip', 'daire', 'mahkeme', 'karar_turu', 'chunk_id', 'chunk_index', 'total_chunks', 'chunk_text', 'chunk_length']
Boş metin filtrelemesi: 137 -> 137
Qdrant'a yükleme başlıyor...


Embedding oluşturuluyor:  60%|██████    | 3/5 [00:00<00:00, 26.70it/s]

[[-0.041067663580179214, 0.06863519549369812, -0.025075774639844894, -0.06259750574827194, -0.08664493262767792, 0.01681516133248806, 0.028734521940350533, 0.06398266553878784, 0.05048501119017601, 0.038064613938331604, 0.031221207231283188, -0.02561652660369873, 0.02593233250081539, 0.029812514781951904, -0.005251473281532526, 0.019011443480849266, 0.003325950587168336, -0.07138263434171677, -0.03624581918120384, 0.0008525234879925847, -0.0012228427221998572, -0.011073645204305649, -0.10230793058872223, -0.0915178656578064, 0.026138214394450188, -0.06284383684396744, 0.024419661611318588, -0.009784658439457417, 0.016267582774162292, -0.01888042874634266, -0.057067036628723145, 0.029935890808701515, -0.02344777248799801, 0.015788359567523003, 0.00569051131606102, -0.061686668545007706, 0.05239582434296608, -0.009352656081318855, -0.016751157119870186, 0.009866069070994854, -0.037226445972919464, 0.01884128339588642, 6.380413833539933e-05, 0.08587002754211426, 0.049117740243673325, 0.05

Embedding oluşturuluyor: 100%|██████████| 5/5 [00:00<00:00, 24.30it/s]


[[-0.041067663580179214, 0.06863519549369812, -0.025075774639844894, -0.06259750574827194, -0.08664493262767792, 0.01681516133248806, 0.028734521940350533, 0.06398266553878784, 0.05048501119017601, 0.038064613938331604, 0.031221207231283188, -0.02561652660369873, 0.02593233250081539, 0.029812514781951904, -0.005251473281532526, 0.019011443480849266, 0.003325950587168336, -0.07138263434171677, -0.03624581918120384, 0.0008525234879925847, -0.0012228427221998572, -0.011073645204305649, -0.10230793058872223, -0.0915178656578064, 0.026138214394450188, -0.06284383684396744, 0.024419661611318588, -0.009784658439457417, 0.016267582774162292, -0.01888042874634266, -0.057067036628723145, 0.029935890808701515, -0.02344777248799801, 0.015788359567523003, 0.00569051131606102, -0.061686668545007706, 0.05239582434296608, -0.009352656081318855, -0.016751157119870186, 0.009866069070994854, -0.037226445972919464, 0.01884128339588642, 6.380413833539933e-05, 0.08587002754211426, 0.049117740243673325, 0.05

Qdrant'a yükleniyor: 100%|██████████| 3/3 [00:00<00:00, 29.23it/s]

Toplam 137 doküman başarıyla yüklendi!

=== Qdrant Collection Bilgileri ===
{
  "status": "green",
  "vectors_count": null,
  "segments_count": 8,
  "indexed_vectors_count": 0,
  "points_count": 137
}

=== Örnek Aramalar ===
Arama yapılıyor: 'ihtiyati tedbir tazminat'

1. Sonuç (Skor: 0.682)
   Daire: 6. Hukuk Dairesi
   Esas No: 2022/3281 E.
   Karar No: 2024/117 K.
   Metin: . maddesi, ihtiyati tedbir kararının haksız olduğunun belirlenmesi halinde tedbir kararı yüzünden uğranılan zararın tazminini düzenlediğini, ihtiyati tedbir kararını icra ettiren tarafın yasal sürede ...

2. Sonuç (Skor: 0.641)
   Daire: 6. Hukuk Dairesi
   Esas No: 2022/3281 E.
   Karar No: 2024/117 K.
   Metin: . maddesi "Aksi takdirde ihtiyati tedbir bir gûna merasime hacet kalmaksızın kendiliğinden kalkar ve iktizasına göre vazolunan tedbirin fiilen kaldırılması ihtiyati tedbiri tatbik eden daire veya memu...
CSV okunuyor: /home/yapayzeka/ahsen_bulbul/model/langchain/recursive/yargitay_chunks.csv
Toplam satır




OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable