In [None]:
import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, HnswConfigDiff
from qdrant_client. models import  Prefetch, FusionQuery, Fusion, SparseVector
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict, Optional, Tuple
import os
from qdrant_client import models
from dataclasses import dataclass
import json
from qdrant_client.models import ScoredPoint
from dotenv import load_dotenv
import torch
from sentence_transformers import SentenceTransformer
from qdrant_client.http.models import NamedVector, NamedSparseVector, SparseVector, SearchRequest
from fastembed import SparseTextEmbedding, SparseEmbedding
from config import Config
from config import Models, model
from typing import List, Dict
print(load_dotenv("/home/ahsen/Masaüstü/stajProjesi/2025summerInternshipProject/qdrant/.env"))

  from .autonotebook import tqdm as notebook_tqdm


True


In [None]:
class ModelManager:
    
    def __init__(self, selected_model: str, runtime_config: Config):
        self.selected_model = selected_model
        self.runtime_config = runtime_config
        self.model_config = getattr(model, str(selected_model))  # config.py’deki model nesnesi
        self.model = None
        self.vectorizer = None

        
    def load_model(self):
        model_name = self.model_config.model_name
        model_type = self.model_config.model_type
        print(f"🔮 Model yükleniyor: {model_name} ({model_type})")

        if model_type == "bge":
            self.model = BGEM3FlagModel(
                model_name,
                use_fp16=self.model_config.USE_FP16,
                device=self.model_config.DEVICE
            )
        elif model_type == "sentence_transformer":
            self.model = SentenceTransformer(model_name, device=self.model_config.DEVICE)
            
        else:
            raise ValueError(f"Desteklenmeyen model tipi: {model_type}")

        print(f"✅ Model yüklendi: {model_name}")
        return True

    
    def encode_texts(model_manager, texts, target_dim=512):
        # Dense embedding
        result = model_manager.model.encode(texts, return_dense=True, return_sparse=True)
        dense_embeddings = result.get("dense_vecs", [])
        #dense_embeddings = model_manager.model.encode(texts, convert_to_numpy=True).tolist()
        dense_embeddings = [d[:target_dim] for d in dense_embeddings]  # truncate 512
        sparse_model = SparseTextEmbedding(model_name="Qdrant/bm25")
        # Sparse embedding
        sparse_embeddings_raw = list(sparse_model.embed(texts, batch_size=100))
        sparse_embeddings = []
        for s in sparse_embeddings_raw:
            sparse_embeddings.append({
                "indices": s.indices.tolist(),
                "values": s.values.tolist()
            })

        return dense_embeddings, sparse_embeddings
    
    


    def get_model_info(self) -> Dict:
        return {
            "model_name": self.model_config.model_name,
            "model_type": self.model_config.model_type,
            "embedding_dim": self.model_config.embedding_dim,
            "description": self.model_config.description,
            "loaded": self.model is not None
        }

In [None]:



class YargitaySemanticProcessor:

    def __init__(self, runtime_config: Config, selected_model: str):
        self.runtime_config = runtime_config
        self.model_manager = ModelManager(selected_model, runtime_config)
        self.model_manager.load_model()
        
        self.encoding = tiktoken.get_encoding("cl100k_base")
        self.chunker = semchunk.chunkerify(self.encoding, runtime_config.TOKEN_SIZE)

        self.qdrant_client = QdrantClient(url=runtime_config.QDRANT_URL)

        model_name = self.runtime_config.SPARSE_MODEL 
        # This triggers the model download
        self.sparse_model = SparseTextEmbedding(model_name=model_name)

    def create_qdrant_collection(self, recreate: bool = True):
        collection_name = Config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except Exception:
                pass

        try:
            existing = [c.name for c in self.qdrant_client.get_collections().collections]
            if collection_name not in existing:
                # Dense + Sparse (sparse için yine 512 dim)
                vectors_config = {
                    "dense_vec": models.VectorParams(size=self.runtime_config.embedding_dim, distance=models.Distance.COSINE),
                }
                sparse_config = {
                    "sparse_vec": models.SparseVectorParams(
                        index=models.SparseIndexParams(on_disk=False))
                }
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=vectors_config,
                    sparse_vectors_config = sparse_config
                )
                print(f"✅ Koleksiyon oluşturuldu: {collection_name} (Dense+Sparse)")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {collection_name}")
        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        if not text or not text.strip():
            return []
        chunks = self.chunker(text)
        result = []
        for i, c in enumerate(chunks):
            if c.strip():
                cd = {
                    'chunk_id': i,
                    'text': c.strip(),
                    'token_count': len(self.encoding.encode(c)),
                    'char_count': len(c)
                }
                if metadata:
                    cd.update(metadata)
                result.append(cd)
        return result


    def process_csv_file(self, csv_path: str = "/home/ahsen/Masaüstü/stajProjesi/2025summerInternshipProject/data/cleaned10chunk.csv") -> List[Dict]:
        df = pd.read_csv(csv_path)
        text_column = next((c for c in ['rawText','chunk_text','text','content','metin'] if c in df.columns), None)
        if not text_column:
            print("❌ Ana metin sütunu bulunamadı")
            return []

        all_chunks = []
        for idx, row in df.iterrows():
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            meta = {
                'original_index': idx,
                'esas_no': row.get('esasNo', ''),
                'karar_no': row.get('kararNo', ''),
                'daire': row.get('location', ''),
                'tarih': row.get('extractedDates', ''),
                'document_id': row.get('_id', ''),
            }
            chunks = self.semantic_chunk_text(str(text), meta)
            all_chunks.extend(chunks)
        
        
        return all_chunks

    def create_embeddings(self, texts: List[str], batch_size: int = None):
        """Dinamik model ile embedding oluştur"""
        batch_size = batch_size or self.runtime_config.BATCH_SIZE
        
        all_embeddings_dense, all_embeddings_sparse = [], []
        total = len(texts)
        print(f"🔮 {total} metin işleniyor (model: {self.runtime_config.model_name})...")

        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            try:
                dense, sparse = self.model_manager.encode_texts(batch_texts)
                all_embeddings_dense.extend(dense)
                all_embeddings_sparse.extend(sparse)
                
                print(f"  📊 Batch işlendi: {i + len(batch_texts)}/{total}")
                
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"❌ Embedding hatası (batch {i//batch_size+1}): {e}")
                # Fallback
                all_embeddings_dense.extend([[0.0]*self.runtime_config.embedding_dim for _ in batch_texts])
                all_embeddings_sparse.extend([{"indices": [], "values": []} for _ in batch_texts])

        return all_embeddings_dense, all_embeddings_sparse

    
    

    def upload_to_qdrant(self, chunks: List[Dict]):
        dense_embeddings, sparse_embeddings = self.model_manager.encode_texts([c["text"] for c in chunks])
        
        points = []
        for c, d, s in zip(chunks, dense_embeddings, sparse_embeddings):
            vector_dict = {"dense_vec": d[:512]}
            
            if s is not None:
                # s artık dict formatında: {"indices": [...], "values": [...]}
                indices = s.get("indices", [])
                values = s.get("values", [])
                
                if len(indices) > 0:
                    vector_dict["sparse_vec"] = SparseVector(indices=indices, values=values)
            
            points.append(PointStruct(id=str(uuid.uuid4()), vector=vector_dict, payload=c))
        
        # batch upload
        batch_size = self.runtime_config.BATCH_SIZE
        for i in range(0, len(points), batch_size):
            self.qdrant_client.upsert(
                collection_name=Config.COLLECTION_NAME,
                points=points[i:i+batch_size]
            )
        print(f"✅ {len(points)} noktalar Qdrant'a yüklendi!")


In [4]:
def select_model() -> str:
    print("🤖 Model Seçimi:")
    for name in vars(model):
        m = getattr(model, name)
        print(f"{name}: {m.description} (Dim: {m.embedding_dim})")
    choice = input("Model seçin (default bge_m3): ").strip() or "bge_m3"
    if choice not in vars(model):
        choice = "bge_m3"
    return choice


In [5]:
selected_model = select_model()
selected_model

🤖 Model Seçimi:
bge_m3: BGE-M3 - Çok dilli, dense+sparse embedding destekli (Dim: 1024)
bge_large: BGE Large - Sadece dense embedding (Dim: 1024)
multilingual_e5: E5 Multilingual Large - Çok dilli dense embedding (Dim: 1024)
turkish_bert: Turkish BERT - Türkçe özelleştirilmiş (Dim: 768)
distilbert_turkish: Hızlı Türkçe DistilBERT (Dim: 768)
all_mpnet: All-MiniLM - Genel amaçlı, hızlı (Dim: 768)


'bge_m3'

In [6]:
print(selected_model, type(selected_model))
selected_config = getattr(model, selected_model)
selected_config

bge_m3 <class 'str'>


Config(SPARSE_MODEL='Qdrant/bm25', USE_FP16=True, DEVICE='cpu', TOKEN_SIZE=512, ENCODING_NAME='cl100k_base', QDRANT_URL='http://localhost:6333', COLLECTION_NAME='deneme', CSV_FILE='/home/ahsen/Masaüstü/stajProjesi/2025summerInternshipProject/data/cleaned10chunk.csv', BATCH_SIZE=100, DB_BATCH=256, model_name='BAAI/bge-m3', model_type='bge', embedding_dim=1024, max_seq_length=8192, description='BGE-M3 - Çok dilli, dense+sparse embedding destekli')

In [7]:
processor = YargitaySemanticProcessor(Config, selected_model)
processor.create_qdrant_collection(recreate=True)

🔮 Model yükleniyor: BAAI/bge-m3 (bge)


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 143150.31it/s]


✅ Model yüklendi: BAAI/bge-m3


Fetching 18 files: 100%|██████████| 18/18 [00:01<00:00, 15.08it/s]


🗑️ Eski koleksiyon silindi: deneme
✅ Koleksiyon oluşturuldu: deneme (Dense+Sparse)


In [8]:
chunks = processor.process_csv_file(Config.CSV_FILE)
processor.upload_to_qdrant(chunks)
print("✅ Pipeline tamamlandı!")


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ 59 noktalar Qdrant'a yüklendi!
✅ Pipeline tamamlandı!


In [9]:
model_manager = ModelManager(selected_model, selected_config)
model_manager.load_model()

qdrant_client = QdrantClient(url=Config.QDRANT_URL)
qdrant_client

🔮 Model yükleniyor: BAAI/bge-m3 (bge)


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 95252.93it/s]


✅ Model yüklendi: BAAI/bge-m3


<qdrant_client.qdrant_client.QdrantClient at 0x7f4e97262aa0>

In [8]:
query = "ihtiyati tedbir taazminat nedir"

In [11]:
dense_emb = model_manager.encode_texts([query])
dense_emb

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


([array([-2.88481135e-02,  6.78069005e-03, -5.75112691e-03, -3.40650715e-02,
         -2.82505136e-02, -2.20167357e-02,  5.81906773e-02,  7.58740027e-03,
         -6.67017186e-03, -3.26118544e-02, -3.48149016e-02,  1.15371337e-02,
         -7.64328316e-02,  2.24253684e-02,  1.14032095e-02,  2.03358214e-02,
          4.02332060e-02,  1.02652507e-02,  3.65760177e-02, -4.24560234e-02,
         -3.14213037e-02, -2.76389010e-02, -4.19816701e-03, -1.05831483e-02,
         -2.48087626e-02,  2.93866638e-02,  2.91311257e-02,  2.21866872e-02,
          3.17512564e-02, -3.50190923e-02, -1.26664876e-03, -5.46354381e-03,
         -8.19494762e-03, -1.67326233e-03, -1.99179575e-02,  4.12217416e-02,
         -1.61503553e-02, -7.26180598e-02, -2.40964871e-02,  2.49175783e-02,
          1.20750535e-02,  4.71724989e-03,  2.31970730e-03, -1.39305284e-02,
          2.21467521e-02, -4.28945497e-02,  5.39295655e-03, -2.67709829e-02,
         -4.82691545e-03, -4.37719114e-02, -2.15039607e-02, -6.11547641e-02,

In [10]:
def search_semantic(query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
        model_manager = ModelManager(selected_model, selected_config)
        model_manager.load_model()
        qdrant_client = QdrantClient(url=Config.QDRANT_URL)
        """Dense semantic search"""

        try:
            
            dense_emb,sparse_emb = model_manager.encode_texts([query])
            query_vector = dense_emb[0][:512]
            sparse_emb=None
            
            qr = qdrant_client.query_points(
                collection_name=Config.COLLECTION_NAME,
                query=query_vector, 
                using="dense_vec",
                limit=5
            )
            

            results = [{"score": p.score, "payload": p.payload} for p in qr.points]
            print(f"📊 {len(results)} sonuç bulundu (Dense only)")
            return results

        except Exception as e:
            print(f"❌ Semantic search hatası: {e}")
            return []

In [11]:
import torch

print("CUDA mevcut mu:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU cihaz sayısı:", torch.cuda.device_count())
    print("Şu an kullanılan cihaz:", torch.cuda.current_device())
    print("Cihaz ismi:", torch.cuda.get_device_name(torch.cuda.current_device()))


CUDA mevcut mu: False


In [None]:
def search_hybrid(query: str, limit: int = 10, score_threshold: float = None) -> List[Dict]:
    model_manager = ModelManager(selected_model, selected_config)
    model_manager.load_model()
    qdrant_client = QdrantClient(url=Config.QDRANT_URL)
    """Hybrid search (Dense + Sparse)"""

    try:
        dense_emb,sparse_emb = model_manager.encode_texts([query])
        #query_vector = dense_emb[0][:512]
        

        s = sparse_emb[0]
        query_sparse_vector = SparseVector(indices=s["indices"], values=s["values"])
        qr=qdrant_client.query_points(
            collection_name=Config.COLLECTION_NAME,
            prefetch=[
                models.Prefetch(
                    query=query_sparse_vector,  # sparse vector
                    using="sparse_vec",
                    limit=5,
                ),
                models.Prefetch(
                    query=dense_emb[0],  # <-- dense vector
                    using="dense_vec",
                    limit=20,
                ),
            ],
            query=models.FusionQuery(fusion=models.Fusion.RRF),
        )
            
        results = [{"score": p.score, "payload": p.payload} for p in qr.points]
        print(f"📊 {len(results)} sonuç bulundu ")
        return results
        
    except Exception as e:
        print(f"❌ Hybrid search hatası: {e}")
        return []

In [12]:
query="ihtiyati tedbir tazminat "

In [None]:
model_manager = ModelManager(selected_model, selected_config)
model_manager.load_model()
qdrant_client = QdrantClient(url=Config.QDRANT_URL)

In [None]:
dense_emb,sparse_emb = model_manager.encode_texts([query])
query_vector = dense_emb[0][:512]
sparse_emb

In [13]:
def search():
    print("\n🔎 İnteraktif arama başlatıldı")
    print(processor.model_manager.get_model_info())
    
    while True:
            
            print(f"\n{'='*50}")
            print("🔍 ARAMA SEÇENEKLERİ")
            print("1-only dense")
            print("2-dense + sparse")
            choice = input("Seçiminiz (1/2/3, çıkmak için q): ").strip()
            if choice.lower() == 'q':
                print("Çıkılıyor...")
                break
            if choice not in ['1', '2', '3']:
                print("Geçersiz seçim, tekrar deneyin.")
                continue
            query = input("Arama sorgusu girin: ").strip()
            if not query:
                print("Boş sorgu, tekrar deneyin.")
                continue
            if choice == '1':
                results = search_semantic(query, limit=10, score_threshold=0.6)
            elif choice == '2':
                results = search_hybrid(query, limit=10, score_threshold=0.6)
            print(f"\n📊 {len(results)} sonuç bulundu:")
            
            for idx, r in enumerate(results, 1):
                print(f"{idx}. Score: {r['score']:.4f}, Text: {r['payload'].get('text','')[:200]}...")  # ilk 200 karakter




In [None]:
search()
print(search())



🔎 İnteraktif arama başlatıldı
{'model_name': 'BAAI/bge-m3', 'model_type': 'bge', 'embedding_dim': 1024, 'description': 'BGE-M3 - Çok dilli, dense+sparse embedding destekli', 'loaded': True}

🔍 ARAMA SEÇENEKLERİ
1-only dense
2-dense + sparse
🔮 Model yükleniyor: BAAI/bge-m3 (bge)


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 137218.23it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Model yüklendi: BAAI/bge-m3
📊 10 sonuç bulundu 

📊 10 sonuç bulundu:
1. Score: 1.0000, Text: Bölge Adliye Mahkemesinin yukarıda belirtilen kararına karşı süresi içinde davalı ... vekili temyiz isteminde bulunmuştur. 2. Dairemizin 17.02.2022 tarihli ve 2021/2532 Esas, 2022/901 Karar sayılı ila...
2. Score: 0.4333, Text: arasında arsa payı karşılığı inşaat sözleşmesi imzalandığını, sözleşme uyarınca arsa sahibi davacıya ait 2 ve 3 no.lu parsellerde bulunan payların davacı yüklenici şirkete devri karşılığı, yüklenici ş...
3. Score: 0.4000, Text: kişilere devretse bile 3. kişi ve daha sonraki devralanların iyiniyet savunmasında bulunmasının mümkün olmadığını, davalı ...’ın yükleniciye kat karşılığı inşaat sözleşmesi gereği avans olarak verilmi...
4. Score: 0.3333, Text: 6. Hukuk Dairesi 2022/3281 E. , 2024/117 K. \n "İçtihat Metni" MAHKEMESİ :Asliye Hukuk Mahkemesi Taraflar arasındaki tazminat davasından dolayı yapılan yargılama sonunda İlk Derece Mahkemesince davanı...
5. Score: 0.2500,

Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 135591.72it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Model yüklendi: BAAI/bge-m3
📊 5 sonuç bulundu (Dense only)

📊 5 sonuç bulundu:
1. Score: 0.6177, Text: Bölge Adliye Mahkemesinin yukarıda belirtilen kararına karşı süresi içinde davalı ... vekili temyiz isteminde bulunmuştur. 2. Dairemizin 17.02.2022 tarihli ve 2021/2532 Esas, 2022/901 Karar sayılı ila...
2. Score: 0.6098, Text: 6. Hukuk Dairesi 2022/3281 E. , 2024/117 K. \n "İçtihat Metni" MAHKEMESİ :Asliye Hukuk Mahkemesi Taraflar arasındaki tazminat davasından dolayı yapılan yargılama sonunda İlk Derece Mahkemesince davanı...
3. Score: 0.6022, Text: vekili cevap dilekçesinde özetle; ....A.Ş.'nin diğer davalı şirket ile yapmış olduğu sözleşme çerçevesinde davacıya dava konusu villanın satıldığını ve teslim edildiğini, dava konusu villanın tapusunu...
4. Score: 0.5953, Text: kişilere devretse bile 3. kişi ve daha sonraki devralanların iyiniyet savunmasında bulunmasının mümkün olmadığını, davalı ...’ın yükleniciye kat karşılığı inşaat sözleşmesi gereği avans olarak verilmi...
5. Scor

In [None]:
ihtiyati tedbir tazminat