In [None]:
import pandas as pd
import tiktoken
import semchunk
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, HnswConfigDiff
from qdrant_client. models import  Prefetch, FusionQuery, Fusion, SparseVector
from FlagEmbedding import BGEM3FlagModel
import numpy as np
import uuid
from typing import List, Dict, Optional, Tuple
import os
from qdrant_client import models
from dataclasses import dataclass
import json
from dotenv import load_dotenv
import torch
from sentence_transformers import SentenceTransformer
from qdrant_client.http.models import NamedVector, NamedSparseVector, SparseVector, SearchRequest
from fastembed import SparseTextEmbedding, SparseEmbedding
from config import Config
from config import Models, model
# Load environment variables from the project's .env file and print the value
# returned by load_dotenv.
# NOTE(review): the path is absolute and machine-specific, so this cell will not
# find the file on another machine — consider a relative or configurable path.
print(load_dotenv("/home/yapayzeka/ahsen_bulbul/qdrant/.env"))

In [None]:
class ModelManager:
    """Load the embedding model selected from config.py and encode texts with it.

    Two model types are supported:

    * ``"bge"`` - a ``BGEM3FlagModel`` that yields the dense and the sparse
      (lexical-weight) representation from a single ``encode`` call.
    * ``"sentence_transformer"`` - a ``SentenceTransformer`` for dense vectors
      plus a TF-IDF vectorizer for sparse vectors.
    """

    def __init__(self, selected_model: str, runtime_config: "Config"):
        """Remember the selection; the model itself is loaded by ``load_model``.

        Args:
            selected_model: Attribute name of the model entry on ``config.model``.
            runtime_config: Runtime settings; ``EMBEDDING_DIM`` is read from it.
        """
        self.selected_model = selected_model
        self.runtime_config = runtime_config
        self.model_config = getattr(model, selected_model)  # model entry from config.py
        self.model = None
        self.vectorizer = None
        # Tracks whether the TF-IDF vocabulary has been fitted (see _tfidf_sparse).
        self._vectorizer_fitted = False

    def _setting(self, name: str):
        """Return setting ``name`` from the model entry, else from the runtime config.

        NOTE(review): the original code read USE_FP16 / DEVICE only from the model
        entry; the fallback covers the case where config.py defines them on the
        runtime Config instead - confirm against config.py.

        Raises:
            AttributeError: If neither object defines ``name``.
        """
        for source in (self.model_config, self.runtime_config):
            if hasattr(source, name):
                return getattr(source, name)
        raise AttributeError(
            f"{name!r} is defined neither on the model entry nor on the runtime config"
        )

    def load_model(self):
        """Instantiate the configured model (and TF-IDF vectorizer if needed).

        Returns:
            True once the model has been loaded.

        Raises:
            ValueError: If the configured ``model_type`` is not supported.
        """
        model_name = self.model_config.model_name
        model_type = self.model_config.model_type
        print(f"🔮 Model yükleniyor: {model_name} ({model_type})")

        if model_type == "bge":
            self.model = BGEM3FlagModel(
                model_name,
                use_fp16=self._setting("USE_FP16"),
                device=self._setting("DEVICE"),
            )
        elif model_type == "sentence_transformer":
            # TfidfVectorizer was referenced without ever being imported; import
            # it where it is needed so this branch no longer raises NameError.
            from sklearn.feature_extraction.text import TfidfVectorizer

            self.model = SentenceTransformer(model_name, device=self._setting("DEVICE"))
            # TF-IDF sparse embedding.
            # NOTE(review): stop_words='english' is kept from the original even
            # though the corpus appears to be Turkish - confirm this is intended.
            self.vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
            self._vectorizer_fitted = False
        else:
            raise ValueError(f"Desteklenmeyen model tipi: {model_type}")

        print(f"✅ Model yüklendi: {model_name}")
        return True

    def fit_sparse_vocabulary(self, texts: List[str]) -> None:
        """Fit the TF-IDF vocabulary on ``texts`` (ideally the whole corpus).

        Does nothing when no vectorizer is in use or ``texts`` is empty.
        """
        if self.vectorizer is not None and texts:
            self.vectorizer.fit(texts)
            self._vectorizer_fitted = True

    @staticmethod
    def _to_sparse_entry(weights) -> Dict:
        """Convert one sparse record into ``{"indices": [...], "values": [...]}``.

        Accepts either a record already in that shape or a ``{token_id: weight}``
        mapping (the per-text format of BGE-M3 ``lexical_weights`` according to
        the FlagEmbedding documentation, where token ids are string keys).
        """
        if not weights:
            return {"indices": [], "values": []}
        if "indices" in weights and "values" in weights:
            return {"indices": list(weights["indices"]), "values": list(weights["values"])}
        return {
            "indices": [int(token_id) for token_id in weights],
            "values": [float(weight) for weight in weights.values()],
        }

    @staticmethod
    def _fit_dimension(vec, target_dim: int) -> List[float]:
        """Return ``vec`` as a list of floats with exactly ``target_dim`` entries.

        ``None`` becomes a zero vector, short vectors are zero-padded and long
        vectors are truncated. Going through NumPy makes this work for both
        plain lists and array rows (``array + list`` is not concatenation).
        """
        if vec is None:
            return [0.0] * target_dim
        values = np.asarray(vec, dtype=float).ravel().tolist()
        if len(values) < target_dim:
            values.extend([0.0] * (target_dim - len(values)))
        return values[:target_dim]

    def _tfidf_sparse(self, texts: List[str]) -> List[Dict]:
        """Build TF-IDF sparse records for ``texts``.

        The vocabulary is fitted once (on the first call, unless
        ``fit_sparse_vocabulary`` ran earlier) and reused afterwards. Re-fitting
        on every batch would give the same index a different meaning per batch.
        """
        if getattr(self, "_vectorizer_fitted", False):
            matrix = self.vectorizer.transform(texts)
        else:
            matrix = self.vectorizer.fit_transform(texts)
            self._vectorizer_fitted = True

        records = []
        for i in range(matrix.shape[0]):
            row = matrix[i].tocoo()
            records.append({
                "indices": row.col.tolist(),
                "values": row.data.tolist()
            })
        return records

    def encode_texts(self, texts: List[str]) -> Tuple[List[List[float]], List[Dict]]:
        """Encode ``texts`` into dense and sparse vectors.

        Args:
            texts: Texts to embed.

        Returns:
            ``(dense, sparse)``: ``dense`` holds one list of floats per text,
            padded/truncated to ``runtime_config.EMBEDDING_DIM``; ``sparse``
            holds one ``{"indices", "values"}`` dict per text (empty lists when
            no sparse representation is available).

        Raises:
            RuntimeError: If ``load_model`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("Model yüklenmedi: önce load_model() çağrılmalı")

        texts = list(texts)
        if not texts:
            return [], []

        if self.model_config.model_type == "bge" and hasattr(self.model, 'encode'):
            result = self.model.encode(texts, return_dense=True, return_sparse=True)
            dense_raw = result.get("dense_vecs")
            # BGE-M3 returns its sparse output under "lexical_weights"; the
            # "sparse_vecs" key read by the original is kept only as a fallback.
            sparse_raw = result.get("lexical_weights")
            if sparse_raw is None:
                sparse_raw = result.get("sparse_vecs")
            sparse_embeddings = [self._to_sparse_entry(w) for w in (sparse_raw or [])]
        else:
            dense_raw = self.model.encode(texts, convert_to_numpy=True)
            if self.vectorizer is not None:
                sparse_embeddings = self._tfidf_sparse(texts)
            else:
                sparse_embeddings = []

        # Guarantee exactly one sparse record per input text.
        if len(sparse_embeddings) != len(texts):
            sparse_embeddings = [self._to_sparse_entry(None) for _ in texts]

        if dense_raw is None:
            dense_raw = []

        # Align every dense vector with the dimension set in the runtime config.
        target_dim = self.runtime_config.EMBEDDING_DIM
        dense_clean = [self._fit_dimension(vec, target_dim) for vec in dense_raw]

        return dense_clean, sparse_embeddings

    def get_model_info(self) -> Dict:
        """Return the configured model's metadata and whether it is loaded."""
        return {
            "model_name": self.model_config.model_name,
            "model_type": self.model_config.model_type,
            "embedding_dim": self.model_config.embedding_dim,
            "description": self.model_config.description,
            "loaded": self.model is not None
        }

In [None]:
class YargitaySemanticProcessor:
    """Chunk decision texts from a CSV, embed the chunks and upload them to Qdrant.

    All settings come from ``runtime_config`` (TOKEN_SIZE, QDRANT_URL,
    COLLECTION_NAME, EMBEDDING_DIM, BATCH_SIZE); embeddings are produced by a
    ``ModelManager`` built for ``selected_model``.
    """

    def __init__(self, runtime_config: "Config", selected_model: str):
        """Load the embedding model, build the chunker and connect to Qdrant."""
        self.runtime_config = runtime_config
        self.model_manager = ModelManager(selected_model, runtime_config)
        self.model_manager.load_model()

        self.encoding = tiktoken.get_encoding("cl100k_base")
        self.chunker = semchunk.chunkerify(self.encoding, runtime_config.TOKEN_SIZE)

        self.qdrant_client = QdrantClient(url=runtime_config.QDRANT_URL)

    def create_qdrant_collection(self, recreate: bool = True):
        """Create the collection with a named dense and a named sparse vector.

        Args:
            recreate: When True, try to delete an existing collection first.

        Raises:
            Exception: Whatever the Qdrant client raises while listing or
                creating collections is reported and re-raised.
        """
        # Use the instance's config (not the global Config class) so this agrees
        # with upload_to_qdrant, which writes to runtime_config.COLLECTION_NAME.
        collection_name = self.runtime_config.COLLECTION_NAME
        if recreate:
            try:
                self.qdrant_client.delete_collection(collection_name)
                print(f"🗑️ Eski koleksiyon silindi: {collection_name}")
            except Exception as e:
                # Deleting is best-effort, but report the failure instead of
                # silently swallowing it.
                print(f"⚠️ Eski koleksiyon silinemedi: {collection_name} ({e})")

        try:
            existing = [c.name for c in self.qdrant_client.get_collections().collections]
            if collection_name not in existing:
                # Dense vector has a fixed size; the sparse vector is configured
                # without a size.
                vectors_config = {
                    "dense_vec": models.VectorParams(
                        size=self.runtime_config.EMBEDDING_DIM,
                        distance=models.Distance.COSINE,
                    ),
                }
                sparse_config = {
                    "sparse_vec": models.SparseVectorParams(
                        index=models.SparseIndexParams(on_disk=False))
                }
                self.qdrant_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=vectors_config,
                    sparse_vectors_config=sparse_config
                )
                print(f"✅ Koleksiyon oluşturuldu: {collection_name} (Dense+Sparse)")
            else:
                print(f"ℹ️ Koleksiyon zaten var: {collection_name}")
        except Exception as e:
            print(f"❌ Koleksiyon oluşturma hatası: {e}")
            raise

    def semantic_chunk_text(self, text: str, metadata: dict = None) -> List[Dict]:
        """Split ``text`` with the chunker and describe every non-blank chunk.

        Args:
            text: Text to split; empty or whitespace-only text yields ``[]``.
            metadata: Optional extra fields merged into every chunk dict.

        Returns:
            One dict per non-blank chunk with ``chunk_id`` (position in the
            chunker output), stripped ``text``, ``token_count`` and
            ``char_count`` (both measured on the unstripped chunk), plus the
            metadata fields.
        """
        if not text or not text.strip():
            return []
        chunks = self.chunker(text)
        result = []
        for i, c in enumerate(chunks):
            if c.strip():
                cd = {
                    'chunk_id': i,
                    'text': c.strip(),
                    'token_count': len(self.encoding.encode(c)),
                    'char_count': len(c)
                }
                if metadata:
                    cd.update(metadata)
                result.append(cd)
        return result

    @staticmethod
    def _payload_value(value):
        """Make a pandas cell value safe to store in a Qdrant payload.

        NumPy scalars become plain Python scalars and NaN (a missing CSV cell)
        becomes ``''``, the same default used for a missing column.
        """
        if isinstance(value, np.generic):
            value = value.item()
        if isinstance(value, float) and value != value:  # NaN is the only float != itself
            return ''
        return value

    def process_csv_file(self, csv_path: str) -> List[Dict]:
        """Read the CSV at ``csv_path`` and chunk the main text of every row.

        The text column is the first of rawText, chunk_text, text, content,
        metin that exists in the file. Rows with empty or missing text are
        skipped.

        Returns:
            All chunk dicts (see ``semantic_chunk_text``), each carrying the
            row's metadata; ``[]`` when no text column is found.
        """
        df = pd.read_csv(csv_path)
        text_column = next((c for c in ['rawText','chunk_text','text','content','metin'] if c in df.columns), None)
        if not text_column:
            print("❌ Ana metin sütunu bulunamadı")
            return []

        all_chunks = []
        for idx, row in df.iterrows():
            text = row.get(text_column, '')
            if not text or pd.isna(text):
                continue
            meta = {
                'original_index': self._payload_value(idx),
                'esas_no': self._payload_value(row.get('esasNo', '')),
                'karar_no': self._payload_value(row.get('kararNo', '')),
                'daire': self._payload_value(row.get('location', '')),
                'tarih': self._payload_value(row.get('extractedDates', '')),
                'document_id': self._payload_value(row.get('_id', '')),
            }
            all_chunks.extend(self.semantic_chunk_text(str(text), meta))

        return all_chunks

    def create_embeddings(self, texts: List[str], batch_size: int = None):
        """Embed ``texts`` batch by batch through the model manager.

        Args:
            texts: Texts to embed.
            batch_size: Texts per batch; defaults to ``runtime_config.BATCH_SIZE``.

        Returns:
            ``(dense, sparse)``: a list of float lists and a list of
            ``{"indices", "values"}`` dicts, in the order of ``texts``.
        """
        batch_size = batch_size or self.runtime_config.BATCH_SIZE
        all_dense, all_sparse = [], []
        total = len(texts)
        for start in range(0, total, batch_size):
            batch_texts = texts[start:start + batch_size]
            # The model manager owns the model and the dimension handling; the
            # original called self.bge_model / self.config, which do not exist.
            dense, sparse = self.model_manager.encode_texts(batch_texts)
            # Force plain Python floats so array rows never reach the client.
            all_dense.extend([float(x) for x in vec] for vec in dense)
            all_sparse.extend(sparse)

            print(f"  📊 Batch işlendi: {start + len(batch_texts)}/{total}")

        return all_dense, all_sparse

    def upload_to_qdrant(self, chunks: List[Dict]):
        """Embed ``chunks`` and upsert them into the configured collection.

        Every chunk becomes one point with a random UUID, its dense vector, its
        sparse vector (only when it has at least one index) and the chunk dict
        as payload. Points are upserted in groups of ``BATCH_SIZE``.
        """
        if not chunks:
            print("ℹ️ Yüklenecek chunk yok")
            return

        points = []
        dense_embeddings, sparse_embeddings = self.create_embeddings([c['text'] for c in chunks])
        for c, d, s in zip(chunks, dense_embeddings, sparse_embeddings):
            vector_dict = {"dense_vec": d}
            if s["indices"]:
                vector_dict["sparse_vec"] = SparseVector(indices=s["indices"], values=s["values"])
            points.append(PointStruct(id=str(uuid.uuid4()), vector=vector_dict, payload=c))

        batch = self.runtime_config.BATCH_SIZE
        for i in range(0, len(points), batch):
            self.qdrant_client.upsert(collection_name=self.runtime_config.COLLECTION_NAME, points=points[i:i+batch])


In [None]:
def select_model() -> str:
    """List the models defined on the config ``model`` object and ask for one.

    Returns:
        The attribute name typed by the user, or ``"bge_m3"`` when the answer is
        empty or is not one of the listed names.
    """
    fallback = "bge_m3"

    print("🤖 Model Seçimi:")
    available = vars(model)
    for key in available:
        entry = getattr(model, key)
        print(f"{key}: {entry.description} (Dim: {entry.embedding_dim})")

    answer = input("Model seçin (default bge_m3): ").strip()
    # An empty answer and an unknown name both resolve to the default.
    if not answer or answer not in available:
        return fallback
    return answer


In [None]:
# Ask interactively which embedding model to use; the bare name on the last line
# displays the chosen key as the cell output.
# NOTE(review): this cell blocks on input(), so an unattended "Run All" will wait here.
selected_model = select_model()
selected_model

In [None]:
# Look up the chosen entry on the `model` object imported from config and display it.
selected_config = getattr(model, selected_model)
selected_config

In [None]:
# Build the processor: its constructor loads the embedding model, creates the
# chunker and opens the Qdrant client. The Config class itself is passed as the
# runtime configuration.
processor = YargitaySemanticProcessor(Config, selected_model)
# recreate=True attempts to delete an existing collection before creating it.
processor.create_qdrant_collection(recreate=True)

In [None]:
# Chunk the CSV named by Config.CSV_FILE, then embed the chunks and upsert them
# into Qdrant.
chunks = processor.process_csv_file(Config.CSV_FILE)
processor.upload_to_qdrant(chunks)
# Printed unconditionally once the two calls above return.
print("✅ Pipeline tamamlandı!")
