In [None]:
from chonkie import SemanticChunker

### bge-m3

In [None]:
# Build a semantic chunker backed by the BAAI/bge-m3 embedding model.
chunker_settings = {
    "embedding_model": "BAAI/bge-m3",
    "threshold": 0.9,      # similarity threshold: (0-1), (1-100) or "auto"
    "chunk_size": 64,      # maximum tokens per chunk
    "min_sentences": 3,    # initial sentences per chunk
}
semantic_chunker = SemanticChunker(**chunker_settings)

# NOTE(review): `text` is not assigned in this cell or in any cell above it, so it
# has to exist in the kernel already when this cell runs -- confirm where it
# is supposed to come from.
semantic_chunks = semantic_chunker.chunk(text)

# Print a short three-line report for every chunk that was produced.
for piece in semantic_chunks:
    print(f"Chunk text: {piece.text}")
    print(f"Token count: {piece.token_count}")
    print(f"Number of sentences: {len(piece.sentences)}")

In [None]:
import pandas as pd
from chonkie import SemanticChunker

# Load the source CSV.
df = pd.read_csv("/home/yapayzeka/ahsen_bulbul/data/cleaned10chunk.csv")

# Drop missing rawText values *before* casting to str: `astype(str)` turns NaN into
# the literal string "nan", which would otherwise be joined into the corpus below
# and chunked as if it were real text.
texts = df['rawText'].dropna().astype(str).tolist()

# Merge every document into one text, one document per line.
full_text = "\n".join(texts)

# Semantic chunker backed by the BAAI/bge-m3 embedding model.
semantic_chunker = SemanticChunker(
    embedding_model="BAAI/bge-m3",
    threshold=0.5,
    chunk_size=128,
    min_sentences=3
)

# Chunk the merged text in a single pass.
semantic_chunks = semantic_chunker.chunk(full_text)

# One record per chunk; chunk ids are 1-based and follow the chunker's output order.
chunk_data = [
    {
        "chunk_id": i,
        "chunk_text": chunk.text,
        "token_count": chunk.token_count,
        "num_sentences": len(chunk.sentences)
    }
    for i, chunk in enumerate(semantic_chunks, 1)
]

chunk_df = pd.DataFrame(chunk_data)

# Save as CSV; "utf-8-sig" prefixes the file with a UTF-8 byte-order mark.
chunk_df.to_csv("/home/yapayzeka/ahsen_bulbul/model/chonkie/semantic/chunks.csv", index=False, encoding="utf-8-sig")

print("Chunks CSV olarak kaydedildi.")



### LAST

In [None]:
import pandas as pd
from chonkie import SemanticChunker

# Load the source CSV.
df = pd.read_csv("/home/yapayzeka/ahsen_bulbul/data/10data.csv")

# Semantic chunker backed by the BAAI/bge-m3 embedding model.
semantic_chunker = SemanticChunker(
    embedding_model="BAAI/bge-m3",
    threshold=0.8,
    chunk_size=300,
    min_sentences=5
)

# Columns copied unchanged from a source row onto every chunk produced from it.
metadata_columns = [
    "_id",
    "location",
    "extractedDates",
    "esasNo",
    "kararNo",
    "esasNo_num",
    "esasNo_tip",
    "kararNo_num",
    "kararNo_tip",
]

chunk_data = []

for idx, row in df.iterrows():
    # Skip rows without text: str(NaN) is the literal string "nan", which would
    # otherwise be chunked and written out as if it were a real document.
    if pd.isna(row['rawText']):
        continue

    text = str(row['rawText'])
    chunks = semantic_chunker.chunk(text)

    # The metadata is identical for every chunk of this row, so build it once per
    # row instead of once per chunk.
    metadata = {column: row[column] for column in metadata_columns}

    for i, chunk in enumerate(chunks, 1):
        chunk_data.append({
            "chunk_id": f"{row['_id']}_{i}",   # e.g. <_id>_<1-based chunk index>
            "chunk_text": chunk.text,
            "token_count": chunk.token_count,
            "num_sentences": len(chunk.sentences),
            **metadata
        })

chunk_df = pd.DataFrame(chunk_data)

# Save as CSV; "utf-8-sig" prefixes the file with a UTF-8 byte-order mark.
chunk_df.to_csv(
    "/home/yapayzeka/ahsen_bulbul/model/chonkie/semantic/chunks_with_metadata.csv",
    index=False,
    encoding="utf-8-sig"
)

print("Chunks + metadata CSV olarak kaydedildi.")


In [None]:
import pandas as pd
from chonkie import SemanticChunker

# 1. Load the CSV file
df = pd.read_csv('/home/yapayzeka/ahsen_bulbul/data/10data.csv')

# 2. Build the chunker
chunker = SemanticChunker(
    embedding_model="BAAI/bge-m3",
    threshold=0.8,      # similarity threshold
    chunk_size=512,     # max tokens per chunk
    min_sentences=5     # minimum number of sentences
)

# Choose the chunking entry point once: use `chunker.chunk` when the object has it,
# otherwise call the chunker itself. This replaces a bare `except:` that retried
# through `chunker(text)` after *any* failure, which hid the real error and also
# swallowed KeyboardInterrupt.
chunk_fn = getattr(chunker, "chunk", chunker)

# 3. List that collects one record per chunk
chunked_data = []

# 4. Chunk every row that has text
for index, row in df.iterrows():
    text_to_chunk = row['rawText']

    # Rows with a missing rawText produce no chunks.
    if pd.isna(text_to_chunk):
        continue

    chunks = chunk_fn(text_to_chunk)

    # 5. Attach the row metadata to each chunk
    for i, chunk in enumerate(chunks, 1):
        # Use chunk.text when present, otherwise the chunk object itself.
        chunk_text = getattr(chunk, "text", chunk)

        # The sentence count is None when the chunk has no `sentences` attribute
        # or when that attribute is None.
        sentences = getattr(chunk, "sentences", None)

        chunked_data.append({
            "original_id": row["_id"],
            "original_location": row["location"],
            "extractedDates": row["extractedDates"],
            "esasNo": row["esasNo"],
            "kararNo": row["kararNo"],
            "esasNo_num": row["esasNo_num"],
            "esasNo_tip": row["esasNo_tip"],
            "kararNo_num": row["kararNo_num"],
            "kararNo_tip": row["kararNo_tip"],
            "chunk_id": f"{row['_id']}-sem_{i}",
            "chunk_text": chunk_text,
            "token_count": getattr(chunk, "token_count", None),
            "num_sentences": len(sentences) if sentences is not None else None
        })

# 6. Build the DataFrame
chunked_df = pd.DataFrame(chunked_data)

# 7. Save as CSV; "utf-8-sig" prefixes the file with a UTF-8 byte-order mark.
output_file = '/home/yapayzeka/ahsen_bulbul/model/chonkie/chunked_data_semantic_with_metadata.csv'
chunked_df.to_csv(output_file, index=False, encoding="utf-8-sig")

print(f"✅ Metinler chunk’landı ve metadata ile birlikte '{output_file}' dosyasına kaydedildi.")


In [1]:
import pandas as pd
from chonkie import SemanticChunker
import re


def _merge_small_chunks(chunks, token_threshold):
    """Fold every small chunk into the chunk that precedes it.

    A chunk is "small" when its token count is known and below `token_threshold`.
    The first chunk is never merged because it has no predecessor, so it is kept
    even when it is small.

    Args:
        chunks: iterable of chunk objects (or plain strings). `text`,
            `token_count` and `sentences` are read with `getattr`, so missing
            attributes yield the object itself / None / None respectively.
        token_threshold: chunks with fewer tokens than this are merged.

    Returns:
        A list of dicts with the keys "chunk_text", "token_count" and
        "num_sentences". Counts stay None when they are unknown.
    """
    merged = []
    for chunk in chunks:
        chunk_text = getattr(chunk, "text", chunk)
        token_count = getattr(chunk, "token_count", None)
        sentences = getattr(chunk, "sentences", None)
        num_sentences = len(sentences) if sentences is not None else None

        is_small = token_count is not None and token_count < token_threshold
        if merged and is_small:
            prev = merged[-1]
            prev["chunk_text"] += " " + chunk_text
            # Only add to counts that are actually known; `None += int` would
            # raise TypeError. The summed token count is the sum of the parts and
            # does not account for the joining space.
            if prev["token_count"] is not None:
                prev["token_count"] += token_count
            if num_sentences is not None and prev["num_sentences"] is not None:
                prev["num_sentences"] += num_sentences
        else:
            merged.append({
                "chunk_text": chunk_text,
                "token_count": token_count,
                "num_sentences": num_sentences
            })
    return merged


# 1. Load the CSV file
df = pd.read_csv('/home/yapayzeka/ahsen_bulbul/data/10data.csv')

# 2. Build the chunker
chunker = SemanticChunker(
    embedding_model="BAAI/bge-m3",
    threshold=0.8,
    chunk_size=512,
    min_sentences=5
)

# Choose the chunking entry point once: use `chunker.chunk` when the object has it,
# otherwise call the chunker itself. This replaces a bare `except:` that retried
# through `chunker(text)` after *any* failure, which hid the real error and also
# swallowed KeyboardInterrupt.
chunk_fn = getattr(chunker, "chunk", chunker)

# Chunks with fewer tokens than this are merged into the previous chunk.
token_threshold = 100

# Inserts the missing space in "<N>. HukukDairesi" / "<N>.HukukDairesi".
# Compiled once instead of once per chunk.
hukuk_dairesi_re = re.compile(r"(\d+)\. ?HukukDairesi")

# 3. List that collects one record per (merged) chunk
chunked_data = []

# 4. Chunk every row that has text
for index, row in df.iterrows():
    text_to_chunk = row['rawText']

    # Rows with a missing rawText produce no chunks.
    if pd.isna(text_to_chunk):
        continue

    chunks = chunk_fn(text_to_chunk)

    # 4a. Merge small chunks into their predecessor
    merged_chunks = _merge_small_chunks(chunks, token_threshold)

    # The location is the same for every chunk of this row, so normalise it once.
    # A missing location is passed through untouched rather than being turned
    # into the literal string "nan" by str().
    location = row["location"]
    if pd.notna(location):
        location = hukuk_dairesi_re.sub(r"\1. Hukuk Dairesi", str(location))

    # 5. Attach the row metadata and collect the records
    for i, mc in enumerate(merged_chunks, 1):
        chunked_data.append({
            "_id": row["_id"],
            "location": location,
            "extractedDates": row["extractedDates"],
            "esasNo": row["esasNo"],
            "kararNo": row["kararNo"],
            "esasNo_num": row["esasNo_num"],
            "esasNo_tip": row["esasNo_tip"],
            "kararNo_num": row["kararNo_num"],
            "kararNo_tip": row["kararNo_tip"],
            "chunk_id": f"{row['_id']}-sem_{i}",
            "chunk_text": mc["chunk_text"],
            "token_count": mc["token_count"],
            "num_sentences": mc["num_sentences"]
        })

# 6. Build the DataFrame
chunked_df = pd.DataFrame(chunked_data)


# 7. Save as CSV; "utf-8-sig" prefixes the file with a UTF-8 byte-order mark.
output_file = '/home/yapayzeka/ahsen_bulbul/model/chonkie/semantic/2semantic_with_metadata.csv'
chunked_df.to_csv(output_file, index=False, encoding="utf-8-sig")

print(f"✅ Metinler chunk’landı, küçük chunklar merge edildi ve CSV kaydedildi: '{output_file}'")


  from .autonotebook import tqdm as notebook_tqdm


✅ Metinler chunk’landı, küçük chunklar merge edildi ve CSV kaydedildi: '/home/yapayzeka/ahsen_bulbul/model/chonkie/semantic/2semantic_with_metadata.csv'
