In [2]:
import pandas as pd
import re
import json
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
from datetime import datetime
import tiktoken

@dataclass
class ChunkMetadata:
    """Chunk metadata sƒ±nƒ±fƒ±"""
    document_id: str
    location: str
    section_type: str
    section_title: str
    chunk_index: int
    total_chunks: int
    tokens: int
    characters: int
    has_dates: bool
    has_legal_refs: bool
    case_numbers: List[str]
    dates: List[str]

class TurkishLegalChunker:
    """T√ºrk√ße hukuki metinler i√ßin √∂zelle≈ütirilmi≈ü chunker"""
    
    def __init__(self, 
                 target_tokens: int = 500, 
                 max_tokens: int = 800, 
                 min_tokens: int = 100,
                 overlap_ratio: float = 0.1):
        
        self.target_tokens = target_tokens
        self.max_tokens = max_tokens
        self.min_tokens = min_tokens
        self.overlap_ratio = overlap_ratio
        
        # Tokenizer
        self.encoding = tiktoken.get_encoding("cl100k_base")
        
        # Regex patterns
        self.section_patterns = [
            r'([IVX]+\.\s+[A-Z√úƒû≈û√á√ñI√ú][A-Z√úƒû≈û√á√ñI√úa-z√ºƒü≈ü√ß√∂ƒ±i\s]+)',  # Roma rakamlarƒ±
            r'([A-Z]\.\s+[A-Z√úƒû≈û√á√ñI√ú][A-Z√úƒû≈û√á√ñI√úa-z√ºƒü≈ü√ß√∂ƒ±i\s]*)',   # A. B. C. b√∂l√ºmler
            r'(\d+\.\s*[A-Z√úƒû≈û√á√ñI√ú][A-Z√úƒû≈û√á√ñI√úa-z√ºƒü≈ü√ß√∂ƒ±i\s]*)',    # 1. 2. 3. b√∂l√ºmler
        ]
        
        self.date_pattern = r'\d{1,2}\.\d{1,2}\.\d{4}'
        self.case_number_pattern = r'\d{4}/\d+\s+[EK]\.|[EK]\.\s*,\s*\d{4}/\d+\s+[EK]\.'
        
        # T√ºrk√ße karakter d√ºzeltme haritasƒ±
        self.char_fixes = {
            '√Ñ¬∞': 'ƒ∞', '√Ö≈æ': '≈û', '√Ñ≈∏': 'ƒü', '√É¬º': '√º', '√É¬ß': '√ß', 
            '√Ñ¬±': 'ƒ±', '√É¬∂': '√∂', '√É': 'ƒ∞', '√É‚Ä°': '√á', '√É≈ì√É‚Ä°': '√ú√á',
            '√Ç': '', '√¢‚Ç¨': '', '√¢‚Ç¨≈ì': '"', '√¢‚Ç¨': '"'
        }
        
        # Hukuki terimler (b√∂l√ºm tespiti i√ßin)
        self.legal_sections = {
            'DAVA', 'CEVAP', 'MAHKEME', 'KARAR', 'TEMYIZ', 'ISTINAF', 
            'BOZMA', 'GEREK√áE', 'DEƒûERLENDIRME', 'SONU√á'
        }

    def count_tokens(self, text: str) -> int:
        """Token sayƒ±sƒ±nƒ± hesapla"""
        return len(self.encoding.encode(text))
    
    def fix_encoding(self, text: str) -> str:
        """T√ºrk√ße karakter encoding sorunlarƒ±nƒ± d√ºzelt"""
        for wrong, correct in self.char_fixes.items():
            text = text.replace(wrong, correct)
        return text
    
    def preprocess_text(self, text: str) -> str:
        """Metni √∂n i≈ülemden ge√ßir"""
        # Encoding d√ºzelt
        text = self.fix_encoding(text)
        
        # Gereksiz bo≈üluklarƒ± temizle
        text = re.sub(r'\s+', ' ', text)
        
        # Satƒ±r ba≈ülarƒ±ndaki bo≈üluklarƒ± temizle
        text = re.sub(r'^\s+', '', text, flags=re.MULTILINE)
        
        return text.strip()
    
    def extract_metadata_info(self, text: str) -> Dict:
        """Metinden metadata bilgilerini √ßƒ±kar"""
        # Tarihler
        dates = re.findall(self.date_pattern, text)
        
        # Dava numaralarƒ±
        case_numbers = re.findall(self.case_number_pattern, text)
        
        # Hukuki referanslar
        legal_refs = bool(re.search(r'\d+\s+sayƒ±lƒ±|HMK|HUMK|ƒ∞ƒ∞K|TCK', text))
        
        return {
            'dates': list(set(dates)),
            'case_numbers': list(set(case_numbers)),
            'has_legal_refs': legal_refs,
            'has_dates': len(dates) > 0
        }
    
    def detect_sections(self, text: str) -> List[Tuple[str, int, int]]:
        """Metindeki b√∂l√ºmleri tespit et"""
        sections = []
        
        for pattern in self.section_patterns:
            matches = list(re.finditer(pattern, text))
            for match in matches:
                section_title = match.group(1).strip()
                start = match.start()
                # Bir sonraki b√∂l√ºm√º bul
                end = len(text)
                
                # Sonraki e≈üle≈ümeyi bul
                next_match = None
                for next_pattern in self.section_patterns:
                    next_matches = list(re.finditer(next_pattern, text[match.end():]))
                    if next_matches:
                        if next_match is None or next_matches[0].start() < next_match:
                            next_match = next_matches[0].start() + match.end()
                
                if next_match:
                    end = next_match
                
                sections.append((section_title, start, end))
        
        # Sƒ±rala ve √ßakƒ±≈ümalarƒ± temizle
        sections = sorted(sections, key=lambda x: x[1])
        cleaned_sections = []
        
        for i, (title, start, end) in enumerate(sections):
            if i == 0:
                cleaned_sections.append((title, start, end))
            else:
                prev_end = cleaned_sections[-1][2]
                if start >= prev_end:
                    cleaned_sections.append((title, start, end))
                else:
                    # √áakƒ±≈üma varsa √∂nceki b√∂l√ºm√ºn sonunu g√ºncelle
                    cleaned_sections[-1] = (cleaned_sections[-1][0], cleaned_sections[-1][1], start)
                    cleaned_sections.append((title, start, end))
        
        return cleaned_sections
    
    def split_by_sentences(self, text: str, max_tokens: int) -> List[str]:
        """C√ºmleler bazƒ±nda metni b√∂l"""
        # T√ºrk√ße i√ßin c√ºmle sonu i≈üaretleri
        sentence_endings = r'[.!?]+(?=\s+[A-Z√úƒû≈û√á√ñI√ú]|\s*$)'
        sentences = re.split(sentence_endings, text)
        
        chunks = []
        current_chunk = ""
        
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
                
            test_chunk = f"{current_chunk} {sentence}".strip()
            
            if self.count_tokens(test_chunk) > max_tokens and current_chunk:
                chunks.append(current_chunk)
                current_chunk = sentence
            else:
                current_chunk = test_chunk
        
        if current_chunk:
            chunks.append(current_chunk)
        
        return chunks
    
    def create_overlapping_chunks(self, chunks: List[str]) -> List[str]:
        """Chunk'lar arasƒ± √∂rt√º≈üme olu≈ütur"""
        if len(chunks) <= 1:
            return chunks
        
        overlapped_chunks = []
        
        for i, chunk in enumerate(chunks):
            if i == 0:
                # ƒ∞lk chunk
                overlapped_chunks.append(chunk)
            else:
                # √ñnceki chunk'ƒ±n sonundan bir kƒ±smƒ±nƒ± al
                prev_chunk = chunks[i-1]
                overlap_tokens = int(self.count_tokens(prev_chunk) * self.overlap_ratio)
                
                # Son c√ºmleleri al (yakla≈üƒ±k)
                prev_words = prev_chunk.split()
                overlap_words = prev_words[-overlap_tokens*2:] if len(prev_words) > overlap_tokens*2 else prev_words
                overlap_text = " ".join(overlap_words)
                
                overlapped_chunk = f"{overlap_text} {chunk}"
                overlapped_chunks.append(overlapped_chunk)
        
        return overlapped_chunks
    
    def chunk_document(self, document_id: str, location: str, raw_text: str) -> List[Dict]:
        """Ana chunking fonksiyonu"""
        # √ñn i≈ülem
        text = self.preprocess_text(raw_text)
        
        # Metadata bilgilerini √ßƒ±kar
        metadata_info = self.extract_metadata_info(text)
        
        # B√∂l√ºmleri tespit et
        sections = self.detect_sections(text)
        
        all_chunks = []
        
        if not sections:
            # B√∂l√ºm bulunamazsa t√ºm metni chunk'la
            chunks = self.split_by_sentences(text, self.max_tokens)
            chunks = self.create_overlapping_chunks(chunks)
            
            for i, chunk_text in enumerate(chunks):
                if self.count_tokens(chunk_text) >= self.min_tokens:
                    chunk_data = {
                        'text': chunk_text,
                        'metadata': ChunkMetadata(
                            document_id=document_id,
                            location=location,
                            section_type="FULL_DOCUMENT",
                            section_title="Tam Metin",
                            chunk_index=i,
                            total_chunks=len(chunks),
                            tokens=self.count_tokens(chunk_text),
                            characters=len(chunk_text),
                            **metadata_info
                        )
                    }
                    all_chunks.append(chunk_data)
        
        else:
            # B√∂l√ºm bazƒ±nda chunk'la
            for section_title, start, end in sections:
                section_text = text[start:end].strip()
                
                if not section_text:
                    continue
                
                # B√∂l√ºm tipini belirle
                section_type = "OTHER"
                for legal_term in self.legal_sections:
                    if legal_term in section_title.upper():
                        section_type = legal_term
                        break
                
                if self.count_tokens(section_text) <= self.max_tokens:
                    # K√º√ß√ºk b√∂l√ºm, aynen kullan
                    if self.count_tokens(section_text) >= self.min_tokens:
                        chunk_data = {
                            'text': section_text,
                            'metadata': ChunkMetadata(
                                document_id=document_id,
                                location=location,
                                section_type=section_type,
                                section_title=section_title,
                                chunk_index=0,
                                total_chunks=1,
                                tokens=self.count_tokens(section_text),
                                characters=len(section_text),
                                **metadata_info
                            )
                        }
                        all_chunks.append(chunk_data)
                else:
                    # B√ºy√ºk b√∂l√ºm, alt chunk'lara b√∂l
                    section_chunks = self.split_by_sentences(section_text, self.max_tokens)
                    section_chunks = self.create_overlapping_chunks(section_chunks)
                    
                    for i, chunk_text in enumerate(section_chunks):
                        if self.count_tokens(chunk_text) >= self.min_tokens:
                            chunk_data = {
                                'text': chunk_text,
                                'metadata': ChunkMetadata(
                                    document_id=document_id,
                                    location=location,
                                    section_type=section_type,
                                    section_title=section_title,
                                    chunk_index=i,
                                    total_chunks=len(section_chunks),
                                    tokens=self.count_tokens(chunk_text),
                                    characters=len(chunk_text),
                                    **metadata_info
                                )
                            }
                            all_chunks.append(chunk_data)
        
        return all_chunks

def process_legal_csv(csv_path: str, output_path: str, 
                     target_tokens: int = 500, 
                     max_tokens: int = 800) -> None:
    """CSV dosyasƒ±nƒ± i≈üle ve chunk'larƒ± kaydet"""
    
    print(f"üìñ CSV dosyasƒ± okunuyor: {csv_path}")
    df = pd.read_csv(csv_path)
    print(f"‚úÖ {len(df)} belge bulundu")
    
    # Chunker'ƒ± ba≈ülat
    chunker = TurkishLegalChunker(target_tokens=target_tokens, max_tokens=max_tokens)
    
    all_chunks = []
    
    # Progress tracking
    total_docs = len(df)
    
    for idx, row in df.iterrows():
        if idx % 100 == 0:
            print(f"üîÑ ƒ∞≈ülenen: {idx}/{total_docs} (%{idx/total_docs*100:.1f})")
        
        try:
            chunks = chunker.chunk_document(
                document_id=row['_id'],
                location=row['location'],
                raw_text=row['rawText']
            )
            
            # DataFrame i√ßin d√ºzle≈ütir
            for chunk in chunks:
                chunk_row = {
                    'chunk_id': f"{chunk['metadata'].document_id}_{chunk['metadata'].chunk_index}",
                    'document_id': chunk['metadata'].document_id,
                    'location': chunk['metadata'].location,
                    'section_type': chunk['metadata'].section_type,
                    'section_title': chunk['metadata'].section_title,
                    'chunk_index': chunk['metadata'].chunk_index,
                    'total_chunks': chunk['metadata'].total_chunks,
                    'text': chunk['text'],
                    'tokens': chunk['metadata'].tokens,
                    'characters': chunk['metadata'].characters,
                    'has_dates': chunk['metadata'].has_dates,
                    'has_legal_refs': chunk['metadata'].has_legal_refs,
                    'dates': json.dumps(chunk['metadata'].dates),
                    'case_numbers': json.dumps(chunk['metadata'].case_numbers)
                }
                all_chunks.append(chunk_row)
                
        except Exception as e:
            print(f"‚ùå Hata - Belge {row['_id']}: {str(e)}")
            continue
    
    print(f"‚úÖ Toplam {len(all_chunks)} chunk olu≈üturuldu")
    
    # Sonu√ßlarƒ± kaydet
    chunks_df = pd.DataFrame(all_chunks)
    chunks_df.to_csv(output_path, index=False, encoding='utf-8')
    
    # ƒ∞statistikler
    print("\nüìä ƒ∞statistikler:")
    print(f"Ortalama chunk boyutu: {chunks_df['tokens'].mean():.1f} token")
    print(f"Medyan chunk boyutu: {chunks_df['tokens'].median():.1f} token")
    print(f"Min chunk boyutu: {chunks_df['tokens'].min()} token")
    print(f"Max chunk boyutu: {chunks_df['tokens'].max()} token")
    print(f"Tarih i√ßeren chunk'lar: {chunks_df['has_dates'].sum()}")
    print(f"Hukuki referans i√ßeren chunk'lar: {chunks_df['has_legal_refs'].sum()}")
    
    # B√∂l√ºm t√ºrleri
    print(f"\nüè∑Ô∏è B√∂l√ºm t√ºrleri:")
    print(chunks_df['section_type'].value_counts())
    
    print(f"\nüíæ Sonu√ßlar kaydedildi: {output_path}")

# √ñrnek kullanƒ±m
if __name__ == "__main__":
    # CSV'yi i≈üle
    process_legal_csv(
        csv_path='/home/yapayzeka/ahsen_bulbul/data/10data.csv',
        output_path='2legal_chunks.csv',
        target_tokens=500,
        max_tokens=800
    )
    
    print("üéâ ƒ∞≈ülem tamamlandƒ±!")
    
    # Sonu√ßlarƒ± kontrol et
    df = pd.read_csv('2legal_chunks.csv')
    print(f"\nüìã √ñrnek chunk'lar:")
    print(df[['chunk_id', 'section_type', 'tokens', 'text']].head().to_string())

üìñ CSV dosyasƒ± okunuyor: /home/yapayzeka/ahsen_bulbul/data/10data.csv
‚úÖ 20 belge bulundu
üîÑ ƒ∞≈ülenen: 0/20 (%0.0)
‚úÖ Toplam 132 chunk olu≈üturuldu

üìä ƒ∞statistikler:
Ortalama chunk boyutu: 368.5 token
Medyan chunk boyutu: 252.5 token
Min chunk boyutu: 101 token
Max chunk boyutu: 1224 token
Tarih i√ßeren chunk'lar: 132
Hukuki referans i√ßeren chunk'lar: 129

üè∑Ô∏è B√∂l√ºm t√ºrleri:
section_type
DAVA             57
OTHER            38
TEMYIZ           10
KARAR             8
GEREK√áE           7
MAHKEME           7
BOZMA             4
DEƒûERLENDIRME     1
Name: count, dtype: int64

üíæ Sonu√ßlar kaydedildi: 2legal_chunks.csv
üéâ ƒ∞≈ülem tamamlandƒ±!

üìã √ñrnek chunk'lar:
                     chunk_id section_type  tokens                                                                                                                                                                                                                                                               

"qdrant_api_key": "kMy0juEwUcsLjKDjWTPUAWTYlYpR3kjh"
"qdrant_client": "https://qdrant.adalet.gov.tr:443"