In [47]:
import os 
from openai import OpenAI
from dotenv import load_dotenv
import json

load_dotenv(override=True)

api_key = os.getenv("DEEPSEEK_API_KEY")
model_name = "deepseek-chat"

def get_completion(messages, model_name=model_name):
    client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0.3, 
        max_tokens=8192, 
    )
    return response.choices[0].message.content

def get_completion_json_output(messages, model_name=model_name):
    client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0.3, 
        max_tokens=8192, 
        response_format={"type": "json_object"}
    )
    return json.loads(response.choices[0].message.content)

def get_completion_openai(messages):
    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0.3, 
        max_tokens=8192, 
    )
    return response.choices[0].message.content

In [31]:
from typing import Dict
import os 

class MarkdownParser:
    def __init__(self, max_header_level: int = 2):
        """
        Args:
            max_header_level: 별도 섹션으로 분리할 최대 헤더 레벨 (기본값: 2, ## 까지만 분리)
        """
        self.max_header_level = max_header_level
    

    def parse_markdown(self, markdown_text: str) -> Dict[str, str]:
        """마크다운 텍스트를 파싱하여 섹션별로 분리"""
        sections = {}
        current_content = []
        current_title = None
        
        for line in markdown_text.split('\n'):
            if line.startswith('#') and ' ' in line:
                # 헤더 레벨 확인
                level = len(line) - len(line.lstrip('#'))
                
                # max_header_level 이하의 헤더만 새로운 섹션으로 처리
                if level <= self.max_header_level:
                    # 이전 섹션 저장
                    if current_title and current_content:
                        sections[current_title] = '\n'.join(current_content).strip()
                    
                    # 새로운 섹션 시작
                    current_title = line.lstrip('#').strip()
                    current_content = []
                else:
                    # 상위 레벨 헤더는 내용에 포함
                    current_content.append(line)
            else:
                if current_title is None:
                    continue  # 첫 헤더 이전의 내용은 무시
                current_content.append(line)
        
        # 마지막 섹션 저장
        if current_title and current_content:
            sections[current_title] = '\n'.join(current_content).strip()
        
        return sections
    
    def save_markdown(self, file_name: str, markdown_text: str, output_dir: str = "markdown_input") -> None:
        """
        마크다운 텍스트를 파일로 저장
        """
        with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8') as f:
            f.write(markdown_text)
        print(f"Markdown saved to: {os.path.join(output_dir, file_name)}")

In [30]:
with open("input_files/Curating Custom Datasets for LLM Training with NVIDIA NeMo Curator.md", "r", encoding="utf-8") as file:
    markdown_text = file.read()
    markdown_parser = MarkdownParser()
    sections = markdown_parser.parse_markdown(markdown_text)

    for title, content in sections.items():
        print(f"Title: {title}\n")
        print(f"Content: {content}\n")
        print("-" * 80)


Title: Curating Custom Datasets for LLM Training with NVIDIA NeMo Curator

Content: Data curation is the first, and arguably the most important, step in the pretraining and continuous training of [large language models (LLMs)](https://www.nvidia.com/en-us/glossary/large-language-models/) and small language models (SLMs). NVIDIA recently announced the open-source release of [NVIDIA NeMo Curator](https://developer.nvidia.com/blog/scale-and-curate-high-quality-datasets-for-llm-training-with-nemo-curator/), a data curation framework that prepares large-scale, high-quality datasets for pretraining generative AI models. 

NeMo Curator, which is part of [NVIDIA NeMo](https://www.nvidia.com/en-us/ai-data-science/products/nemo/), offers workflows to download and curate data from various public sources out of the box such as Common Crawl, Wikipedia, and arXiv. It also provides flexibility for developers to customize data curation pipelines to address their unique requirements and create custom d

In [40]:
import time
from tqdm import tqdm
from typing import Dict

class TextExplainer:
    def __init__(self):
        self.conversation_history = []
        self.delay = 1
        
    def explain_section(self, section_title: str, section_content: str, is_first: bool = False) -> str:
        """
        섹션의 내용을 설명하는 함수. Deepseek API 실패시 OpenAI API로 fallback
        
        Args:
            section_title: 섹션 제목
            section_content: 섹션 내용
            is_first: 첫 번째 섹션인지 여부
            
        Returns:
            str: 섹션에 대한 설명
        """
        if is_first:
            prompt = f"""다음 섹션 '{section_title}'의 내용을 명확하고 자세하게 설명해주세요.
            
            섹션 내용:
            {section_content}"""
        else:
            prompt = f"""이전 섹션에 했던 설명들을 추가 맥락으로 참고해서 다음 섹션 '{section_title}'의 내용을 설명해주세요.
            
            섹션 내용:
            {section_content}"""
        
        # 이전 대화 내용을 포함하여 컨텍스트 유지
        self.conversation_history.append({"role": "user", "content": prompt})
        
        # Deepseek API 첫 시도
        try:
            print("Attempting Deepseek API...")
            response = get_completion(self.conversation_history, model_name="deepseek-reasoner")
            
            if not response:
                raise ValueError("Empty response from Deepseek API")
                
            # 대화 히스토리 업데이트
            self.conversation_history.append({"role": "assistant", "content": response})
            return response
            
        except Exception as e:
            print(f"Deepseek API attempt failed: {str(e)}")
            print(f"Retrying Deepseek API in {self.delay} seconds...")
            time.sleep(self.delay)
            
            # Deepseek API 재시도
            try:
                print("Retrying Deepseek API...")
                response = get_completion(self.conversation_history, model_name="deepseek-reasoner")
                
                if not response:
                    raise ValueError("Empty response from Deepseek API")
                    
                # 대화 히스토리 업데이트
                self.conversation_history.append({"role": "assistant", "content": response})
                return response
                
            except Exception as retry_error:
                print(f"Deepseek API retry also failed: {str(retry_error)}")
                print("Falling back to OpenAI API...")
                
                # OpenAI API로 fallback
                try:
                    response = get_completion_openai(self.conversation_history)
                    if not response:
                        raise ValueError("Empty response from OpenAI API")
                        
                    # 대화 히스토리 업데이트
                    self.conversation_history.append({"role": "assistant", "content": response})
                    return response
                    
                except Exception as openai_error:
                    print(f"OpenAI API fallback also failed: {str(openai_error)}")
                    return f"Error: Failed to explain section {section_title} with both APIs"
                
    def explain_text(self, sections: Dict[str, str]) -> Dict[str, str]:
        """
        텍스트의 각 섹션을 순차적으로 설명
        
        Args:
            sections: 섹션 제목과 내용을 매핑한 딕셔너리
            
        Returns:
            섹션 제목과 설명을 매핑한 딕셔너리
        """
        explanations = {}
        
        print("\nProcessing sections:")
        for i, (title, content) in tqdm(enumerate(sections.items()), desc="Explaining sections"):
            print(f"\nProcessing: {title}")
            explanation = self.explain_section(title, content, is_first=(i==0))
            explanations[title] = explanation
            
        return explanations
    
    def get_conversation_history(self):
        """대화 히스토리 반환"""
        return self.conversation_history
    
    def save_explanations(self, explanations: Dict[str, str], file_name: str, output_dir: str = "output_files") -> str:
        """설명을 파일로 저장"""
        with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8') as f:
            for title, explanation in explanations.items():
                f.write(f"## {title}\n\n{explanation}")
                f.write("\n\n---\n")
        print(f"Explanations saved to: {os.path.join(output_dir, file_name)}")
        
        return os.path.join(output_dir, file_name)

In [37]:
from typing import Optional, List, Dict
import time

class TextQA:
    def __init__(self, context: Optional[List[Dict[str, str]]] = None):
        self.conversation_history = context or []
        self.delay = 1
        
    def ask_question(self, question: str) -> str:
        """텍스트에 대한 질문에 답변"""
        try:
            prompt = f"""Based on the text we discussed, please answer the following question in Korean. 
            Be specific and cite relevant sections when possible.

            Question: {question}"""
            
            self.conversation_history.append({"role": "user", "content": prompt})
            response = get_completion(self.conversation_history, model_name="deepseek-reasoner")
            self.conversation_history.append({"role": "assistant", "content": response})
            
            time.sleep(self.delay)
            return response
            
        except Exception as e:
            print(f"Error processing question: {str(e)}")
            return f"Error: Failed to process question"
    
    def view_conversation_history(self, start_idx: int = 0, end_idx: Optional[int] = None) -> None:
        """대화 내역을 출력
        
        Args:
            start_idx: 시작 인덱스 (기본값: 0)
            end_idx: 종료 인덱스 (기본값: None, None일 경우 끝까지 출력)
        """
        conversations = [
            msg for msg in self.conversation_history 
            if not msg["content"].startswith("Here's the text summary:")
        ]
        
        end_idx = end_idx if end_idx is not None else len(conversations)
        
        print("\n=== 대화 내역 ===\n")
        for i, msg in enumerate(conversations[start_idx:end_idx], start=start_idx):
            role = msg["role"].upper()
            if role == "ASSISTANT":
                print(f"\n🤖 Assistant ({i}):\n{msg['content']}\n")
                print("-" * 80)
            elif role == "USER":
                print(f"\n👤 User ({i}):\n{msg['content']}\n")
                print("-" * 80)
    
    def get_last_n_conversations(self, n: int = 1) -> None:
        """최근 n개의 대화 내역을 출력
        
        Args:
            n: 출력할 최근 대화 개수 (기본값: 1)
        """
        conversations = [
            msg for msg in self.conversation_history 
            if not msg["content"].startswith("Here's the text summary:")
        ]
        start_idx = max(0, len(conversations) - n)
        self.view_conversation_history(start_idx)
        
    def get_conversation_history(self) -> List[Dict[str, str]]:
        """전체 대화 기록 반환"""
        return self.conversation_history

In [32]:
from rich.console import Console
from rich.markdown import Markdown
from rich.panel import Panel
from rich.syntax import Syntax
from rich.table import Table
from typing import Dict
import os

class MarkdownPrinter:
    def __init__(self):
        self.console = Console()
        
    def print_markdown_file(self, file_path: str):
        """마크다운 파일을 이쁘게 출력"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                markdown_content = f.read()
            
            # 마크다운 렌더링
            md = Markdown(markdown_content)
            
            # 마크다운 내용 출력
            self.console.print(md)
            
        except Exception as e:
            self.console.print(f"[bold red]Error reading file: {str(e)}[/]")
            
    def print_sections(self, sections: Dict[str, str]):
        """섹션별로 구분하여 출력"""
        for section, content in sections.items():
            # 섹션 제목
            self.console.print("\n")
            self.console.print(Panel(
                f"[bold cyan]{section}[/]",
                border_style="cyan"
            ))
            
            # 섹션 내용
            md = Markdown(content)
            self.console.print(md)
            
            # 구분선
            self.console.print("[dim]" + "="*80 + "[/]")

In [44]:

def process_text(file_path: str) -> str:
    with open(file_path, "r", encoding="utf-8") as file:
        file_name = os.path.basename(file_path)
        markdown_text = file.read()
        markdown_parser = MarkdownParser()
        sections = markdown_parser.parse_markdown(markdown_text)
        
        explainer = TextExplainer()
        explanations = explainer.explain_text(sections)
        explanation_path = explainer.save_explanations(explanations, file_name)
        
        qa = TextQA(context=explainer.get_conversation_history())
    
    return explanation_path, qa

In [45]:
explanation_path, qa = process_text("input_files/Curating Custom Datasets for LLM Training with NVIDIA NeMo Curator.md")

markdown_printer = MarkdownPrinter()
markdown_printer.print_markdown_file(explanation_path)


Processing sections:


Explaining sections: 0it [00:00, ?it/s]


Processing: Curating Custom Datasets for LLM Training with NVIDIA NeMo Curator
Attempting Deepseek API...


Explaining sections: 1it [01:35, 95.44s/it]


Processing: Overview[](#overview)
Attempting Deepseek API...
Deepseek API attempt failed: Expecting value: line 1 column 1 (char 0)
Retrying Deepseek API in 1 seconds...
Retrying Deepseek API...
Deepseek API retry also failed: Expecting value: line 1 column 1 (char 0)
Falling back to OpenAI API...


Explaining sections: 2it [03:37, 111.03s/it]

OpenAI API fallback also failed: Error code: 400 - {'error': {'message': 'Model Not Exist', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}

Processing: Prerequisite[](#prerequisite)
Attempting Deepseek API...
Deepseek API attempt failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Retrying Deepseek API in 1 seconds...
Retrying Deepseek API...
Deepseek API retry also failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error

Explaining sections: 3it [03:38, 61.05s/it] 

OpenAI API fallback also failed: Error code: 400 - {'error': {'message': 'Model Not Exist', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}

Processing: Defining custom document builders[](#defining_custom_document_builders)
Attempting Deepseek API...
Deepseek API attempt failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Retrying Deepseek API in 1 seconds...
Retrying Deepseek API...
Deepseek API retry also failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'pa

Explaining sections: 4it [03:40, 37.53s/it]

OpenAI API fallback also failed: Error code: 400 - {'error': {'message': 'Model Not Exist', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}

Processing: Downloading the TinyStories dataset[](#downloading_the_tinystories_dataset)
Attempting Deepseek API...
Deepseek API attempt failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Retrying Deepseek API in 1 seconds...
Retrying Deepseek API...
Deepseek API retry also failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error',

Explaining sections: 5it [03:42, 24.57s/it]

OpenAI API fallback also failed: Error code: 400 - {'error': {'message': 'Model Not Exist', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}

Processing: Text cleaning and unification[](#text_cleaning_and_unification)
Attempting Deepseek API...
Deepseek API attempt failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Retrying Deepseek API in 1 seconds...
Retrying Deepseek API...


Explaining sections: 6it [03:43, 16.83s/it]

Deepseek API retry also failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Falling back to OpenAI API...
OpenAI API fallback also failed: Error code: 400 - {'error': {'message': 'Model Not Exist', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}

Processing: Dataset filtering[](#dataset_filtering)
Attempting Deepseek API...
Deepseek API attempt failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Retrying Deepseek A

Explaining sections: 7it [03:45, 11.93s/it]

Deepseek API retry also failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Falling back to OpenAI API...
OpenAI API fallback also failed: Error code: 400 - {'error': {'message': 'Model Not Exist', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}

Processing: Deduplication[](#deduplication)
Attempting Deepseek API...
Deepseek API attempt failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Retrying Deepseek API in 1 

Explaining sections: 8it [03:47,  8.67s/it]

OpenAI API fallback also failed: Error code: 400 - {'error': {'message': 'Model Not Exist', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}

Processing: PII redaction[](#pii_redaction)
Attempting Deepseek API...
Deepseek API attempt failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Retrying Deepseek API in 1 seconds...
Retrying Deepseek API...
Deepseek API retry also failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_err

Explaining sections: 9it [03:48,  6.47s/it]

OpenAI API fallback also failed: Error code: 400 - {'error': {'message': 'Model Not Exist', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}

Processing: Putting the curation pipeline together[](#putting_the_curation_pipeline_together)
Attempting Deepseek API...
Deepseek API attempt failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Retrying Deepseek API in 1 seconds...
Retrying Deepseek API...
Deepseek API retry also failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_e

Explaining sections: 10it [03:50,  5.08s/it]

OpenAI API fallback also failed: Error code: 400 - {'error': {'message': 'Model Not Exist', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}

Processing: Next steps[](#next_steps)
Attempting Deepseek API...
Deepseek API attempt failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Retrying Deepseek API in 1 seconds...
Retrying Deepseek API...
Deepseek API retry also failed: Error code: 400 - {'error': {'message': 'deepseek-reasoner does not support successive user or assistant messages (messages[2] and messages[3] in your input). You should interleave the user/assistant messages in the message sequence.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}


Explaining sections: 11it [03:52, 21.16s/it]

OpenAI API fallback also failed: Error code: 400 - {'error': {'message': 'Model Not Exist', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_request_error'}}
Explanations saved to: output_files/Curating Custom Datasets for LLM Training with NVIDIA NeMo Curator.md





In [22]:
question = """ 
이제 다른 질문이야. 

글의 내용을 보면 분류기(Classifier) 또는 휴리스틱 기반 필터를 사용해 고품질 텍스트만 선별한다고 하고 참고 문서는 다음과 같다고 말했어. 

Scaling Language Models: Methods, Analysis & Insights from Training Gopher

이 논문에 대해 안다면 어떤 매커니즘으로 고품질 데이터를 필터링하는건지 설명해줘.
"""

response = qa.ask_question(question)
print(response)

**"Scaling Language Models: Methods, Analysis & Insights from Training Gopher"** 논문에서 설명된 **고품질 데이터 필터링 메커니즘**에 대해 설명드리겠습니다. 이 논문은 대규모 언어 모델 훈련을 위한 데이터 품질 관리 방법을 다루며, NeMo Data Curator에서도 유사한 접근 방식을 사용합니다([관련 섹션](#document-level_quality_filtering)).

---

### **1. 고품질 데이터 필터링의 필요성**
- **문제점**:  
  웹 크롤링 데이터(예: Common Crawl)에는 **저품질 콘텐츠**(예: 반복 문자열, 무의미한 텍스트, 상용구)가 다량 포함되어 있습니다.  
  - 이러한 데이터는 모델의 **일반화 능력**과 **다운스트림 태스크 성능**을 저하시킬 수 있습니다.  

- **해결책**:  
  - **분류기(Classifier)**와 **휴리스틱 기반 필터**를 사용해 고품질 텍스트만 선별합니다.  

---

### **2. 필터링 메커니즘**
#### **(1) 휴리스틱 기반 필터**
- **목적**:  
  간단한 규칙을 사용해 저품질 텍스트를 빠르게 식별합니다.  
- **주요 규칙**:  
  - **문장 길이**: 너무 짧거나 긴 문장을 제거합니다.  
  - **특수 문자 비율**: 특수 문자(예: URL, 기호)가 과도하게 포함된 텍스트를 제거합니다.  
  - **반복 문자열**: 동일한 단어나 구가 반복되는 텍스트를 제거합니다.  
  - **언어 감지**: 목표 언어와 일치하지 않는 텍스트를 제거합니다.  

- **장점**:  
  - 계산 비용이 낮고, 대규모 데이터셋에서 빠르게 적용 가능합니다.  

#### **(2) 분류기(Classifier) 기반 필터**
- **목적**:  
  더 정교한 방식으로 텍스트의 품질을 평가합니다.  
- **학습 데이터**:  
  - 고품질 텍스트(예: 위키피디아, 전문 도메인

In [21]:
qa.get_last_n_conversations(6)


=== 대화 내역 ===


👤 User (14):
Based on the text we discussed, please answer the following question in Korean. 
            Be specific and cite relevant sections when possible.

            Question:  
MinHashLSM 알고리즘의 작동 원리에 대해 구체적으로 알려줘. 


--------------------------------------------------------------------------------

🤖 Assistant (15):
**MinHashLSH(Locality-Sensitive Hashing) 알고리즘의 작동 원리**를 구체적으로 설명드리겠습니다. 이 알고리즘은 대규모 데이터셋에서 **유사한 문서를 효율적으로 찾기 위해 설계**되었으며, NeMo Data Curator에서도 유사 중복 제거에 활용되었습니다([관련 섹션](#document-level_deduplication)).

---

### **1. MinHashLSH의 기본 개념**
- **목적**:  
  두 문서 간의 **Jaccard 유사도**를 빠르게 추정하고, 유사한 문서를 그룹화합니다.  
  - **Jaccard 유사도**: 두 집합의 교집합 크기를 합집합 크기로 나눈 값.  
    예: 문서 A와 B의 단어 집합이 각각 {a, b, c}와 {a, b, d}라면, Jaccard 유사도는 2/4 = 0.5입니다.  

- **핵심 아이디어**:  
  - 문서를 **해시 값으로 변환**해 유사성을 빠르게 계산합니다.  
  - **LSH(Locality-Sensitive Hashing)**: 유사한 문서를 같은 버킷에 그룹화해, 전체 문서 쌍을 비교하지 않고도 유사성을 판단합니다.  

---

### **2. MinHashLSH의 작동 원리**
#### **(1) MinHash 계산**
1. **문서를 집