In [1]:
%pip install anthropic
%pip install rich
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
from openai import OpenAI
from dotenv import load_dotenv
import os
from typing import Dict, List

# Load variables from a local .env file into the process environment.
load_dotenv()

# The client targets DeepSeek's OpenAI-compatible endpoint.
# To switch to OpenAI instead: read OPENAI_API_KEY, drop base_url, and set
# model_name to an OpenAI model such as "o1-preview-2024-09-12".
api_key = os.getenv("DEEPSEEK_API_KEY")
if not api_key:
    # Fail fast: with api_key=None the OpenAI SDK falls back to the OPENAI_API_KEY
    # environment variable, which would then be sent to the DeepSeek endpoint.
    raise RuntimeError("DEEPSEEK_API_KEY is not set; add it to your environment or .env file.")
client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

model_name = "deepseek-reasoner"

def get_completion(messages: List[Dict[str, str]], max_completion_tokens: int = 25000) -> str:
    """Send a chat conversation to the configured model and return the reply text.

    Args:
        messages: Chat messages, each a dict with "role" and "content" keys.
        max_completion_tokens: Upper bound on generated tokens, forwarded to the API.

    Returns:
        The content of the first choice, or an empty string if the API returned
        no content for it.
    """
    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
    )
    # message.content is Optional in the SDK; normalise None to "" so the declared
    # return type holds and callers can store the reply in a message history.
    return response.choices[0].message.content or ""

In [7]:
# API smoke test: one round trip through get_completion with a single user message.

get_completion([{"role": "user", "content": "Hello, how are you?"}])

"Hello! I'm just a program, so I don't have feelings, but I'm here and ready to help with whatever you need. How can I assist you today?"

In [8]:
import re
from typing import Dict, List
from pathlib import Path

class MarkdownSectionParser:
    """Split a Markdown paper into sections, from the abstract up to the closing matter."""

    def __init__(self):
        # A header line: one or more '#', a space, then the title.
        self.section_pattern = r'^#+ .*$'
        # A non-header line that opens the abstract, e.g. "**Abstract**" or "Abstract: ...".
        self.abstract_line_pattern = r'^[\W_]*ABSTRACT(?![A-Z])'
        # Parsing stops at the first header whose upper-cased title contains one of these.
        self.end_sections = {'ACKNOWLEDGEMENTS', 'REFERENCES', 'CONCLUSION', 'CONCLUSIONS'}

    def parse_sections(self, markdown_path: str) -> Dict[str, str]:
        """Read a Markdown file and return its sections keyed by header title.

        Text before the first header is ignored, except that a non-header line
        starting with "Abstract" opens a section named 'ABSTRACT' (previously such
        text was buffered and then silently dropped). Parsing stops at the first
        header matching ``end_sections``. Only the first section whose title
        contains "abstract" and the sections after it are returned.

        Args:
            markdown_path: Path of the UTF-8 encoded Markdown file.

        Returns:
            Mapping of section title to stripped section text, in document order.
            Empty if the file has no abstract section.
        """
        sections: Dict[str, str] = {}
        current_section = None
        current_content: List[str] = []

        with open(markdown_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        for line in lines:
            if re.match(self.section_pattern, line):
                section_title = line.strip('# \n')

                # Flush the section that just ended.
                if current_section and current_content:
                    sections[current_section] = ''.join(current_content).strip()

                # Compare in upper case; closing matter ends the parse.
                if any(end_sec in section_title.upper() for end_sec in self.end_sections):
                    break

                current_section = section_title
                current_content = []
                continue

            if current_section is None:
                # Still in the preamble: skip everything until an abstract marker.
                if not re.match(self.abstract_line_pattern, line, re.IGNORECASE):
                    continue
                current_section = 'ABSTRACT'
                # A bare label such as "**Abstract**" carries no content of its own;
                # "Abstract: We propose ..." does, so that line is kept.
                if re.sub(r'[\W_]+', '', line).upper() == 'ABSTRACT':
                    continue

            current_content.append(line)

        # Store the last open section (a no-op repeat when the loop ended via break).
        if current_section and current_content:
            sections[current_section] = ''.join(current_content).strip()

        # Keep the abstract and everything after it.
        filtered_sections: Dict[str, str] = {}
        include_section = False

        for section, content in sections.items():
            if 'ABSTRACT' in section.upper():
                include_section = True

            if include_section:
                filtered_sections[section] = content

        return filtered_sections

In [18]:
from anthropic import Anthropic
from typing import Dict, List
import time
from tqdm import tqdm

class PaperExplainer:
    """Explain a paper section by section inside one running chat conversation."""

    def __init__(self):
        # Messages sent with every request: user prompt, assistant reply, user prompt, ...
        self.conversation_history = []
        self.delay = 1  # seconds to wait after each successful API call

    def _create_section_prompt(self, section_title: str, section_content: str, is_first: bool = False) -> str:
        """Build the user prompt for one section.

        Args:
            section_title: Title of the section.
            section_content: Text of the section.
            is_first: When True, the prompt also carries the role description and
                the instruction to answer in Korean.

        Returns:
            The prompt text.
        """
        if is_first:
            return f"""You are an expert academic paper explainer. Please explain the following section '{section_title}' 
            from an academic paper in a clear and concise manner. Please explain in Korean.

            Section content:
            {section_content}"""
        else:
            return f"""Please explain the following section '{section_title}'.
            
            Section content:
            {section_content}"""

    def explain_section(self, section_title: str, section_content: str) -> str:
        """Ask the model to explain one section, keeping earlier sections as context.

        On failure the error is printed, an "Error: ..." string is returned, and the
        prompt that was just added is removed from the history again, so a single
        failed call does not leave an unanswered user turn in every later request.

        Args:
            section_title: Title of the section.
            section_content: Text of the section.

        Returns:
            The model's explanation, or an error string if the call failed.
        """
        user_message = None
        try:
            # The first prompt of a conversation carries the full instructions.
            is_first = len(self.conversation_history) == 0
            prompt = self._create_section_prompt(section_title, section_content, is_first)

            user_message = {"role": "user", "content": prompt}
            self.conversation_history.append(user_message)

            response = get_completion(self.conversation_history)

            self.conversation_history.append({"role": "assistant", "content": response})
            # Pause between API calls.
            time.sleep(self.delay)

            return response

        except Exception as e:
            # Roll back only if our prompt is still the last entry (i.e. no reply was stored).
            if (
                user_message is not None
                and self.conversation_history
                and self.conversation_history[-1] is user_message
            ):
                self.conversation_history.pop()
            print(f"Error explaining section {section_title}: {str(e)}")
            return f"Error: Failed to explain section {section_title}"

    def explain_paper(self, sections: Dict[str, str]) -> Dict[str, str]:
        """Explain every section of a paper in order.

        Args:
            sections: Mapping of section title to section text.

        Returns:
            Mapping of section title to its explanation (or error string).
        """
        explanations = {}

        print("\nProcessing sections:")
        for title, content in tqdm(sections.items(), desc="Explaining sections"):
            print(f"\nProcessing: {title}")
            explanation = self.explain_section(title, content)
            explanations[title] = explanation

        return explanations

    def get_conversation_history(self):
        """Return the live conversation history list (not a copy)."""
        return self.conversation_history

In [13]:

from typing import Optional, List

class PaperQA:
    """Answer follow-up questions about a paper within a chat conversation."""

    def __init__(self, context: Optional[List[Dict[str, str]]] = None):
        # Copy the list so questions asked here do not mutate the caller's history.
        self.conversation_history = list(context) if context else []
        self.delay = 1  # seconds to wait after each successful API call

    def load_paper_context(self, explanations: Dict[str, str]):
        """Append the section explanations to the history as one user message.

        Args:
            explanations: Mapping of section title to explanation text.
        """
        context = "Here's the paper summary:\n\n"
        for section, explanation in explanations.items():
            context += f"## {section}\n{explanation}\n\n"

        self.conversation_history.append({"role": "user", "content": context})

    def ask_question(self, question: str) -> str:
        """Ask a question about the paper and record the exchange.

        On failure the error is printed, an "Error: ..." string is returned, and the
        question that was just added is removed from the history again so later
        requests do not carry an unanswered user turn.

        Args:
            question: The question text.

        Returns:
            The model's answer, or an error string if the call failed.
        """
        user_message = None
        try:
            prompt = f"""Based on the paper we discussed, please answer the following question in Korean. 
            Be specific and cite relevant sections when possible.

            Question: {question}"""

            # Send the question together with the earlier conversation.
            user_message = {"role": "user", "content": prompt}
            self.conversation_history.append(user_message)

            response = get_completion(self.conversation_history)

            self.conversation_history.append({"role": "assistant", "content": response})

            time.sleep(self.delay)
            return response

        except Exception as e:
            # Roll back only if our question is still the last entry (no reply stored).
            if (
                user_message is not None
                and self.conversation_history
                and self.conversation_history[-1] is user_message
            ):
                self.conversation_history.pop()
            print(f"Error processing question: {str(e)}")
            return "Error: Failed to process question"

    def _dialogue_messages(self) -> List[Dict[str, str]]:
        """Return the history without the paper-summary message(s) added by load_paper_context."""
        return [
            msg for msg in self.conversation_history
            if not msg["content"].startswith("Here's the paper summary:")
        ]

    def view_conversation_history(self, start_idx: int = 0, end_idx: Optional[int] = None) -> None:
        """Print the conversation, excluding the paper-summary context message.

        Args:
            start_idx: Index of the first message to print (default 0).
            end_idx: Index one past the last message to print; None prints to the end.
        """
        conversations = self._dialogue_messages()

        end_idx = end_idx if end_idx is not None else len(conversations)

        print("\n=== 대화 내역 ===\n")
        for i, msg in enumerate(conversations[start_idx:end_idx], start=start_idx):
            role = msg["role"].upper()
            if role == "ASSISTANT":
                print(f"\n🤖 Assistant ({i}):\n{msg['content']}\n")
                print("-" * 80)
            elif role == "USER":
                print(f"\n👤 User ({i}):\n{msg['content']}\n")
                print("-" * 80)

    def get_last_n_conversations(self, n: int = 1) -> None:
        """Print the most recent n messages.

        Args:
            n: Number of trailing messages to print (default 1).
        """
        start_idx = max(0, len(self._dialogue_messages()) - n)
        self.view_conversation_history(start_idx)

    def get_conversation_history(self):
        """Return the live conversation history list (not a copy)."""
        return self.conversation_history


In [22]:
from typing import Tuple
import json

def process_paper(markdown_path: str) -> Tuple[Dict[str, str], PaperQA]:
    """Run the whole pipeline for one paper.

    Parses the Markdown file into sections, explains each section, appends the
    resulting conversation as one JSON line to
    ``data/explanation_data_by_<model_name>.jsonl``, and prepares a PaperQA
    seeded with that conversation.

    Args:
        markdown_path: Path of the paper's Markdown file.

    Returns:
        A tuple of (section title -> explanation, PaperQA instance).
    """
    # 1. Parse the Markdown into sections.
    parser = MarkdownSectionParser()
    sections = parser.parse_sections(markdown_path)

    # 2. Explain each section.
    explainer = PaperExplainer()
    explanations = explainer.explain_paper(sections)

    # 3. Append the conversation to the log file.
    conversation_history = explainer.get_conversation_history()
    log_path = f"data/explanation_data_by_{model_name}.jsonl"
    # Create the directory first so the (slow) explanation run is not lost to a
    # FileNotFoundError at the very end.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    # ensure_ascii=False writes non-ASCII text verbatim, so the encoding must be
    # explicit rather than the platform default.
    with open(log_path, "a", encoding="utf-8") as f:
        data = json.dumps({"messages": conversation_history}, ensure_ascii=False)
        f.write(data + "\n")

    # 4. Prepare question answering on top of the same conversation.
    qa = PaperQA(context=conversation_history)

    return explanations, qa

In [24]:
from dotenv import load_dotenv
import os

# Load environment variables from .env (the setup cell calls this as well).
load_dotenv()

# Input paper (Markdown) and the directory used for the generated explanation file.
markdown_path = "input_file/WHAT MAKES GOOD DATA FOR ALIGNMENT? A COMPREHENSIVE STUDY OF AUTOMATIC DATA SELECTION IN INSTRUCTION TUNING.md"
output_dir = "output_file"

# Parse the paper, explain every section, and get a PaperQA seeded with that conversation.
explanations, qa = process_paper(markdown_path)


Processing sections:


Explaining sections:   0%|          | 0/11 [00:00<?, ?it/s]


Processing: ABSTRACT


Explaining sections:   9%|▉         | 1/11 [00:17<02:58, 17.88s/it]


Processing: 1 INTRODUCTION


Explaining sections:  18%|█▊        | 2/11 [00:51<04:02, 27.00s/it]


Processing: 2 WHAT MAKES GOOD DATA FOR ALIGNMENT?


Explaining sections:  27%|██▋       | 3/11 [01:20<03:44, 28.06s/it]


Processing: 2.1 THE DATA SELECTION PROBLEM


Explaining sections:  36%|███▋      | 4/11 [01:48<03:15, 27.96s/it]


Processing: 2.2 EXPERIMENTAL SETUP


Explaining sections:  45%|████▌     | 5/11 [02:16<02:47, 27.97s/it]


Processing: 2.4 FROM THE QUALITY PERSPECTIVE – EVOL QUALITY


Explaining sections:  55%|█████▍    | 6/11 [02:49<02:28, 29.67s/it]


Processing: 3 DEITA– DATA EFFICIENT INSTRUCTION TUNING FOR ALIGNMENT


Explaining sections:  64%|██████▎   | 7/11 [03:23<02:04, 31.05s/it]


Processing: 3.1 METHOD


Explaining sections:  73%|███████▎  | 8/11 [03:45<01:24, 28.27s/it]


Processing: Algorithm 1 Score-First, Diversity-Aware Data Selection


Explaining sections:  82%|████████▏ | 9/11 [04:15<00:57, 28.71s/it]


Processing: 3.2 EXPERIMENTAL SETUP


Explaining sections:  91%|█████████ | 10/11 [04:39<00:27, 27.42s/it]


Processing: 3.3 RESULTS


Explaining sections: 100%|██████████| 11/11 [05:15<00:00, 28.70s/it]


In [25]:
output_dir = "output_file"

# Create the output directory up front so the write below cannot fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

input_filename = Path(markdown_path).stem  # file name without its extension
output_path = os.path.join(output_dir, f"{input_filename}_explained.md")

# One "## <section>" block per explanation, separated by horizontal rules.
with open(output_path, 'w', encoding='utf-8') as f:
    for section, explanation in explanations.items():
        f.write(f"\n## {section}\n\n")
        f.write(explanation)
        f.write("\n\n---\n")

print(f"\nExplanations saved to: {output_path}")


Explanations saved to: output_file/WHAT MAKES GOOD DATA FOR ALIGNMENT? A COMPREHENSIVE STUDY OF AUTOMATIC DATA SELECTION IN INSTRUCTION TUNING_explained.md


In [26]:
from rich.console import Console
from rich.markdown import Markdown
from rich.panel import Panel
from rich.syntax import Syntax
from rich.table import Table
from typing import Dict
import os

class MarkdownPrinter:
    """Render Markdown text through a rich Console."""

    def __init__(self):
        self.console = Console()

    def print_markdown_file(self, file_path: str):
        """Read a UTF-8 Markdown file and print it rendered.

        Any error while reading or rendering is reported in red instead of raised.

        Args:
            file_path: Path of the Markdown file.
        """
        from rich.markup import escape

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                markdown_content = f.read()

            # Render and print the Markdown.
            md = Markdown(markdown_content)
            self.console.print(md)

        except Exception as e:
            # Escape the message: exception text (e.g. a path) may contain
            # square brackets that rich would otherwise parse as markup tags.
            self.console.print(f"[bold red]Error reading file: {escape(str(e))}[/]")

    def print_sections(self, sections: Dict[str, str]):
        """Print each section as a titled panel followed by its rendered Markdown.

        Args:
            sections: Mapping of section title to Markdown content.
        """
        from rich.markup import escape

        for section, content in sections.items():
            # Section title; escaped so brackets in a title are shown literally.
            self.console.print("\n")
            self.console.print(Panel(
                f"[bold cyan]{escape(section)}[/]",
                border_style="cyan"
            ))

            # Section body.
            md = Markdown(content)
            self.console.print(md)

            # Separator line.
            self.console.print("[dim]" + "="*80 + "[/]")

printer = MarkdownPrinter()

# Render the saved explanation file.
printer.print_markdown_file(output_path)

In [29]:
# Follow-up question (in Korean) about how the ChatGPT-based ranking is used to build the dataset.
question = """ 
데이터를 평가하기 위해 ChatGPT 를 이용해서 순위화를 한다고 하는데 그럼 여러개의 데이터를 한번에 생성하고 순위가 높은 것들을 고품질 데이터셋에 추가하는 식으로 데이터가 생성되는거? 
"""

# Ask through the PaperQA instance returned by process_paper.
response = qa.ask_question(question)

print(response)

### **ChatGPT를 활용한 데이터 순위화 및 선별 과정**  
본 논문(§2.3-2.4, Algorithm 1)에서는 **단일 데이터 샘플을 변형해 다중 예시를 생성**한 후, ChatGPT로 순위를 매겨 고품질 데이터를 선별합니다. 구체적 단계는 다음과 같습니다:

---

### **1. 단일 데이터 진화 및 다중 변형 생성**  
1. **진화(Evolve)**:  
   - 기존 데이터 샘플 $(I_k, R_k)$을 **EVOL-COMPLEXITY** 또는 **EVOL-QUALITY** 프롬프트로 변형합니다.  
   - *예*: 원본 지시문 "고양이 설명" → "고양이 품종의 유전적 특성 설명" 등 **5회 변형**해 총 6개 샘플 생성.  

2. **변형 예시**:  
   - 복잡성 진화: 지시문에 **추론 단계**, **전문 용어**, **제약 조건** 추가.  
   - 품질 진화: 응답의 **정확성**, **상세성**, **유용성** 개선.  

---

### **2. ChatGPT 기반 순위화 및 점수 부여**  
1. **동시 비교 평가**:  
   - 동일 원본에서 생성된 **6개 변형 샘플**을 ChatGPT에 한 번에 제공해 **상대적 순위**와 **점수**를 부여받습니다(그림 1 참조).  
   - *프롬프트 예시*:  
     ```  
     "다음 6개 지시문을 복잡성 순서대로 1~6위로 나열하고, 각각에 1~10점을 부여하세요."  
     ```  

2. **세밀한 점수 차별화**:  
   - 단일 샘플 독립 평가 대신 **변형 샘플 그룹 비교**를 통해 미세한 점수 차이 포착(§2.3, §2.4).  

---

### **3. 고품질 데이터셋 구축 방식**  
1. **직접 추가가 아닌 간접 선별**:  
   - 생성된 변형 샘플 자체를 데이터셋에 추가하지 **않습니다**.  
   - 대신, 변형 샘플의 ChatGPT 점수로 **복잡성·품질 판별기**를 학습시킵니다(§2.3-2.4).  