In [12]:
%pip install anthropic
%pip install rich
%pip install python-dotenv

[31mERROR: Invalid requirement: 'anthropic,': Expected end or semicolon (after name and no valid version specifier)
    anthropic,
             ^[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [19]:
from openai import OpenAI
from dotenv import load_dotenv
import os
from typing import Dict, List

load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def get_completion(messages: List[Dict[str, str]]) -> str:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_completion_tokens=4092, 
    )
    return response.choices[0].message.content

In [8]:
import re
from typing import Dict, List
from pathlib import Path

class MarkdownSectionParser:
    def __init__(self):
        self.section_pattern = r'^#+ .*$'  # '#'으로 시작하는 헤더 패턴
        self.end_sections = {'ACKNOWLEDGEMENTS', 'REFERENCES', 'CONCLUSION', 'CONCLUSIONS'}
        
    def parse_sections(self, markdown_path: str) -> Dict[str, str]:
        sections = {}
        current_section = None
        current_content = []
        
        with open(markdown_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            
        for line in lines:
            if re.match(self.section_pattern, line):
                section_title = line.strip('# \n')
                
                if current_section and current_content:
                    sections[current_section] = ''.join(current_content).strip()
                
                # 대문자로 변환하여 비교
                if any(end_sec in section_title.upper() for end_sec in self.end_sections):
                    break
                    
                current_section = section_title
                current_content = []
            else:
                if current_section is None and 'ABSTRACT' not in line.upper():
                    continue
                current_content.append(line)
        
        # 마지막 섹션 저장
        if current_section and current_content:
            sections[current_section] = ''.join(current_content).strip()
        
        # Abstract 필터링
        filtered_sections = {}
        include_section = False
        
        for section, content in sections.items():
            if 'ABSTRACT' in section.upper():
                include_section = True
            
            if include_section:
                filtered_sections[section] = content
                
        return filtered_sections

In [3]:
from anthropic import Anthropic
from typing import Dict, List
import time
from tqdm import tqdm

class PaperExplainer:
    def __init__(self):
        self.conversation_history = []
        self.delay = 1  # API 호출 간 딜레이 (초)
        
    def _create_section_prompt(self, section_title: str, section_content: str, is_first: bool = False) -> str:
        if is_first:
            return f"""You are an expert academic paper explainer. Please explain the following section '{section_title}' 
            from an academic paper in a clear and concise manner. Please explain in Korean.

            Section content:
            {section_content}"""
        else:
            return f"""Based on our previous discussion of the paper, please explain the following section '{section_title}'.
            
            Section content:
            {section_content}"""
    
    def explain_section(self, section_title: str, section_content: str) -> str:
        try:
            # Create prompt based on whether this is the first section
            is_first = len(self.conversation_history) == 0
            prompt = self._create_section_prompt(section_title, section_content, is_first)
            
            # Add previous conversation for context
            self.conversation_history.append({"role": "user", "content": prompt})
            
            response = get_completion(self.conversation_history)
            
            # Update conversation history
            self.conversation_history.append({"role": "assistant", "content": response})            
            # API 호출 간 딜레이
            time.sleep(self.delay)
            
            return response
            
        except Exception as e:
            print(f"Error explaining section {section_title}: {str(e)}")
            return f"Error: Failed to explain section {section_title}"

    def explain_paper(self, sections: Dict[str, str]) -> Dict[str, str]:
        """
        논문의 각 섹션을 순차적으로 설명
        
        Args:
            sections: 섹션 제목과 내용을 매핑한 딕셔너리
            
        Returns:
            섹션 제목과 설명을 매핑한 딕셔너리
        """
        explanations = {}
        
        print("\nProcessing sections:")
        for title, content in tqdm(sections.items(), desc="Explaining sections"):
            print(f"\nProcessing: {title}")
            explanation = self.explain_section(title, content)
            explanations[title] = explanation
            
        return explanations

In [42]:

from typing import Optional, List
class PaperQA:
    def __init__(self, context: Optional[List[Dict[str, str]]] = None):
        self.conversation_history = context or []
        self.delay = 1
        
    def load_paper_context(self, explanations: Dict[str, str]):
        """논문 설명을 대화 기록에 로드"""
        context = "Here's the paper summary:\n\n"
        for section, explanation in explanations.items():
            context += f"## {section}\n{explanation}\n\n"
            
        # 논문 컨텍스트를 대화 기록에 추가
        self.conversation_history.append({"role": "user", "content": context})

    def ask_question(self, question: str) -> str:
        """논문에 대한 질문에 답변"""
        try:
            # 질문 프롬프트 생성
            prompt = f"""Based on the paper we discussed, please answer the following question in Korean. 
            Be specific and cite relevant sections when possible.

            Question: {question}"""
            
            # 이전 대화 기록과 함께 질문 전송
            self.conversation_history.append({"role": "user", "content": prompt})
            
            # Claude에 질문
            response = get_completion(self.conversation_history)
            
            # 대화 기록 업데이트
            self.conversation_history.append({"role": "assistant", "content": response})
            
            time.sleep(self.delay)
            return response
            
        except Exception as e:
            print(f"Error processing question: {str(e)}")
            return f"Error: Failed to process question"
    
    def view_conversation_history(self, start_idx: int = 0, end_idx: Optional[int] = None) -> None:
        """대화 내역을 출력하는 함수
        
        Args:
            start_idx: 시작 인덱스 (기본값: 0)
            end_idx: 종료 인덱스 (기본값: None, None일 경우 끝까지 출력)
        """
        # 논문 컨텍스트는 제외하고 실제 대화만 출력
        conversations = [
            msg for msg in self.conversation_history 
            if not msg["content"].startswith("Here's the paper summary:")
        ]
        
        # end_idx가 None이면 리스트 끝까지
        end_idx = end_idx if end_idx is not None else len(conversations)
        
        print("\n=== 대화 내역 ===\n")
        for i, msg in enumerate(conversations[start_idx:end_idx], start=start_idx):
            role = msg["role"].upper()
            if role == "ASSISTANT":
                print(f"\n🤖 Assistant ({i}):\n{msg['content']}\n")
                print("-" * 80)
            elif role == "USER":
                print(f"\n👤 User ({i}):\n{msg['content']}\n")
                print("-" * 80)
    
    
    def get_last_n_conversations(self, n: int = 1) -> None:
        """최근 n개의 대화 내역을 출력
        
        Args:
            n: 출력할 최근 대화 개수 (기본값: 1)
        """
        conversations = [
            msg for msg in self.conversation_history 
            if not msg["content"].startswith("Here's the paper summary:")
        ]
        start_idx = max(0, len(conversations) - n)
        self.view_conversation_history(start_idx)

In [5]:
from typing import Tuple

def process_paper(markdown_path: str) -> Tuple[Dict[str, str], PaperQA]:
    """
    전체 논문 처리 프로세스
    """
    # 1. Markdown 파싱
    parser = MarkdownSectionParser()
    sections = parser.parse_sections(markdown_path)
    
    print(sections)
    # 2. 섹션별 설명 생성
    explainer = PaperExplainer()
    explanations = explainer.explain_paper(sections)

    # 3. 질문 답변 준비 
    qa = PaperQA()
    qa.load_paper_context(explanations)

    return explanations, qa

In [47]:
from dotenv import load_dotenv
import os

load_dotenv()

markdown_path = "input_file/INSTRUCTION TUNING WITH GPT-4.md"
output_dir = "output_file"

explanations, qa = process_paper(markdown_path)

{'ABSTRACT': 'Prior work has shown that finetuning large language models (LLMs) using machinegenerated instruction-following data enables such models to achieve remarkable zero-shot capabilities on new tasks, and no human-written instructions are needed. In this paper, we present the first attempt to use GPT-4 to generate instructionfollowing data for LLM finetuning. Our early experiments on instruction-tuned LLaMA models show that the 52K English and Chinese instruction-following data generated by GPT-4 leads to superior zero-shot performance on new tasks to the instruction-following data generated by previous state-of-the-art models. We also collect feedback and comparison data from GPT-4 to enable a comprehensive evaluation and reward model training. We make our data generated using GPT-4 as well as our codebase publicly available. 1', 'INTRODUCTION': 'Large Language Models (LLMs) have shown impressive generalization capabilities such as incontext-learning (Brown et al., 2020) and c

Explaining sections:   0%|          | 0/10 [00:00<?, ?it/s]


Processing: ABSTRACT


Explaining sections:  10%|█         | 1/10 [00:04<00:41,  4.59s/it]


Processing: INTRODUCTION


Explaining sections:  20%|██        | 2/10 [00:19<01:27, 10.94s/it]


Processing: 2 DATASET


Explaining sections:  30%|███       | 3/10 [00:37<01:37, 13.86s/it]


Processing: 3 INSTRUCTION-TUNING LANGUAGE MODELS


Explaining sections:  40%|████      | 4/10 [00:39<00:55,  9.29s/it]


Processing: 3.1 SELF-INSTRUCT TUNING


Explaining sections:  50%|█████     | 5/10 [00:44<00:39,  7.84s/it]


Processing: 3.2 REWARD MODELS


Explaining sections:  60%|██████    | 6/10 [00:55<00:34,  8.62s/it]


Processing: 4 EXPERIMENTAL RESULTS


Explaining sections:  70%|███████   | 7/10 [00:57<00:19,  6.60s/it]


Processing: 4.1 BENCHMARKS


Explaining sections:  80%|████████  | 8/10 [01:13<00:19,  9.76s/it]


Processing: 4.3 COMPARISONS WITH SOTA USING AUTOMATIC EVALUATION


Explaining sections:  90%|█████████ | 9/10 [01:26<00:10, 10.52s/it]


Processing: 5 RELATED WORK


Explaining sections: 100%|██████████| 10/10 [01:44<00:00, 10.47s/it]


In [48]:
output_dir = "output_file"

input_filename = Path(markdown_path).stem  # 파일 이름만 추출 (확장자 제외)
output_path = os.path.join(output_dir, f"{input_filename}_explained.md")

with open(output_path, 'w', encoding='utf-8') as f:
    for section, explanation in explanations.items():
        f.write(f"\n## {section}\n\n")
        f.write(explanation)
        f.write("\n\n---\n")

print(f"\nExplanations saved to: {output_path}")


Explanations saved to: output_file/INSTRUCTION TUNING WITH GPT-4_explained.md


In [49]:
from rich.console import Console
from rich.markdown import Markdown
from rich.panel import Panel
from rich.syntax import Syntax
from rich.table import Table
from typing import Dict
import os

class MarkdownPrinter:
    def __init__(self):
        self.console = Console()
        
    def print_markdown_file(self, file_path: str):
        """마크다운 파일을 이쁘게 출력"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                markdown_content = f.read()
            
            # 마크다운 렌더링
            md = Markdown(markdown_content)
            
            # 마크다운 내용 출력
            self.console.print(md)
            
        except Exception as e:
            self.console.print(f"[bold red]Error reading file: {str(e)}[/]")
            
    def print_sections(self, sections: Dict[str, str]):
        """섹션별로 구분하여 출력"""
        for section, content in sections.items():
            # 섹션 제목
            self.console.print("\n")
            self.console.print(Panel(
                f"[bold cyan]{section}[/]",
                border_style="cyan"
            ))
            
            # 섹션 내용
            md = Markdown(content)
            self.console.print(md)
            
            # 구분선
            self.console.print("[dim]" + "="*80 + "[/]")

printer = MarkdownPrinter()

# 마크다운 파일 출력
printer.print_markdown_file(output_path)

In [43]:
question = """ 
원본 데이터 셋을 샘플링해서 입력 시퀀스에 조건화를 한 후 새로운 데이터를 생성하는 것 같은데 이게 맞아? 그리고 조건화가 뭐임?
"""

qa = PaperQA(qa.conversation_history)
response = qa.ask_question(question)
print(response)

네, 맞습니다. ReST에서 새로운 데이터를 생성하는 과정은 원본 데이터셋을 샘플링하여 입력 시퀀스를 선택하고, 이를 기반으로 조건화된 새로운 출력 시퀀스를 생성하는 방식으로 이루어집니다. 여기서 "조건화"라는 것은 주어진 입력 시퀀스 \(\pmb{x}\)에 대해 그 조건 하에서 가능한 출력 시퀀스 \(\pmb{y}\)를 생성한다는 의미입니다.

조건화는 확률적 모델링에서 매우 중요한 개념인데, 이는 주어진 입력이 어떤 특정한 상태나 값일 때의 출력 확률을 뜻합니다. 언어 모델링에서는 입력 시퀀스(또는 문맥)가 주어졌을 때 다음 가능한 출력 시퀀스를 생성하는 과정을 말합니다. 이 과정은 자동 회귀 모델을 사용하여 구현되며, 모델이 이미 학습한 확률 분포에 기반해 \(\pi_{\theta}(\pmb{y}|\pmb{x})=\prod_{t=1}^{T}\pi_{\theta}(y_{t}|\pmb{y}_{1:t-1}, \pmb{x})\) 형태로 다음 토큰 \(y_t\)를 계산합니다.

ReST에서는 이를 통해 현재 정책 \(\pi_{\theta}\)로 입력 시퀀스에 맞는 새로운 출력 시퀀스를 생성하고, 그 결과를 데이터셋에 추가하여 데이터를 확장시킵니다. 이러한 방식은 모델이 이미 알고 있는 문맥에 기반해 추가적인 학습 데이터를 생성할 수 있도록 하며, 이는 Grow 단계에서 원본 데이터셋의 샘플을 이용해 새로운 시퀀스를 생성하는 과정에서 핵심적인 역할을 합니다. 이는 Section 3의 내용을 바탕으로 설명한 것입니다.
