In [12]:
%pip install anthropic

[31mERROR: Invalid requirement: 'anthropic,': Expected end or semicolon (after name and no valid version specifier)
    anthropic,
             ^[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [13]:
%pip install python-dotenv

Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [36]:
from openai import OpenAI
from dotenv import load_dotenv
import os
from typing import Dict, List

# Load environment variables via python-dotenv (presumably from a local .env file).
load_dotenv()

# Shared OpenAI client used by get_completion(); the API key is read from the
# OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def get_completion(messages: List[Dict[str, str]]) -> str:
    """Send a chat conversation to the OpenAI model and return the reply text.

    Args:
        messages: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts. It is passed to the Chat Completions API unchanged.

    Returns:
        The text of the first returned choice. When the API reports no text
        content (``message.content`` is ``None``), an empty string is returned
        so the declared ``str`` return type always holds.
    """
    response = client.chat.completions.create(
        model="o1-preview-2024-09-12",
        messages=messages,
        max_completion_tokens=25000,
    )
    content = response.choices[0].message.content
    # `content` is nullable in the API schema; callers write the result to a
    # file and append it to the chat history, both of which require a str.
    return content if content is not None else ""

In [16]:
import re
from typing import Dict, List
from pathlib import Path

class MarkdownSectionParser:
    """Split a Markdown paper into sections keyed by header title.

    Only the part of the document starting at the first header whose title
    contains "Abstract" and ending before the first closing header
    (Acknowledgements / References / Conclusion(s)) is returned.
    """

    # Prefixes that open or close a fenced code block. Lines inside a fence are
    # never treated as headers, even if they start with '#'.
    _FENCE_MARKERS = ('```', '~~~')

    def __init__(self):
        self.section_pattern = r'^#+ .*$'  # Header line: one or more '#', then a space
        # Parsing stops at the first header whose title contains one of these.
        self.end_sections = {'Acknowledgements', 'References', 'Conclusion', 'Conclusions'}

    def _is_end_section(self, title: str) -> bool:
        """Return True if `title` contains any of the closing-section names."""
        return any(end_sec in title for end_sec in self.end_sections)

    def parse_sections(self, markdown_path: str) -> Dict[str, str]:
        """Parse a Markdown file into a ``{section title: section text}`` mapping.

        Args:
            markdown_path: Path of the UTF-8 encoded Markdown file to read.

        Returns:
            Sections in document order, starting with the first section whose
            title contains "Abstract". Sections with no body text are omitted,
            and everything from the first closing header onwards is dropped.
            The result is empty when no title contains "Abstract".

        Raises:
            OSError: If the file cannot be opened.
        """
        sections: Dict[str, str] = {}
        current_section = None
        current_content: List[str] = []
        in_fence = False

        with open(markdown_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        for line in lines:
            # Track fenced code blocks so that '# comment' lines inside code
            # are kept as section content instead of starting a new section.
            if line.lstrip().startswith(self._FENCE_MARKERS):
                in_fence = not in_fence
                is_header = False
            else:
                is_header = (not in_fence) and re.match(self.section_pattern, line) is not None

            if is_header:
                section_title = line.strip('# \n')

                # Store the section that just ended (only if it has content)
                if current_section and current_content:
                    sections[current_section] = ''.join(current_content).strip()

                # Stop at the first closing section
                if self._is_end_section(section_title):
                    break

                # Start a new section
                current_section = section_title
                current_content = []
            elif current_section is not None:
                # Text before the first header belongs to no section and is skipped.
                current_content.append(line)

        # Store the last open section (reached at end of file or after the break)
        if current_section and current_content and not self._is_end_section(current_section):
            sections[current_section] = ''.join(current_content).strip()

        # Keep only the sections from "Abstract" onwards
        filtered_sections: Dict[str, str] = {}
        include_section = False

        for section, content in sections.items():
            if 'Abstract' in section:
                include_section = True

            if include_section:
                filtered_sections[section] = content

        return filtered_sections

parser = MarkdownSectionParser()
markdown_path = "input_file/Enhancing Chat Language Models by Scaling High-quality Instructional Conversations.md"

# Parse the Markdown file into a {section title: section text} mapping
sections = parser.parse_sections(markdown_path)

{'Abstract': 'Fine-tuning on instruction data has been widely validated as an effective practice for implementing chat language models like ChatGPT. Scaling the diversity and quality of such data, although straightforward, stands a great chance of leading to improved performance. This paper aims to improve the upper bound of open-source models further. We first provide a systematically designed, diverse, informative, large-scale dataset of instructional conversations, UltraChat, which does not involve human queries. Our objective is to capture the breadth of interactions that a human might have with an AI assistant and employs a comprehensive framework to generate multi-turn conversation iteratively. UltraChat contains 1.5 million highquality multi-turn dialogues and covers a wide range of topics and instructions. Our statistical analysis of UltraChat reveals its superiority in various key metrics, including scale, average length, diversity, coherence, etc., solidifying its position as

In [37]:
from anthropic import Anthropic
from typing import Dict, List
import time
from tqdm import tqdm

class PaperExplainer:
    """Explain a paper section by section within one running conversation.

    Every prompt and reply is appended to `conversation_history`, so each new
    section is explained with all earlier sections as context.
    """

    def __init__(self):
        self.conversation_history = []
        self.delay = 1  # Pause between API calls, in seconds

    def _create_section_prompt(self, section_title: str, section_content: str, is_first: bool = False) -> str:
        """Build the user prompt for one section.

        The first prompt sets the explainer role and asks for Korean output;
        later prompts refer back to the earlier discussion instead.
        """
        if is_first:
            return f"""You are an expert academic paper explainer. Please explain the following section '{section_title}' 
            from an academic paper in a clear and concise manner. Please explain in Korean.

            Section content:
            {section_content}"""
        else:
            return f"""Based on our previous discussion of the paper, please explain the following section '{section_title}'.
            
            Section content:
            {section_content}"""

    def explain_section(self, section_title: str, section_content: str) -> str:
        """Ask the model to explain one section and record the exchange.

        Args:
            section_title: Title of the section, used in the prompt.
            section_content: Text of the section to explain.

        Returns:
            The model's explanation, or an ``"Error: ..."`` string if anything
            fails. On failure the conversation history is restored to its state
            before the call, so a failed request never leaves an unanswered
            user message behind.
        """
        # Remember where this exchange starts so it can be rolled back on failure.
        history_len = len(self.conversation_history)
        try:
            # The first section gets the role/language instructions
            is_first = history_len == 0
            prompt = self._create_section_prompt(section_title, section_content, is_first)

            # Send the new prompt together with all previous turns
            self.conversation_history.append({"role": "user", "content": prompt})

            response = get_completion(self.conversation_history)

            # Record the reply so later sections can build on it
            self.conversation_history.append({"role": "assistant", "content": response})
            # Pause between API calls
            time.sleep(self.delay)

            return response

        except Exception as e:
            # Drop whatever this call appended; otherwise the dangling prompt
            # would be re-sent with every later request and would also make
            # the next call skip the first-section instructions.
            del self.conversation_history[history_len:]
            print(f"Error explaining section {section_title}: {str(e)}")
            return f"Error: Failed to explain section {section_title}"

    def explain_paper(self, sections: Dict[str, str]) -> Dict[str, str]:
        """
        Explain every section of the paper in order.

        Args:
            sections: Mapping of section title to section text.

        Returns:
            Mapping of section title to its explanation (or to an error string
            for sections that failed).
        """
        explanations = {}

        print("\nProcessing sections:")
        for title, content in tqdm(sections.items(), desc="Explaining sections"):
            print(f"\nProcessing: {title}")
            explanation = self.explain_section(title, content)
            explanations[title] = explanation

        return explanations

In [38]:
class PaperQA:
    """Answer questions about a paper using its section explanations as context."""

    def __init__(self):
        self.conversation_history = []
        self.delay = 1  # Pause after each API call, in seconds

    def load_paper_context(self, explanations: Dict[str, str]):
        """Add the paper summary to the conversation history.

        Args:
            explanations: Mapping of section title to explanation text; each
                entry becomes a ``## title`` block of the summary.

        The assistant-role instruction is sent as part of the user message
        rather than as a separate ``system`` message, because the o1-preview
        model used by get_completion() does not accept the ``system`` role.
        """
        context = "Here's the paper summary:\n\n"
        for section, explanation in explanations.items():
            context += f"## {section}\n{explanation}\n\n"

        role_instruction = (
            "You are an expert academic paper assistant who has thoroughly "
            "read and understood this paper."
        )

        # Add the paper context to the conversation history
        self.conversation_history.append(
            {"role": "user", "content": f"{role_instruction}\n\n{context}"}
        )

    def ask_question(self, question: str) -> str:
        """Answer a question about the loaded paper.

        Args:
            question: The question to ask.

        Returns:
            The model's answer, or ``"Error: Failed to process question"`` if
            anything fails. On failure the conversation history is restored to
            its state before the call, so the unanswered question is not
            re-sent with later requests.
        """
        # Remember where this exchange starts so it can be rolled back on failure.
        history_len = len(self.conversation_history)
        try:
            # Build the question prompt
            prompt = f"""Based on the paper we discussed, please answer the following question in Korean. 
            Be specific and cite relevant sections when possible.

            Question: {question}"""

            # Send the question together with the previous conversation
            self.conversation_history.append({"role": "user", "content": prompt})

            # Query the model through the shared get_completion() helper
            response = get_completion(self.conversation_history)

            # Record the answer in the conversation history
            self.conversation_history.append({"role": "assistant", "content": response})

            time.sleep(self.delay)
            return response

        except Exception as e:
            # Drop whatever this call appended to keep the history consistent.
            del self.conversation_history[history_len:]
            print(f"Error processing question: {str(e)}")
            return "Error: Failed to process question"


In [39]:
from typing import Tuple

def process_paper(markdown_path: str) -> Tuple[Dict[str, str], PaperQA]:
    """
    Run the full pipeline for one paper: parse, explain, prepare Q&A.

    Args:
        markdown_path: Path of the paper's Markdown file.

    Returns:
        A pair of (section title -> explanation mapping, PaperQA instance
        already loaded with those explanations).
    """
    # Step 1: split the Markdown file into sections
    section_map = MarkdownSectionParser().parse_sections(markdown_path)

    # Step 2: generate an explanation for each section
    section_explanations = PaperExplainer().explain_paper(section_map)

    # Step 3: load the explanations into a Q&A helper
    paper_qa = PaperQA()
    paper_qa.load_paper_context(section_explanations)

    return section_explanations, paper_qa

In [40]:
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv before running the pipeline.
load_dotenv()

# Input paper (Markdown) and the directory where explanations are to be saved.
markdown_path = "input_file/Enhancing Chat Language Models by Scaling High-quality Instructional Conversations.md"
output_dir = "output_file"

# Run the pipeline: `explanations` maps section title -> explanation,
# `qa` is a PaperQA instance loaded with those explanations.
explanations, qa = process_paper(markdown_path)


Processing sections:


Explaining sections:   0%|          | 0/19 [00:00<?, ?it/s]


Processing: Abstract


Explaining sections:   5%|▌         | 1/19 [00:13<04:02, 13.46s/it]


Processing: 1 Introduction


Explaining sections:  11%|█         | 2/19 [00:37<05:29, 19.40s/it]


Processing: 2 Related Work


Explaining sections:  16%|█▌        | 3/19 [01:03<06:00, 22.54s/it]


Processing: 3 Design


Explaining sections:  21%|██        | 4/19 [01:16<04:41, 18.80s/it]


Processing: 3.1 Principle


Explaining sections:  26%|██▋       | 5/19 [01:36<04:27, 19.14s/it]


Processing: 4 Data Construction


Explaining sections:  32%|███▏      | 6/19 [01:49<03:44, 17.23s/it]


Processing: 4.1 Questions about the World


Explaining sections:  37%|███▋      | 7/19 [02:04<03:17, 16.48s/it]


Processing: 4.2 Creation and Generation


Explaining sections:  42%|████▏     | 8/19 [02:16<02:45, 15.02s/it]


Processing: 4.3 Assistance on Existing Materials


Explaining sections:  47%|████▋     | 9/19 [02:37<02:50, 17.04s/it]


Processing: Templates for concatenation


Explaining sections:  53%|█████▎    | 10/19 [02:56<02:37, 17.51s/it]


Processing: 4.4 User Simulation and Refinement


Explaining sections:  58%|█████▊    | 11/19 [03:12<02:16, 17.12s/it]


Processing: 4.5 Data Analysis


Explaining sections:  63%|██████▎   | 12/19 [03:45<02:33, 21.99s/it]


Processing: 4.6 UltraLLaMA


Explaining sections:  68%|██████▊   | 13/19 [04:03<02:03, 20.53s/it]


Processing: 5 Evaluation


Explaining sections:  74%|███████▎  | 14/19 [04:19<01:36, 19.30s/it]


Processing: 5.1 Baselines


Explaining sections:  79%|███████▉  | 15/19 [04:53<01:34, 23.74s/it]


Processing: 5.2 Response Comparison


Explaining sections:  84%|████████▍ | 16/19 [05:24<01:17, 25.91s/it]


Processing: 5.3 Independent Scoring


Explaining sections:  89%|████████▉ | 17/19 [05:45<00:48, 24.33s/it]


Processing: 5.4 TruthfulQA Results


Explaining sections:  95%|█████████▍| 18/19 [06:02<00:22, 22.22s/it]


Processing: 5.5 The Impact of System Prompts


Explaining sections: 100%|██████████| 19/19 [06:21<00:00, 20.05s/it]


FileNotFoundError: [Errno 2] No such file or directory: 'output/Enhancing Chat Language Models by Scaling High-quality Instructional Conversations_explained.md'

In [41]:
output_dir = "output_file"

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs(output_dir, exist_ok=True)

input_filename = Path(markdown_path).stem  # File name only (without extension)
output_path = os.path.join(output_dir, f"{input_filename}_explained.md")

# Write one "## <section>" block per explanation, separated by horizontal rules.
with open(output_path, 'w', encoding='utf-8') as f:
    for section, explanation in explanations.items():
        f.write(f"\n## {section}\n\n")
        f.write(explanation)
        f.write("\n\n---\n")

print(f"\nExplanations saved to: {output_path}")


Explanations saved to: output_file/Enhancing Chat Language Models by Scaling High-quality Instructional Conversations_explained.md


In [21]:
%pip install rich

Collecting rich
  Downloading rich-13.9.4-py3-none-any.whl.metadata (18 kB)
Collecting markdown-it-py>=2.2.0 (from rich)
  Using cached markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich)
  Using cached mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)
Downloading rich-13.9.4-py3-none-any.whl (242 kB)
Using cached markdown_it_py-3.0.0-py3-none-any.whl (87 kB)
Using cached mdurl-0.1.2-py3-none-any.whl (10.0 kB)
Installing collected packages: mdurl, markdown-it-py, rich
Successfully installed markdown-it-py-3.0.0 mdurl-0.1.2 rich-13.9.4
Note: you may need to restart the kernel to use updated packages.


In [25]:
from rich.console import Console
from rich.markdown import Markdown
from rich.panel import Panel
from rich.syntax import Syntax
from rich.table import Table
from typing import Dict
import os

class MarkdownPrinter:
    """Render Markdown text through a rich ``Console``."""

    def __init__(self):
        self.console = Console()

    def print_markdown_file(self, file_path: str):
        """Read a UTF-8 Markdown file and print it rendered.

        Any exception raised while reading or rendering is reported on the
        console in bold red instead of being propagated.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as source:
                raw_text = source.read()

            # Render the Markdown and print it
            rendered = Markdown(raw_text)
            self.console.print(rendered)

        except Exception as err:
            self.console.print(f"[bold red]Error reading file: {err!s}[/]")

    def print_sections(self, sections: Dict[str, str]):
        """Print each section as a titled panel followed by its rendered body.

        Args:
            sections: Mapping of section title to Markdown content.
        """
        divider = "[dim]" + "=" * 80 + "[/]"

        for heading, body in sections.items():
            # Blank line, then the section title in a cyan panel
            self.console.print("\n")
            title_panel = Panel(f"[bold cyan]{heading}[/]", border_style="cyan")
            self.console.print(title_panel)

            # Section body rendered as Markdown
            self.console.print(Markdown(body))

            # Separator line
            self.console.print(divider)

printer = MarkdownPrinter()

# Print the Markdown file
# NOTE(review): this rebinds the global `markdown_path` (elsewhere the input
# paper path) to an output file; the save cell derives its output file name
# from `markdown_path`, so re-running it after this cell would use this path.
markdown_path = "output_file/2412.08445v1_explained.md"
printer.print_markdown_file(markdown_path)

In [None]:
# Ask a question about the processed paper; `qa` is the PaperQA instance returned by process_paper().
qa.ask_question("What is the main idea of the paper?")