In [1]:
%pip install "yt-dlp[default]"

Collecting yt-dlp[default]
  Downloading yt_dlp-2025.1.15-py3-none-any.whl.metadata (172 kB)
Collecting brotli (from yt-dlp[default])
  Using cached Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl.metadata (5.5 kB)
Collecting mutagen (from yt-dlp[default])
  Using cached mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)
Collecting pycryptodomex (from yt-dlp[default])
  Downloading pycryptodomex-3.21.0-cp36-abi3-macosx_10_9_universal2.whl.metadata (3.4 kB)
Using cached Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl (815 kB)
Using cached mutagen-1.47.0-py3-none-any.whl (194 kB)
Downloading pycryptodomex-3.21.0-cp36-abi3-macosx_10_9_universal2.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2025.1.15-py3-none-any.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packa

In [62]:
%pip install pydub

170743.23s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting pydub
  Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Note: you may need to restart the kernel to use updated packages.


In [21]:
from yt_dlp import YoutubeDL
from urllib.parse import urlparse, parse_qs
import os 
import glob


# Options handed to YoutubeDL by download_audio(): pick the best audio stream
# and run the FFmpegExtractAudio post-processor to produce an mp3.
ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        # presumably the target bitrate in kbps -- confirm against yt-dlp docs
        'preferredquality': '192',
    }],
    # Output goes under ./audio and the file name embeds the video ID, which is
    # what find_audio_file() searches for afterwards.
    'outtmpl': 'audio/%(title)s-%(id)s.%(ext)s',
}

def download_audio(youtube_url: str) -> str:
    """
    Download the audio track of a YouTube video and return the mp3 path.

    The video ID is taken from the URL when it can be parsed. For URL shapes
    the parser does not understand, the ID reported by yt-dlp for the
    downloaded item is used instead, so a successful download is not thrown
    away just because the URL had an unexpected form.

    Args:
        youtube_url: URL of the video to download.

    Returns:
        str: Path of the downloaded mp3 file inside the "audio" directory.

    Raises:
        ValueError: No video ID could be determined from the URL or from yt-dlp.
        FileNotFoundError: The download finished but no matching mp3 was found.
    """
    with YoutubeDL(ydl_opts) as ydl:
        # extract_info(download=True) performs the download and also hands back
        # the metadata dict, which carries the resolved video ID.
        info = ydl.extract_info(youtube_url, download=True)

    try:
        video_id = extract_video_id(youtube_url)
    except ValueError:
        # info may be None if yt-dlp was configured to ignore errors.
        video_id = (info or {}).get("id")
        if not video_id:
            raise

    return find_audio_file(video_id)

def find_audio_file(video_id: str, audio_dir: str = "audio") -> str:
    """
    Locate the mp3 file whose name contains the given video ID.

    Args:
        video_id: YouTube video ID to look for in the file name.
        audio_dir: Directory holding the downloaded audio (default: "audio").

    Returns:
        str: Path of the matching file. When several files match, the one
            with the greatest os.path.getctime value is returned.

    Raises:
        FileNotFoundError: No mp3 file in audio_dir contains video_id.
    """
    # Escape both parts so glob metacharacters ("[", "]", "*", "?") in the ID
    # or directory name are matched literally instead of acting as wildcards.
    pattern = os.path.join(
        glob.escape(audio_dir),
        f"*{glob.escape(video_id)}*.mp3",
    )
    matching_files = glob.glob(pattern)

    if not matching_files:
        raise FileNotFoundError(f"No audio file found for video ID: {video_id}")

    return max(matching_files, key=os.path.getctime)


def extract_video_id(youtube_url: str) -> str:
    """
    Extract the video ID from a YouTube URL.

    Recognised forms:
        * ...?v=<id>                       (standard watch URL)
        * youtu.be/<id>                    (short link)
        * .../shorts/<id>, /embed/<id>, /live/<id>, /v/<id>

    Args:
        youtube_url: URL of the YouTube video.

    Returns:
        str: The video ID.

    Raises:
        ValueError: The URL could not be parsed or contains no video ID.
    """
    try:
        parsed_url = urlparse(youtube_url)
        query_params = parse_qs(parsed_url.query)

        video_id = None
        if 'v' in query_params:
            # The 'v' query parameter wins, exactly as before.
            video_id = query_params['v'][0]
        else:
            host = (parsed_url.hostname or "").lower()
            path_parts = [part for part in parsed_url.path.split("/") if part]
            if host == "youtu.be" or host.endswith(".youtu.be"):
                # Short links carry the ID as the first path segment.
                if path_parts:
                    video_id = path_parts[0]
            elif len(path_parts) >= 2 and path_parts[0] in ("shorts", "embed", "live", "v"):
                video_id = path_parts[1]
    except Exception as e:
        raise ValueError(f"Invalid YouTube URL: {str(e)}") from e

    if not video_id:
        raise ValueError("Invalid YouTube URL: Could not extract video ID from URL")
    return video_id



In [66]:
from openai import OpenAI
from dotenv import load_dotenv
from pathlib import Path
import os
from pydub import AudioSegment
import tempfile

class AudioTranscriber:
    """Transcribes an audio file by splitting it into chunks and sending each
    chunk to the OpenAI transcription endpoint."""

    def __init__(self):
        load_dotenv()
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        # Chunk length used by split_audio: 5 minutes, expressed in milliseconds.
        self.chunk_duration = 5 * 60 * 1000

    def transcribe_audio(self, audio_path: str, language: str = None) -> str:
        """
        Convert an audio file to text.

        The file is split into chunks, each chunk is transcribed in order and
        the non-empty results are joined with single spaces. Temporary chunk
        files are removed whether or not transcription succeeds.

        Args:
            audio_path: Path of the audio file.
            language: Language of the speech (e.g. 'ko', 'en'). None leaves
                the choice to the API.

        Returns:
            str: The transcribed text, or None if any step failed (the error
                is printed rather than raised).
        """
        chunks = []
        try:
            audio_file = Path(audio_path)
            if not audio_file.exists():
                raise FileNotFoundError(f"Audio file not found: {audio_path}")

            chunks = self.split_audio(audio_path)
            transcribed_texts = []

            for i, chunk_path in enumerate(chunks):
                print(f"Processing chunk {i+1}/{len(chunks)}...")
                text = self.transcribe_chunk(chunk_path, language)
                if text:
                    transcribed_texts.append(text)

            return " ".join(transcribed_texts)

        except Exception as e:
            print(f"Error during transcription: {str(e)}")
            return None

        finally:
            # Runs on success and on failure, so a failed chunk no longer
            # leaves the temporary files behind.
            self._cleanup_chunks(chunks)

    @staticmethod
    def _cleanup_chunks(chunks: list) -> None:
        """Best-effort removal of chunk files and of the directories holding them."""
        chunk_dirs = []
        for chunk_path in chunks:
            chunk_dir = os.path.dirname(chunk_path)
            if chunk_dir not in chunk_dirs:
                chunk_dirs.append(chunk_dir)
            try:
                os.remove(chunk_path)
            except Exception as e:
                print(f"Error removing temporary file {chunk_path}: {str(e)}")

        # os.rmdir only removes empty directories, so unrelated files survive.
        for chunk_dir in chunk_dirs:
            try:
                os.rmdir(chunk_dir)
            except Exception as e:
                print(f"Error removing temporary directory: {str(e)}")

    def split_audio(self, audio_path: str) -> list:
        """
        Split an audio file into chunks of self.chunk_duration milliseconds.

        Args:
            audio_path: Path of the audio file.

        Returns:
            list: Paths of the temporary mp3 chunk files, in playback order.
                Empty when the audio has zero length.
        """
        audio = AudioSegment.from_file(audio_path)
        chunks = []

        temp_dir = tempfile.mkdtemp()
        try:
            for i in range(0, len(audio), self.chunk_duration):
                chunk = audio[i:i + self.chunk_duration]
                chunk_path = os.path.join(temp_dir, f"chunk_{i}.mp3")
                chunk.export(chunk_path, format="mp3")
                chunks.append(chunk_path)
        except Exception:
            # The caller never learns about temp_dir when we raise, so remove
            # it here (including any partially written chunk).
            import shutil
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise

        if not chunks:
            # Nothing was exported; do not leave an empty directory behind.
            os.rmdir(temp_dir)

        return chunks

    def transcribe_chunk(self, chunk_path: str, language: str = None) -> str:
        """
        Transcribe a single audio chunk.

        Args:
            chunk_path: Path of the chunk file.
            language: Language of the speech; omitted from the request when None.

        Returns:
            str: The transcribed text as returned by the API in "text" format.

        Raises:
            Exception: Whatever the file open or API call raised, after the
                error has been printed.
        """
        request_options = {"model": "whisper-1", "response_format": "text"}
        if language is not None:
            # Only send the parameter when a language was actually requested.
            request_options["language"] = language

        try:
            with open(chunk_path, "rb") as file:
                response = self.client.audio.transcriptions.create(
                    file=file,
                    **request_options
                )
            return response
        except Exception as e:
            print(f"Error transcribing chunk {chunk_path}: {str(e)}")
            raise

    def save_text(self, text: str, file_name: str, output_dir: str = "audio_text") -> None:
        """
        Save text to a UTF-8 file, creating the output directory if needed.

        Args:
            text: Text to write.
            file_name: Name of the file to create inside output_dir.
            output_dir: Target directory (default: "audio_text").

        Raises:
            TypeError: text is None (e.g. a failed transcription); raised
                before any file is created.
        """
        if text is None:
            raise TypeError("text must be a string, got None")

        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8') as f:
            f.write(text)

In [70]:

import os 
from openai import OpenAI
from dotenv import load_dotenv
import json

# Load variables from .env; override=True lets .env values replace ones that
# are already set in the process environment.
load_dotenv(override=True)

# OpenAI-SDK client pointed at the DeepSeek endpoint, authenticated with
# DEEPSEEK_API_KEY.
api_key = os.getenv("DEEPSEEK_API_KEY")
client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")
# Default model name bound as the default argument of the completion helpers.
model_name = "deepseek-chat"

def get_completion_openai(messages):
    """
    Request a chat completion from OpenAI's gpt-4o model.

    The module-level `client` targets the DeepSeek endpoint, so asking it for
    "gpt-4o" sends an OpenAI model name to the wrong service. A dedicated
    client authenticated with OPENAI_API_KEY is used here instead.

    Args:
        messages: Chat messages in the OpenAI format
            (list of {"role": ..., "content": ...} dicts).

    Returns:
        The content of the first choice's message.
    """
    openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        temperature=0.3,
        max_tokens=8192,
    )
    return response.choices[0].message.content

def get_completion(messages, model_name=model_name):
    """
    Send a chat request through the module-level client and return the reply text.

    Args:
        messages: Chat messages (list of {"role": ..., "content": ...} dicts).
        model_name: Model to query; defaults to the module-level model_name.

    Returns:
        The content of the first choice's message.
    """
    request_kwargs = {
        "model": model_name,
        "messages": messages,
        "temperature": 0.3,
        "max_tokens": 8192,
    }
    completion = client.chat.completions.create(**request_kwargs)
    first_choice = completion.choices[0]
    return first_choice.message.content

def get_completion_json_output(messages, model_name=model_name):
    """
    Send a chat request in JSON-object mode and return the decoded reply.

    Args:
        messages: Chat messages (list of {"role": ..., "content": ...} dicts).
        model_name: Model to query; defaults to the module-level model_name.

    Returns:
        The first choice's message content parsed with json.loads.
    """
    request_kwargs = {
        "model": model_name,
        "messages": messages,
        "temperature": 0.3,
        "max_tokens": 8192,
        "response_format": {"type": "json_object"},
    }
    completion = client.chat.completions.create(**request_kwargs)
    raw_payload = completion.choices[0].message.content
    return json.loads(raw_payload)

In [14]:
class TextStructurer:
    """Single-call structurer: asks the model to turn raw text into markdown."""

    def structure_text(self, text: str) -> "str | None":
        """
        Structure text as markdown with a single completion request.

        Args:
            text: Text to convert.

        Returns:
            The markdown produced by the model, or None if the request failed
            (the error is printed rather than raised).
        """
        prompt = """
        다음 텍스트를 분석하여 주요 섹션별로 나누고 마크다운 형식으로 구조화해주세요.
        
        다음 규칙을 따라주세요:
        1. 주요 섹션은 ## (h2)로 시작
        2. 하위 섹션은 ### (h3)로 시작
        3. 중요한 포인트는 볼드체(**) 사용
        4. 리스트 항목은 적절히 구분하여 표시
        5. 핵심 개념이나 용어는 `코드 형식`으로 강조
        
        원본 텍스트:
        {text}
        
        마크다운 형식으로 구조화된 결과를 제공해주세요. 
        
        ```markdown 없이 구조화된 마크다운 형식으로 텍스트를 제공하시면 됩니다. 모든 내용을 빠짐없이 구조화해주세요.: 
        """

        try:
            response = get_completion([
                {"role": "system", "content": "You are an expert at structuring text into well-organized markdown sections."},
                {"role": "user", "content": prompt.format(text=text)}
            ])

            return response

        except Exception as e:
            print(f"Error during text structuring: {str(e)}")
            return None

    def save_markdown(self, file_name: str, markdown_text: str, output_dir: str = "markdown_input") -> None:
        """
        Save markdown text to a UTF-8 file.

        The output directory is created when it does not exist yet. Failures
        are printed, not raised.

        Args:
            file_name: Name of the file to write.
            markdown_text: Markdown text to save.
            output_dir: Target directory (default: "markdown_input").
        """
        output_path = os.path.join(output_dir, file_name)
        try:
            # Without this, the first run in a fresh checkout fails to save.
            os.makedirs(output_dir, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(markdown_text)
            print(f"Markdown saved to: {output_path}")
        except Exception as e:
            print(f"Error saving markdown: {str(e)}")


In [69]:
from typing import List, Dict

class TwoStepTextStructurer:
    """Two-pass structurer: first ask the model for a list of sections, then
    ask it to render each section as markdown."""

    def __init__(self):
        pass

    def extract_sections(self, text: str) -> List[Dict[str, str]]:
        """
        Step 1: ask the model for the main sections of a long text.

        Args:
            text: Text to analyse.

        Returns:
            The "sections" list from the model's JSON reply (each item is
            expected to have "title" and "description"). An empty list is
            returned when the request fails or the reply has no usable
            "sections" list; the error is printed rather than raised.
        """
        prompt = f"""
        아래 긴 텍스트를 주요 섹션 별로 나눠주고 이 섹션은 어떤 것인지 간단하게 설명해주세요.
        
        어떠한 주요 섹션이라도 놓치지 말고 모두 뽑아주세요. 
        
        텍스트:
        {text}
        
        Example Output 은 다음과 같이 json 형식으로 제공해주세요. 
        {{
            "sections": [
                {{
                    "title": "section1",
                    "description": "section1 description"
                }},
                {{
                    "title": "section2",
                    "description": "section2 description"
                }},
                {{
                    "title": "section3",
                    "description": "section3 description"
                }},
                ...
            ],
        }}
        """

        try:
            response = get_completion_json_output([
                {"role": "system", "content": "You are an expert at analyzing text and organizing it into key sections, breaking down the content into well-structured segments."},
                {"role": "user", "content": prompt}
            ])
            sections = response["sections"]
            if not isinstance(sections, list):
                raise TypeError("'sections' is not a list")
            return sections
        except Exception as e:
            print(f"Error during extract sections: {str(e)}")
            # Same falsy/empty-iterable behaviour as before, but with the
            # type the caller actually iterates over.
            return []

    def structure_text_by_sections(self, text: str, sections: List[Dict[str, str]]) -> str:
        """
        Step 2: render each section from step 1 as markdown and join the results.

        No per-section text extraction or chunking is implemented: the full
        original text is sent with every section request and the model is
        asked to pick out the relevant part.

        Args:
            text: The full original text.
            sections: Section descriptors with "title" and "description" keys.
                Items that are not dicts or lack a title are skipped with a
                printed message instead of aborting the run.

        Returns:
            str: The per-section markdown, each followed by a newline and
                joined with newlines. Sections whose request failed are omitted.
        """
        structured_sections = []

        for section in sections:
            if not isinstance(section, dict) or not section.get("title"):
                print(f"Error during section structuring: malformed section {section!r}")
                continue

            title = section["title"]
            description = section.get("description", "")

            prompt = f"""
            원본 텍스트에서 섹션'{title}'에 해당하는 내용을 뽑아주세요.
            
            섹션 설명인 '{description}'을 참고해서 관련된 내용을 모두 빠짐없이 뽑아주시면 됩니다. 
            
            다음은 섹션 '{title}'에 해당하는 원본 텍스트입니다:
            
            {text}
            
            섹션에 해당하는 내용을 뽑아올 때 이 텍스트를 마크다운 형식으로 구조화해주세요.
            규칙:
            1. 섹션 제목은 ## (h2)
            2. 하위 섹션은 ### (h3)
            3. 중요한 포인트는 **볼드체**
            4. 리스트 항목은 - 또는 * 사용
            5. 코드나 핵심 용어는 `백틱`으로 감싸 강조
            6. ```markdown 없이 구조화된 마크다운 형식으로 텍스트를 제공하시면 됩니다. 모든 내용을 빠짐없이 구조화해주세요.: 
            """

            try:
                response = get_completion([
                    {"role": "system", "content": "You are an expert at structuring text into markdown."},
                    {"role": "user", "content": prompt}
                ])

                structured_sections.append(f"{response}\n")

            except Exception as e:
                print(f"Error during section structuring: {str(e)}")
                continue

        final_result = "\n".join(structured_sections)
        return final_result

    def run_two_step_structure(self, text: str) -> str:
        """
        Convenience entry point: extract sections, then structure each one.

        Args:
            text: Text to structure.

        Returns:
            str: The combined markdown (empty when no section was produced).
        """
        sections = self.extract_sections(text)
        structured_result = self.structure_text_by_sections(text, sections)
        return structured_result

    def save_markdown(self, file_name: str, markdown_text: str, output_dir: str = "markdown_input") -> None:
        """
        Save markdown text to a UTF-8 file.

        The output directory is created when it does not exist yet. Failures
        are printed, not raised.

        Args:
            file_name: Name of the file to write.
            markdown_text: Markdown text to save.
            output_dir: Target directory (default: "markdown_input").
        """
        output_path = os.path.join(output_dir, file_name)
        try:
            os.makedirs(output_dir, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(markdown_text)
            print(f"Markdown saved to: {output_path}")
        except Exception as e:
            print(f"Error saving markdown: {str(e)}")


In [38]:
text = "The deadline to apply for the first YC Spring Batch is February 11th. If you're accepted, you'll receive $500,000 in investment, plus access to the best startup community in the world. So apply now and come build the future with us. Large language models are getting bigger, much bigger. They're also getting smarter. Over the past few years, AI labs have hit on what feels like a winning strategy. Scaling. More parameters, more data, more compute. Keep scaling the models and they keep improving. You know, just like Moore's law, we saw the doubling in performance every 18 months. With AI, we have now started to see the doubling every six months or so. But could that be coming to an end? Is the era of scaling finally over? Or are we standing right at the beginning of a brand new scaling paradigm, one that promises to revolutionize AI forever? In November of 2019, OpenAI released GPT-2, its largest ever model with one and a half billion parameters. The next summer, they released its successor, GPT-3, which was something we'd never seen before. Not only was GPT-3 far more useful and usable, it was also much bigger, over 100 times bigger than GPT-2. The era of scaling laws had arrived. Before GPT-3, LLMs were already getting bigger. But it was still anyone's guess whether or not that extra size, data and compute would be worth it. There was no guarantee that making your model 100 times bigger would also make it 100 times better. What if they started to run into diminishing returns? It wasn't until January of 2020, when Jared Kaplan, Sam McCandlish, and their colleagues at OpenAI released the influential Scaling Laws for Neural Language Models paper, that the field started to take notice. Think of training AI models like a recipe. You have three main ingredients. The model itself, the data it's trained on, and the compute power used to train it. Larger models have more parameters. 
These are the internal values of the neural net that are tweaked and trained in order to make predictions. These models are also typically trained on much more data, measured in tokens, which for LLMs are often words or parts of words. Finally, training these larger models requires computing power, which means more GPUs running for longer, using more and more energy. What the Scaling Laws paper revealed was that by cranking up all three, the parameters, the data, and the compute, the result was a smooth, consistent improvement in model performance in the form of a power law. Performance, it turned out, depends much more on scale than on the algorithm. Later in the year, more research from OpenAI confirmed that these Scaling Laws worked for other kinds of models too. On text-to-image, image-to-text, and even math, the same Scaling Laws were there. But back in early 2020, LLM Scaling Laws were pretty much unknown outside of OpenAI. That is, except for one person. The anonymous researcher and writer, Gwern, was one of the first people to hone in on what he called the Scaling Hypothesis. Scale up the size, the data, and the compute, and watch intelligence emerge. Maybe intelligence really is just, like, a lot of compute applied to a lot of data, applied to a lot of parameters. Maybe Moravec and Lag and Kurzweil were right. Gwern's post brought Scaling Laws into the mainstream. And over time, what started as a quiet observation quickly turned into a foundational principle for AI development. But OpenAI's research was just a part of the picture. In 2022, Google DeepMind released their own research on Scaling Laws, and they added an important missing piece. It turned out that it's not just about making models bigger, it's also about making sure you train them on enough data. Researchers were looking to find the most optimal model size and training data for a given compute budget. So they trained over 400 models of different sizes with different amounts of data. 
And what they found was surprising. Their research suggested that previous LLMs, like GPT-3, were actually under-trained. These models were huge, but they hadn't been trained on enough text to fully realize their potential. To test this, they trained Chinchilla, an LLM less than half the size of GPT-3, but with four times more data. And it won. Chinchilla was far better than models double, even triple, its size. These so-called Chinchilla Scaling Laws meant that training the optimal model wasn't just about making the model larger, but also about having enough data to feed it. Chinchilla was a huge milestone on the road to training the frontier AI models we have today, like GPT-4.0, CLOD 3.5 SONNET, and others. Labs learned they could trust in the scaling laws and get reliably better and better models. So the future of AI is just bigger and bigger models forever, right? Well, recently, there's been plenty of debate within the AI community about whether or not we've finally reached the limits of scaling laws. Some argued that as the latest generation of models have gotten bigger and more expensive, capabilities have started to plateau. There's a lot of debate, in fact, just in the last multiple weeks, though, have we hit the wall with scaling laws? The current generation of LLM models are roughly, you know, a few companies have converged at the top. But I think they're all working on our next versions, too. We're increasing GPUs at the same, like, rate, but we're not getting the intelligence improvements at all. Meanwhile, rumors have leaked out of major labs about failed training runs and diminishing returns. Others have speculated that the lack of high-quality data to train new models has also become a major bottleneck. One practical issue we could have is we could run out of data. For various reasons, I think that's not going to happen. But, you know, if you look at it very, very naively, we're not that far from running out of data. 
And so it's like, we just don't have the data to continue the scaling curves. So if the old scaling laws are beginning to lose their edge, what comes next? What if there were a new frontier for scaling from a brand new kind of model? OpenAI's new class of reasoning models hints at a potential new direction. In a previous video, we explained how O1 learns to think through complex problems using its own chain of thought. And OpenAI researchers found that the longer O1 was able to think, the better it performed. It wasn't immediately clear how well this strategy would continue to scale up. But now, with the recent release of its successor, O3, the sky seems to be the limit for this new paradigm of scaling LLMs. O3 made headlines when it was announced, as it smashed benchmarks that were previously considered far out of reach for AI. From software engineering to math to PhD-level science questions, O3 easily surpasses the old, state-of-the-art results. O3 isn't just a small improvement on its predecessors. It's a huge leap. And OpenAI researchers say they have every reason to believe this trajectory will continue. It may even be on a path to artificial general intelligence. Instead of continuing to scale up the model size during training, it seems likely that researchers will shift focus to scaling the amount of compute available to the model for its chain of thought, also called test-time compute. By letting models think for longer, LLMs like O1 and O3 can leverage more compute on the fly, scaling up their intelligence when it's needed for harder and harder problems. Scaling pre-training may have plateaued. But by training test-time compute, OpenAI may have just opened up an entirely new paradigm for scaling laws, potentially unlocking capabilities we never thought possible. Large-language models are a key piece of the hunt to artificial general intelligence. These same principles of scaling appear to hold for other models too. 
Image diffusion models, protein folding, and chemical models. Even world models for robotics. Like for self-driving. One thing is clear. It might be mid-game for large-language models, but we are clearly still in the early game for scaling other modalities. Buckle up."
# Run the two-step pipeline on the sample transcript assigned to `text` above
# and save the result as test.md in save_markdown's default output directory.
text_structurer = TwoStepTextStructurer()
structured_text = text_structurer.run_two_step_structure(text)
text_structurer.save_markdown("test.md", structured_text)

Extracted Sections: [{'title': 'YC Spring Batch Application', 'description': 'Information about the application deadline for the YC Spring Batch, the benefits of being accepted, and a call to action to apply.'}, {'title': 'Scaling of Large Language Models', 'description': "Discussion on the trend of increasing size and intelligence of large language models (LLMs), the strategy of scaling, and the comparison to Moore's law."}, {'title': 'GPT-2 and GPT-3 Releases', 'description': 'Overview of the release of GPT-2 and its successor GPT-3 by OpenAI, highlighting the significant increase in size and capabilities.'}, {'title': 'Scaling Laws for Neural Language Models', 'description': 'Introduction to the influential paper on scaling laws for neural language models by Jared Kaplan, Sam McCandlish, and colleagues, and its impact on the field.'}, {'title': 'Ingredients for Training AI Models', 'description': "Explanation of the three main ingredients for training AI models: the model itself, th

In [47]:
from typing import Dict
import re

class MarkdownParser:
    """Splits markdown text into {section title: section body} by header level."""

    # ATX header: 1-6 '#' characters, at least one space/tab, then the title.
    _HEADER_RE = re.compile(r'^(#{1,6})[ \t]+(.*)$')

    def __init__(self, max_header_level: int = 2):
        """
        Args:
            max_header_level: Deepest header level that starts a new section
                (default 2: only "#" and "##" split the text).
        """
        self.max_header_level = max_header_level

    def parse_markdown(self, markdown_text: str) -> Dict[str, str]:
        """
        Parse markdown text and split it into sections.

        A header whose level is <= max_header_level starts a new section;
        deeper headers stay in the body of the current section. Text before
        the first section header is ignored. Lines inside fenced code blocks
        (``` or ~~~) are never treated as headers, and a line such as
        "#hashtag" (no space after the '#') is ordinary text.

        Sections with an empty title or no body lines are dropped, and a
        repeated title overwrites the earlier section with the same title.

        Args:
            markdown_text: Markdown source.

        Returns:
            Dict[str, str]: Section title -> stripped section body, in
                document order.
        """
        sections = {}
        current_content = []
        current_title = None
        fence_marker = None  # "```" or "~~~" while inside a fenced code block

        for line in markdown_text.split('\n'):
            stripped = line.lstrip()

            # Track fenced code blocks so "# comment" lines inside them are
            # kept as content.
            if fence_marker is None and stripped.startswith(('```', '~~~')):
                fence_marker = stripped[:3]
                if current_title is not None:
                    current_content.append(line)
                continue
            if fence_marker is not None:
                if stripped.startswith(fence_marker):
                    fence_marker = None
                if current_title is not None:
                    current_content.append(line)
                continue

            header = self._HEADER_RE.match(line)
            if header and len(header.group(1)) <= self.max_header_level:
                # Store the section that just ended.
                if current_title and current_content:
                    sections[current_title] = '\n'.join(current_content).strip()

                # Start a new section.
                current_title = header.group(2).strip()
                current_content = []
            elif current_title is not None:
                # Body text, including headers deeper than max_header_level.
                current_content.append(line)

        # Store the last section.
        if current_title and current_content:
            sections[current_title] = '\n'.join(current_content).strip()

        return sections

    def save_markdown(self, file_name: str, markdown_text: str, output_dir: str = "markdown_input") -> None:
        """
        Save markdown text to a UTF-8 file, creating output_dir if needed.

        Args:
            file_name: Name of the file to write.
            markdown_text: Markdown text to save.
            output_dir: Target directory (default: "markdown_input").
        """
        output_path = os.path.join(output_dir, file_name)
        os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_text)
        print(f"Markdown saved to: {output_path}")


In [43]:
# Split the structured markdown into a {title: content} dict using the
# parser's default max_header_level of 2.
markdown_parser = MarkdownParser()
sections = markdown_parser.parse_markdown(structured_text)

# Print every section, separated by a rule, for a quick visual check.
for title, content in sections.items():
    print(f"Title: {title}\n")
    print(f"Content: {content}\n")
    print("-" * 80)


Title: YC Spring Batch Application

Content: ### Application Deadline
- **The deadline to apply for the first YC Spring Batch is February 11th.**

### Benefits of Being Accepted
- If you're accepted, you'll receive **$500,000 in investment**.
- You'll also gain **access to the best startup community in the world**.

### Call to Action
- **Apply now** and come build the future with us.

---

The rest of the text discusses advancements in AI and scaling laws, which are not directly related to the YC Spring Batch Application section. Therefore, they have been excluded from this markdown structure.

--------------------------------------------------------------------------------
Title: Scaling of Large Language Models

Content: ### The Trend of Increasing Size and Intelligence
- **Large language models (LLMs) are getting bigger and smarter.** Over the past few years, AI labs have adopted a winning strategy: **scaling**. This involves increasing the number of parameters, the amount of data,

In [44]:
import time
from tqdm import tqdm

class TextExplainer:
    """Explains parsed sections one by one, keeping the whole exchange in a
    running conversation so later sections can build on earlier ones."""

    def __init__(self):
        self.conversation_history = []
        self.delay = 1  # seconds to sleep after each successful API call

    def explain_section(self, section_title: str, section_content: str, is_first: bool = False) -> str:
        """
        Ask the model to explain one section.

        Args:
            section_title: Title of the section.
            section_content: Body of the section.
            is_first: Whether this is the first section (selects the prompt
                that does not refer to a previous explanation).

        Returns:
            str: The explanation, or an "Error: ..." string when the request
                failed. On failure the conversation history is left exactly
                as it was before the call.
        """
        if is_first:
            prompt = f"""다음 섹션 '{section_title}'의 내용을 명확하고 자세하게 설명해주세요.
            
            섹션 내용:
            {section_content}"""
        else:
            prompt = f"""이전 설명을 바탕으로, 다음 섹션 '{section_title}'의 내용을 설명해주세요.
            
            섹션 내용:
            {section_content}"""

        # The request carries the full history so the model keeps context.
        self.conversation_history.append({"role": "user", "content": prompt})

        try:
            response = get_completion(self.conversation_history, model_name="deepseek-reasoner")
        except Exception as e:
            # Drop the unanswered user turn; otherwise every later request
            # would carry two user messages in a row.
            self.conversation_history.pop()
            print(f"Error explaining section {section_title}: {str(e)}")
            return f"Error: Failed to explain section {section_title}"

        self.conversation_history.append({"role": "assistant", "content": response})

        # Pause between API calls.
        time.sleep(self.delay)

        return response

    def explain_text(self, sections: Dict[str, str]) -> Dict[str, str]:
        """
        Explain every section in order.

        Args:
            sections: Mapping of section title to section content.

        Returns:
            Dict[str, str]: Mapping of section title to explanation (or to
                the error string produced by explain_section).
        """
        explanations = {}

        print("\nProcessing sections:")
        # enumerate() has no length, so give tqdm the total explicitly;
        # without it no progress bar can be drawn.
        progress = tqdm(enumerate(sections.items()), total=len(sections), desc="Explaining sections")
        for i, (title, content) in progress:
            print(f"\nProcessing: {title}")
            explanation = self.explain_section(title, content, is_first=(i == 0))
            explanations[title] = explanation

        return explanations

    def get_conversation_history(self):
        """Return the conversation history list (the live object, not a copy)."""
        return self.conversation_history

    def save_explanations(self, explanations: Dict[str, str], file_name: str, output_dir: str = "explanation_output") -> str:
        """
        Write the explanations to a markdown file, one "## title" block each.

        Args:
            explanations: Mapping of section title to explanation.
            file_name: Name of the file to write.
            output_dir: Target directory (default: "explanation_output");
                created when missing.

        Returns:
            str: Path of the written file.
        """
        output_path = os.path.join(output_dir, file_name)
        os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            for title, explanation in explanations.items():
                f.write(f"## {title}\n\n{explanation}")
                f.write("\n\n---\n")
        print(f"Explanations saved to: {output_path}")

        return output_path

In [45]:
from typing import Optional, List, Dict
import time

class TextQA:
    """Question answering on top of an existing conversation history."""

    def __init__(self, context: Optional[List[Dict[str, str]]] = None):
        # A non-empty context list is used as-is (not copied), so questions
        # and answers are appended to the caller's list.
        self.conversation_history = context or []
        self.delay = 1  # seconds to sleep after each successful API call

    def ask_question(self, question: str) -> str:
        """
        Answer a question about the text already present in the conversation.

        Args:
            question: The question to ask.

        Returns:
            str: The model's answer, or an "Error: ..." string when the
                request failed. On failure the conversation history is left
                exactly as it was before the call.
        """
        prompt = f"""Based on the text we discussed, please answer the following question in Korean. 
            Be specific and cite relevant sections when possible.

            Question: {question}"""

        self.conversation_history.append({"role": "user", "content": prompt})

        try:
            response = get_completion(self.conversation_history)
        except Exception as e:
            # Remove the unanswered user turn so the next request does not
            # carry two user messages in a row.
            self.conversation_history.pop()
            print(f"Error processing question: {str(e)}")
            return "Error: Failed to process question"

        self.conversation_history.append({"role": "assistant", "content": response})

        time.sleep(self.delay)
        return response

    def _visible_conversations(self) -> List[Dict[str, str]]:
        """Return the history without messages starting with "Here's the text summary:"."""
        return [
            msg for msg in self.conversation_history
            if not msg["content"].startswith("Here's the text summary:")
        ]

    def view_conversation_history(self, start_idx: int = 0, end_idx: Optional[int] = None) -> None:
        """
        Print the conversation.

        Indices refer to the filtered history (summary messages removed).

        Args:
            start_idx: First index to print (default: 0).
            end_idx: Index to stop before (default: None, print to the end).
        """
        conversations = self._visible_conversations()

        end_idx = end_idx if end_idx is not None else len(conversations)

        print("\n=== 대화 내역 ===\n")
        for i, msg in enumerate(conversations[start_idx:end_idx], start=start_idx):
            role = msg["role"].upper()
            if role == "ASSISTANT":
                print(f"\n🤖 Assistant ({i}):\n{msg['content']}\n")
                print("-" * 80)
            elif role == "USER":
                print(f"\n👤 User ({i}):\n{msg['content']}\n")
                print("-" * 80)

    def get_last_n_conversations(self, n: int = 1) -> None:
        """
        Print the most recent n messages of the filtered conversation.

        Args:
            n: Number of most recent messages to print (default: 1).
        """
        conversations = self._visible_conversations()
        start_idx = max(0, len(conversations) - n)
        self.view_conversation_history(start_idx)

    def get_conversation_history(self) -> List[Dict[str, str]]:
        """Return the full conversation history (the live list, not a copy)."""
        return self.conversation_history

In [46]:
from rich.console import Console
from rich.markdown import Markdown
from rich.panel import Panel
from rich.syntax import Syntax
from rich.table import Table
from typing import Dict
import os

class MarkdownPrinter:
    """Pretty-print Markdown content through a rich ``Console``."""

    def __init__(self):
        self.console = Console()

    def print_markdown_file(self, file_path: str):
        """Read a UTF-8 Markdown file and print it rendered.

        Any exception raised while reading or rendering is caught and
        reported on the console in bold red; nothing is re-raised.

        Args:
            file_path: Path of the Markdown file to display.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                markdown_content = f.read()

            # Render the Markdown and print it.
            self.console.print(Markdown(markdown_content))

        except Exception as e:
            # markup=False: the exception text embeds the file path, so it is
            # shown literally instead of being parsed as console markup
            # (square brackets in it would otherwise be read as style tags).
            self.console.print(
                f"Error reading file: {e}",
                style="bold red",
                markup=False,
            )

    def print_sections(self, sections: Dict[str, str]):
        """Print each section as a titled panel followed by its rendered body.

        Args:
            sections: Mapping of section title to Markdown content.
        """
        for section, content in sections.items():
            # Section title. Built with markup disabled so brackets in a
            # title are displayed verbatim rather than interpreted as tags.
            self.console.print("\n")
            title = self.console.render_str(
                str(section), style="bold cyan", markup=False
            )
            self.console.print(Panel(title, border_style="cyan"))

            # Section body.
            self.console.print(Markdown(content))

            # Separator line.
            self.console.print("[dim]" + "="*80 + "[/]")

In [67]:

def process_text(youtube_url: str, long_text_threshold: int = 1000) -> "tuple[str, TextQA]":
    """Run the full pipeline for one YouTube video.

    Steps, in order: download the audio, transcribe it, structure the
    transcript into Markdown, parse that into sections, generate explanations
    and build a QA helper seeded with the explainer's conversation history.
    Intermediate results are saved under the name derived from the audio file.

    Args:
        youtube_url: URL of the video whose audio is processed.
        long_text_threshold: Transcript length (``len(text)``) above which the
            two-step structurer is used instead of the single-pass one.
            Defaults to 1000, the value that was previously hard-coded.

    Returns:
        A ``(explanation_path, qa)`` pair: the value returned by
        ``TextExplainer.save_explanations`` and a ``TextQA`` created with the
        explainer's conversation history as context.
    """
    file_path = download_audio(youtube_url)
    file_name = extract_file_name(file_path)

    audio_transcriber = AudioTranscriber()
    text = audio_transcriber.transcribe_audio(file_path)
    audio_transcriber.save_text(text, file_name)

    # Long transcripts go through the two-step structurer, short ones
    # through the single-pass one; both are saved the same way.
    if len(text) > long_text_threshold:
        text_structurer = TwoStepTextStructurer()
        structured_text = text_structurer.run_two_step_structure(text)
    else:
        text_structurer = TextStructurer()
        structured_text = text_structurer.run_text_structure(text)
    text_structurer.save_markdown(file_name, structured_text)

    markdown_parser = MarkdownParser()
    sections = markdown_parser.parse_markdown(structured_text)
    markdown_parser.save_markdown(file_name, structured_text)

    if not sections:
        # Best-effort: keep going, but say so. Otherwise an upstream
        # structuring failure only shows up as an empty explanation output.
        print("Warning: no sections were parsed from the structured text; explanations will be empty.")

    explainer = TextExplainer()
    explanations = explainer.explain_text(sections)
    explanation_path = explainer.save_explanations(explanations, file_name)

    qa = TextQA(context=explainer.get_conversation_history())
    return explanation_path, qa

def extract_file_name(file_path: str) -> str:
    """
    Return the bare file name of a path, without directories or extension.

    Args:
        file_path: Full path of the file.

    Returns:
        str: The final path component with its last extension removed.

    Example:
        "audio/My Video Title-abc123.mp3" -> "My Video Title-abc123"
    """
    # Drop the directory part, keeping only the final component.
    _directory, tail = os.path.split(file_path)
    # Peel off the (last) extension.
    stem, _extension = os.path.splitext(tail)
    return stem

In [71]:
# Run the whole pipeline on one video; the returned explanation path and QA
# object are used by the cells that follow.
explanation_path, qa = process_text("https://www.youtube.com/watch?v=z0wt2pe_LZM")

[youtube] Extracting URL: https://www.youtube.com/watch?v=z0wt2pe_LZM
[youtube] z0wt2pe_LZM: Downloading webpage
[youtube] z0wt2pe_LZM: Downloading tv player API JSON
[youtube] z0wt2pe_LZM: Downloading ios player API JSON
[youtube] z0wt2pe_LZM: Downloading player 0f7c1eff
[youtube] z0wt2pe_LZM: Downloading m3u8 information
[info] z0wt2pe_LZM: Downloading 1 format(s): 251
[download] Destination: audio/2024： The Year the GPT Wrapper Myth Proved Wrong-z0wt2pe_LZM.webm
[download] 100% of   33.03MiB in 00:00:44 at 759.44KiB/s   
[ExtractAudio] Destination: audio/2024： The Year the GPT Wrapper Myth Proved Wrong-z0wt2pe_LZM.mp3
Deleting original file audio/2024： The Year the GPT Wrapper Myth Proved Wrong-z0wt2pe_LZM.webm (pass -k to keep)
Processing chunk 1/8...
Processing chunk 2/8...
Processing chunk 3/8...
Processing chunk 4/8...
Processing chunk 5/8...
Processing chunk 6/8...
Processing chunk 7/8...
Processing chunk 8/8...
Error during section structuring: Expecting value: line 1 column 1

Explaining sections: 0it [00:00, ?it/s]

Explanations saved to: explanation_output/2024： The Year the GPT Wrapper Myth Proved Wrong-z0wt2pe_LZM





In [55]:
# Console renderer for the generated Markdown.
printer = MarkdownPrinter()

# Print the Markdown file
printer.print_markdown_file(explanation_path)

In [57]:
# Follow-up questions for the QA object; the prompt text is passed on exactly
# as written, including its surrounding whitespace.
question_list = [
    """
    NVIDIA 의 NIM 모델 같은 경우에는 데이터 생성에 최적화된 모델이야? 소셜 미디어 컨텐츠 생성 같은 것도 잘할까? 
    """
]

# Ask only the first question in the list.
response = qa.ask_question(question_list[0])

print(response)

NVIDIA의 **NIM(Neural Inference Module)** 모델은 데이터 생성에 최적화된 모델이지만, 그 활용 범위는 단순한 데이터 생성뿐만 아니라 다양한 도메인에 걸쳐 있습니다. 소셜 미디어 콘텐츠 생성과 같은 작업에도 적합하지만, 이는 사용자가 제공하는 프롬프트와 훈련 데이터에 크게 의존합니다.  

---

### **1. NIM 모델의 데이터 생성 최적화**  

#### **가. 데이터 생성에 최적화된 이유**  
- **고성능 추론**:  
  - NIM은 NVIDIA의 최적화된 하드웨어(예: H100 GPU)와 소프트웨어 스택(예: TensorRT)을 활용해 초고속 추론을 지원합니다.  
  - 분당 120만 토큰 이상의 처리 속도로 대규모 데이터 생성 가능.  
- **다양한 데이터 유형 지원**:  
  - 질문-답변 쌍, 코드 프롬프트, 대화 데이터 등 다양한 유형의 데이터 생성 가능.  
  - 예: "기후 변화의 영향"에 대한 100만 건의 질문-답변 쌍 생성.  

#### **나. 소셜 미디어 콘텐츠 생성 가능성**  
- **적합성**:  
  - NIM은 창의적 텍스트 생성(예: 블로그 포스트, 트윗, 광고 카피)에 적합한 구조를 가지고 있습니다.  
  - 예: "새로운 스마트폰 출시" 관련 소셜 미디어 콘텐츠 생성.  
- **품질 보장**:  
  - 보상 모델과 LLM Judge를 통해 생성된 콘텐츠의 품질을 검증.  
  - 예: 트윗의 창의성과 공감적 표현을 LLM Judge가 평가.  

---

### **2. 소셜 미디어 콘텐츠 생성의 성공 조건**  

#### **가. 프롬프트 설계**  
- **구체적 프롬프트**:  
  - "Z세대를 타겟으로 한 새로운 에너지 드링크 광고 카피 생성"과 같이 명확한 지시 필요.  
- **맥락 제공**:  
  - 브랜드 톤앤매너, 타겟 고객 특성 등 추가 정보 제공.  

#### **나. 훈련 데이터 품질**  
- **도메인 특화 데이터**:  
  - 