In [1]:
from pathlib import Path

from tqdm.auto import tqdm
from minsearch import Index

import docs

In [2]:
# Preparing the data
# Index the data


# Preparing the function
# Setting up the search function

In [3]:
documents = []

data_folder = Path('../data_cache/youtube_videos/')

for f in tqdm(data_folder.glob('*.txt')):
    filename = f.name
    video_id, _ = filename.split('.')

    transcript = f.read_text(encoding='utf-8')

    chunks = docs.sliding_window(transcript, size=3000, step=1500)

    for chunk in chunks:
        chunk['video_id'] = video_id
        documents.append(chunk)

0it [00:00, ?it/s]

In [4]:
index = Index(
    text_fields=['content'],
    keyword_fields=['video_id']
)

index.fit(documents)

<minsearch.minsearch.Index at 0x105549ca0>

In [5]:
from typing import Any, Dict, List, TypedDict

class SearchResult(TypedDict):
    """Represents a single search result entry."""
    start: int
    content: str
    video_id: str


def search(query: str) -> List[SearchResult]:
    """
    Search the index for documents matching the given query.

    Args:
        query (str): The search query string.

    Returns:
        List[SearchResult]: A list of search results. Each result dictionary contains:
            - start (int): The starting position or offset within the source file.
            - content (str): A text excerpt or snippet containing the match.
            - video_id (str): YouTube video ID for the snippet.
    """
    return index.search(
        query=query,
        num_results=5,
    )

In [6]:
from pydantic_ai.messages import FunctionToolCallEvent

async def print_function_calls(ctx, event):
    # Detect nested streams
    if hasattr(event, "__aiter__"):
        async for sub in event:
            await print_function_calls(ctx, sub)
        return

    if isinstance(event, FunctionToolCallEvent):
        print("TOOL CALL:", event.part.tool_name, event.part.args)

### Agent

In [7]:
from pydantic_ai import Agent
from pydantic import BaseModel, Field
import agents
from dotenv import load_dotenv

load_dotenv()



True

In [8]:
from typing import List


class Reference(BaseModel):
    """
    Represents a specific citation to a YouTube video segment used as evidence or context in a paragraph.

    Each reference links to a precise moment in a video where the cited idea or discussion occurs.
    """
    video_id: str = Field(..., description="The unique YouTube video ID (e.g., 'rwuud5wr3J4').")
    timestamp: str = Field(..., description="The timestamp in the format 'mm:ss' or 'h:mm:ss' pointing to the relevant segment.")
    quote: str = Field(..., description="A short excerpt or paraphrase from the referenced segment of the video.")


class Keyword(BaseModel):
    """Research results for a specific keyword"""
    search_keyword: str = Field(..., description="Exact keyword used for search.")
    summary: str = Field(..., description="Short summary of the search result.")
    references: List[Reference] = Field(..., description="Specific references to help us track the findings of the research.")
    relevance_summary: str = Field(..., description="1 sentence for each reference explainig how it supports the keyword's summary — ensure factual consistency.")
    other_ideas: str = Field(..., description="Free-form description of related or complimentary ideas to explore in next stages.")


class StageReport(BaseModel):
    """
    Represents the output of a single stage in the multi-stage research process.

    Each stage includes its numeric order, a list of relevant keywords or queries used,
    and a textual summary describing the key insights or outcomes from that stage.
    """
    stage: int = Field(..., description="The stage number (e.g., 1, 2, or 3) representing the research depth.")
    keywords: List[Keyword] = Field(..., description="List of exact search queries used during this stage.")
    summary: str = Field(..., description="A concise summary of the main findings or insights from this stage.")


class Paragraph(BaseModel):
    """
    Represents a single paragraph within an article section.

    Each paragraph explains one specific idea or subtopic, written in 3–5 sentences,
    and must include at least one reference to a relevant YouTube video segment.
    """
    content: str = Field(
        ...,
        description="A paragraph of 3–5 sentences elaborating on one key idea within the section."
    )
    references: List[Reference] = Field(
        ...,
        description="A list of one or more references to YouTube videos supporting or illustrating the paragraph."
    )


class ArticleSection(BaseModel):
    """
    Represents a single section of the final research article.

    Each section has a descriptive title and a list of paragraphs,
    each paragraph supported by at least one YouTube-based reference.
    """
    title: str = Field(..., description="The title or heading of the section.")
    paragraphs: List[Paragraph] = Field(
        ...,
        description="A list of paragraphs forming the body of this section, each containing content and references."
    )


class ActionPoint(BaseModel):
    """Practical takeaways from the research."""
    point: str = Field(..., description="A concrete recommendation, insight, or action derived from the research.")
    relevance_check: str = Field(..., description="Explain how the referenced quote supports this action point — must show logical connection, not assumption.")
    reference: Reference = Field(..., description="Source supporting this action point.")


class Article(BaseModel):
    """
    Represents the complete research article generated from all research stages.

    The article contains an introductory instruction, a set of structured sections,
    and a conclusion summarizing the overall findings.
    """
    title: str = Field(..., description="The title of the article.")
    introduction: str = Field(..., description="The introduction or contextual overview of the article.")
    sections: List[ArticleSection] = Field(..., description="A list of sections comprising the main body of the article.")
    action_points: List[ActionPoint] = Field(..., description="3-5 key insights or recommendations derived from the findings.")
    conclusion: str = Field(..., description="The final concluding text summarizing findings and insights.")


class ResearchReport(BaseModel):
    """
    Represents the full structured output of the research agent.

    It contains:
    - A list of StageReport objects describing each stage of the research process.
    - An Article object representing the final synthesized report.
    """
    stages: List[StageReport] = Field(..., description="A list of stage summaries capturing the evolution of the research process.")
    article: Article = Field(..., description="The final research article synthesizing all insights from the stages.")

In [9]:
instructions = """
You are an autonomous research agent. Your goal is to perform deep, multi-stage research on the given topic using the available search function. You must iteratively refine your understanding of the topic and its subtopics through structured exploration.

Research process:

stage 1: initial exploration  
- Perform one broad search query to understand the main topic and identify related areas.  
- Summarize key concepts, definitions, and major themes.  

stage 2: broad expansion  
- Perform 5–6 targeted queries based on findings from stage 1.  
- Explore adjacent and contextual topics to build a broader understanding.  
- Identify key debates, challenges, frameworks, and major contributors.  

stage 3: deep investigation  
- Perform 5–6 refined queries focusing on depth.  
- Investigate specific mechanisms, case studies, technical details, or research gaps.  
- Gather diverse viewpoints and data to strengthen depth and accuracy.  

Final deliverable:

Produce a complete research report as valid JSON that fits the ResearchReport schema.  
The article must be long, detailed, and divided into multiple sections and paragraphs — not short summaries.
"""

In [10]:
agent = Agent(
    name='research',
    instructions=instructions,
    tools=[search],
    model='gpt-4o-mini',
    output_type=ResearchReport
)

In [11]:
results = await agent.run(
    user_prompt='how do I make money with AI',
    event_stream_handler=print_function_calls    
)

TOOL CALL: search {"query":"how to make money with AI"}
TOOL CALL: search {"query": "AI monetization strategies"}
TOOL CALL: search {"query": "AI business models"}
TOOL CALL: search {"query": "how startups make money with AI"}
TOOL CALL: search {"query": "AI consulting business"}
TOOL CALL: search {"query": "using AI for marketing and sales"}
TOOL CALL: search {"query": "AI investment opportunities"}
TOOL CALL: search {"query": "AI startups success stories"}
TOOL CALL: search {"query": "AI tools and platforms for business"}
TOOL CALL: search {"query": "AI in e-commerce making money"}
TOOL CALL: search {"query": "AI-based services profitability"}


In [12]:
report = results.output

In [13]:
report

ResearchReport(stages=[StageReport(stage=1, keywords=[Keyword(search_keyword='how to make money with AI', summary='People explore various avenues to monetize AI, including creating AI products, consulting, and leveraging AI for business improvements.', references=[Reference(video_id='pkcpH5N-GP8', timestamp='10500', quote="Sometimes that the body works in a funny way. it's compensating."), Reference(video_id='pkcpH5N-GP8', timestamp='9000', quote='if you make some money, you take it away from someone else in that market.'), Reference(video_id='pkcpH5N-GP8', timestamp='19500', quote='Look at it from a staffing perspective these engineers are extremely expensive.')], relevance_summary='The references discuss the economic context in which AI operates and how startups attempt to monetize AI, dealing with both competition and market dynamics.', other_ideas='Investigate specific AI applications in various industries.'), Keyword(search_keyword='AI monetization strategies', summary='Monetizati

In [14]:
for stage in report.stages:
    print(stage)
    print()

stage=1 keywords=[Keyword(search_keyword='how to make money with AI', summary='People explore various avenues to monetize AI, including creating AI products, consulting, and leveraging AI for business improvements.', references=[Reference(video_id='pkcpH5N-GP8', timestamp='10500', quote="Sometimes that the body works in a funny way. it's compensating."), Reference(video_id='pkcpH5N-GP8', timestamp='9000', quote='if you make some money, you take it away from someone else in that market.'), Reference(video_id='pkcpH5N-GP8', timestamp='19500', quote='Look at it from a staffing perspective these engineers are extremely expensive.')], relevance_summary='The references discuss the economic context in which AI operates and how startups attempt to monetize AI, dealing with both competition and market dynamics.', other_ideas='Investigate specific AI applications in various industries.'), Keyword(search_keyword='AI monetization strategies', summary='Monetization strategies include productization

In [15]:
article = report.article

In [16]:
print('## Introduction')
print()

print(article.introduction)
print()

for section in article.sections:
    print('##', section.title)
    print()
    for p in section.paragraphs:
        print('## paragraph')
        print(p)
        print()

print('## Conclusions')
print()
print(article.conclusion)

## Introduction

Artificial intelligence has opened up a plethora of opportunities for generating revenue across various industries. From startups launching innovative AI-driven solutions to established companies integrating AI into their operations for enhanced efficiency, the ways to monetize AI are vast and varied. This report explores these strategies, business models, and the challenges faced while generating income through AI technologies.

## Understanding AI Monetization Strategies

## paragraph
content='Monetizing artificial intelligence effectively requires businesses to adopt strategies that capitalize on their technological capabilities and market needs. Initially, companies focused on creating products based on AI technologies or directly offering AI as a service to customers. However, the landscape has evolved to include myriad monetization avenues such as subscription models, consulting services, and custom AI solutions. These allow organizations to harness research and 

### Final Agent

In [None]:
# instruction
# Agent 
# Run it

In [17]:
instructions = """
You are a deep research agent exploring topics using a proprietary podcast/video database.

Given a user question, perform a structured, multi-stage exploration to understand
the topic deeply and comprehensively through the database.

## DATA SOURCE

- You can only use the results from the `search()` function.
- Each search result includes `video_id` and snippet text.
- All references must link to YouTube URLs derived from the database and contain a quote
- Do not create, infer, or guess podcast names, titles, or timestamps.

## PROCESS

Stage 1 — Initial Search

1. Use the user's question as the first query with `search()`.
2. Summarize the most relevant insights from the results.
3. Identify key ideas, recurring themes, or related questions.

Stage 2 — Expansion

1. Generate 5-7 follow-up queries that explore related subtopics or complementary ideas.
2. For each query, call `search()` again.
3. Summarize the main insights from each result.

Stage 3 — Deep Dive

1. From the Stage 2 findings, generate 5-7 deeper or contrasting exploration queries.
2. For each, call `search()` again and summarize findings.
3. At the end of Stage 3, write an article that describes everything you discovered.

## Exploration rules

You are not allowed to stop until you perform at least 11 queries:

- 1 initial query for stage 1
- 5-7 follow up queries for stage 2
- 5-7 deeper exploration queries for stage 3

## References

When generating a claim or action point:

- Read the reference quote carefully.
- Write the claim as a faithful paraphrase or inference strictly supported by the quote.
- After each claim, provide a 1–2 sentence "relevance_check" explaining why the quote supports it.
- Do not generalize or introduce new facts not mentioned in the quote.

## Article

- The resulting article should contain an introduction, 5-8 sections and a conclusion.
- Each section should present 3-4 claims (backed by references) grouped by topics
- Each claim should be a paragraph with 3-4 sentences.
"""

In [35]:
from pydantic import BaseModel, Field
from typing import List

class Reference(BaseModel):
    """Citations that directly tie each claim to a verifiable source."""
    quote: str = Field(..., description="A short, verbatim quote (2–4 sentences) from the database snippet.")
    youtube_id: str = Field(..., description="Video ID")
    timestamp: str = Field(..., description="Timestamp to the exact position in the video where the quote is, 'h:mm:ss' or 'mm:ss' format.")

class Keyword(BaseModel):
    """Research results for a specific keyword"""
    search_keyword: str = Field(..., description="Exact keyword used for search.")
    summary: str = Field(..., description="Short summary of the search result.")
    references: List[Reference] = Field(..., description="Specific references to help us track the findings of the research.")
    relevance_summary: str = Field(..., description="1 sentence for each reference explainig how it supports the keyword's summary — ensure factual consistency.")
    other_ideas: str = Field(..., description="Free-form description of related or complimentary ideas to explore in next stages.")

class StageReport(BaseModel):
    """Summarizes what was found during a single exploration stage."""
    stage: int = Field(..., description="Stage number (1 for initial search, 2 for expansion, 3 for deep dive).")
    keywords: List[Keyword] = Field(..., description="Search keywords ")
    summary: str = Field(..., description="A concise synthesis of insights found in this stage, summarizing themes and discoveries from all queries executed in the stage.")

class Claim(BaseModel):
    """A factual statement supported by one specific reference."""
    description: str = Field(..., description=(
        "A short paragraph (3–4 sentences) that paraphrases the meaning of the quote in your own words. "
        "It must stay faithful to the factual content of the quote — no speculation or extrapolation."
    ))
    relevance_check: str = Field(..., description=(
        "1–2 sentences explaining *why* this quote supports the claim — a brief justification to ensure factual grounding."
    ))
    reference: Reference = Field(..., description=(
        "A direct quote that explicitly supports or demonstrates the statement made in 'description'. "
        "The claim should be a paraphrase or interpretation of this quote."
    ))

class ArticleSection(BaseModel):
    """One thematic part of the final article, containing multiple claims."""
    title: str = Field(..., description="A concise section title summarizing the theme.")
    claims: List[Claim] = Field(..., description="3–4 claims that explore different aspects of this section's theme.")

class ActionPoint(BaseModel):
    """Practical takeaways from the research."""
    point: str = Field(..., description="A concrete recommendation, insight, or action derived from the research.")
    relevance_check: str = Field(..., description="Explain how the referenced quote supports this action point — must show logical connection, not assumption.")
    reference: Reference = Field(..., description="Source supporting this action point.")

class Article(BaseModel):
    """The final synthesized output — a structured article summarizing all research stages."""
    title: str = Field(..., description="Compelling headline summarizing the topic and main insight (7-10 words).")
    introduction: str = Field(..., description="A short overview (3-4 paragraphs) explaining what the research explored and why it matters.")
    sections: List[ArticleSection] = Field(..., description="5-8 well-structured sections presenting grouped claims by topic.")
    action_points: List[ActionPoint] = Field(..., description="Optional 3-5 key insights or recommendations derived from the findings.")
    conclusion: str = Field(..., description="Final synthesis paragraph summarizing the broader takeaways and closing thoughts.")

class ResearchReport(BaseModel):
    """The complete record of exploration across all stages, culminating in the final article."""
    stages: List[StageReport] = Field(..., description="Exploration stage reports (Stage 1–3) detailing the search process.")
    article: Article = Field(..., description="The final article.")


# 1. Research Report 

# 2.
# Stages - stageReport - stage, keyword, summary
# Article - Article - title, introduction, sections, actions_points, conclusion - 

# 3.
# Sections  - ArticleSection - title, Claims 
# Action_points - action point - point, relevance check, reference

# 4.
# Claim - description, relevance check, reference

# 5. 
# Reference - quote, timestamp, id

In [36]:

agent_tools = [search]

In [37]:
agent = Agent(
    name='search',
    instructions=instructions,
    tools=agent_tools,
    model='gpt-4o-mini',
    output_type=ResearchReport
)

In [38]:
question = 'how do I make money with AI'

In [39]:
results = await agent.run(
    user_prompt=question,
    event_stream_handler=print_function_calls
)


TOOL CALL: search {"query":"how to make money with AI"}
TOOL CALL: search {"query": "AI business models and monetization strategies"}
TOOL CALL: search {"query": "freelancing opportunities with AI"}
TOOL CALL: search {"query": "AI startups and investment opportunities"}
TOOL CALL: search {"query": "how to generate income with AI tools"}
TOOL CALL: search {"query": "AI for passive income generation"}
TOOL CALL: search {"query": "AI freelancing platforms and services"}
TOOL CALL: search {"query": "case studies of successful AI businesses"}
TOOL CALL: search {"query": "generate income through AI consulting"}
TOOL CALL: search {"query": "AI and e-commerce monetization"}
TOOL CALL: search {"query": "successful case studies in AI monetization"}
TOOL CALL: search {"query": "freelancing with AI tools"}
TOOL CALL: search {"query": "AI services and subscription models"}


In [41]:
def to_link(reference) -> str:
    """
    Converts the timestamp to a YouTube URL with a proper time offset.
    Supports both 'h:mm:ss' and 'mm:ss' formats.
    """
    if not reference.timestamp:
        return f"https://www.youtube.com/watch?v={reference.youtube_id}"

    ts = reference.timestamp.strip()
    if not ts:
        return f"https://www.youtube.com/watch?v={reference.youtube_id}"

    parts = ts.split(":")

    try:
        parts = [int(p) for p in parts]
    except ValueError:
        return f"https://www.youtube.com/watch?v={reference.youtube_id}"

    if len(parts) == 3: # h:mm:ss
        hours, minutes, seconds = parts
    elif len(parts) == 2: # mm:ss
        hours, minutes, seconds = 0, parts[0], parts[1]
    elif len(parts) == 1:
        hours, minutes, seconds = 0, 0, parts[0]

    total_seconds = hours * 3600 + minutes * 60 + seconds
    return f"https://www.youtube.com/watch?v={reference.youtube_id}&t={total_seconds}s"

def diplay_reference(reference: Reference): 
    return f"[{reference.quote}]({to_link(reference)})" 

In [42]:
report.article

Article(title='Monetizing AI: Strategies, Opportunities, and Insights for 2023', introduction='In a rapidly evolving landscape, artificial intelligence (AI) stands out as a transformative force redefining the business model of numerous industries. The ability to quickly adapt and effectively monetize AI technology is paramount for companies striving to remain competitive and innovative. This article explores the strategies and opportunities for making money with AI, drawing insights from industry experts and contemporary business practices.', sections=[ArticleSection(title='Understanding AI Monetization Fundamentals', paragraphs=[Paragraph(content='Monetizing AI effectively requires a firm grasp of technology and market needs. Startups often face the pitfall of developing solutions based on technology rather than addressing real-world problems. This lack of market alignment can hinder their success in generating profits. As highlighted in industry conversations, a clear understanding o

In [43]:
report = results.output

# Disyplay stage-by-stage findings
for stage in report.stages:
    print('Stage:', stage.stage)
    for kw in stage.keywords:
        print(' keyword:',  kw.search_keyword)
        print(' summary:',  kw.search_keyword)
        print(' references:',  kw.references)
    print(stage.summary)

# Display the final article
article = report.article
print('#', report.article)
print('## Introduction')
print(article.introduction)

for section in article.sections:
    print('##', section.title)
    for claim in section.claims:
        print(claim.description, '(', diplay_reference(claim.reference), ')')


print('## Action Points')
for action_point in article.action_points:
    print('*', action_point.point, diplay_reference(action_point.reference))
        

print('## Conclusion')
print(article.conclusion)

Stage: 1
 keyword: how to make money with AI
 summary: how to make money with AI
 references: [Reference(quote='"...you have companies saying we need to make more money off of machine learning so that\'s the core driver for monetization and what monetization is teaching companies..."', youtube_id='xCjzA_8S4kI', timestamp='9:29')]
The exploration on making money with AI reveals a trend towards businesses wanting to integrate AI technologies for revenue generation. Companies experience financial pressures and seek strategies to leverage AI effectively.
Stage: 2
 keyword: AI business models and monetization strategies
 summary: AI business models and monetization strategies
 references: [Reference(quote='"...there\'s this one common theme it is revenue... the more they understand how to monetize research and how to productize models the more revenue they see coming out of it..."', youtube_id='xCjzA_8S4kI', timestamp='10:48')]
 keyword: freelancing opportunities with AI
 summary: freelanci