In [1]:
import os
from dotenv import load_dotenv

load_dotenv()
os.environ['GROQ_API_KEY'] = os.getenv('GROQ_API_KEY')
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "Web STORM"
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ['TAVILY_API_KEY'] = os.getenv('TAVILY_API_KEY')

In [2]:
from langchain_openai import ChatOpenAI

fast_llm = ChatOpenAI(model="gpt-4o-mini")
# Uncomment for a Fireworks model
# fast_llm = ChatFireworks(model="accounts/fireworks/models/firefunction-v1", max_tokens=32_000)
long_context_llm = ChatOpenAI(model="gpt-4o")

#### Generate Initial Outline

In [3]:
from typing import List, Optional
from langchain_core.prompts import ChatPromptTemplate

from pydantic import BaseModel, Field

direct_gen_outline_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a Wikipedia writer. Write an outline for a Wikipedia page about a user-provided topic. Be comprehensive and specific.",
        ),
        ("user", "{topic}"),
    ]
)

class Subsection(BaseModel):
    subsection_title: str = Field(..., title="Title of the subsection")
    description: str = Field(..., title="Content of the subsection")
    
    @property
    def as_str(self) -> str:
        return f"### {self.subsection_title}\n\n{self.description}".strip()
    
class Section(BaseModel):
    section_title: str = Field(..., title="Title of the section")
    description: str = Field(..., title="Content of the section")
    subsections: Optional[List[Subsection]] = Field(
        default=None,
        title="Titles and descriptions for each subsection of the Wikipedia page.",
    )

    
    @property
    def as_str(self) -> str:
        subsections = "\n\n".join(
            f"### {subsection.subsection_title}\n\n{subsection.description}"
            for subsection in self.subsections or []
        )
        return f"## {self.section_title}\n\n{self.description}\n\n{subsections}".strip()
    
class Outline(BaseModel):
    page_title: str = Field(..., title="Title of the Wikipedia page")
    sections: List[Section] = Field(
        default_factory=list,
        title="Titles and descriptions for each section of the Wikipedia page.",
    )

    @property
    def as_str(self) -> str:
        sections = "\n\n".join(section.as_str for section in self.sections)
        return f"# {self.page_title}\n\n{sections}".strip()


generate_outline_direct = direct_gen_outline_prompt | fast_llm.with_structured_output(
    Outline
)

In [4]:
example_topic = "Impact of million-plus token context window language models on RAG"

initial_outline = generate_outline_direct.invoke({"topic": example_topic})

print(initial_outline.as_str)

# Impact of Million-Plus Token Context Window Language Models on RAG

## Introduction

An overview of the topic, explaining what million-plus token context window language models are and their relevance to Retrieval-Augmented Generation (RAG).

## Understanding Language Models

A section that delves into language models, particularly focusing on the architecture and functioning of those with million-plus token context windows.

## Retrieval-Augmented Generation (RAG)

Explanation of RAG, its components, and how it integrates retrieval mechanisms with language generation.

## The Evolution of Context Windows

A historical perspective on the development of context windows in language models, leading up to million-plus token capabilities.

## Advantages of Million-Plus Token Context Windows

A detailed discussion on the benefits of using million-plus token context windows in language models for RAG, including improved comprehension, context retention, and coherence.

## Challenges and Lim

#### Generate Related Topics

In [6]:
gen_related_topics_prompt = ChatPromptTemplate.from_template(
    """I'm writing a Wikipedia page for a topic mentioned below. Please identify and recommend some Wikipedia pages on closely related subjects. I'm looking for examples that provide insights into interesting aspects commonly associated with this topic, or examples that help me understand the typical content and structure included in Wikipedia pages for similar topics.

Please list the as many subjects and urls as you can.

Topic of interest: {topic}
"""
)


class RelatedSubjects(BaseModel):
    topics: List[str] = Field(
        description="Comprehensive list of related subjects as background research.",
    )


expand_chain = gen_related_topics_prompt | fast_llm.with_structured_output(
    RelatedSubjects
)

In [7]:
related_subjects = await expand_chain.ainvoke({"topic": example_topic})
related_subjects

RelatedSubjects(topics=['Language models', 'Context window', 'Retrieval-Augmented Generation (RAG)', 'Natural Language Processing (NLP)', 'Artificial Intelligence (AI)', 'Machine Learning (ML)', 'Transformers (machine learning)', 'Tokenization in NLP', 'Applications of language models', 'Large-scale language models', "OpenAI's GPT-3", 'BERT (language model)', 'T5 (Text-To-Text Transfer Transformer)', 'Impact of AI on language understanding', 'Future of language models'])

#### Generate Perspectives

In [15]:
class Editor(BaseModel):
    affiliation: str = Field(
        description="Primary affiliation of the editor.",
    )
    name: str = Field(
        description="Name of the editor.", pattern=r"^[a-zA-Z0-9_-]{1,64}$"
    )
    role: str = Field(
        description="Role of the editor in the context of the topic.",
    )
    description: str = Field(
        description="Description of the editor's focus, concerns, and motives.",
    )

    @property
    def persona(self) -> str:
        return f"Name: {self.name}\nRole: {self.role}\nAffiliation: {self.affiliation}\nDescription: {self.description}\n"


class Perspectives(BaseModel):
    editors: List[Editor] = Field(
        description="Comprehensive list of editors with their roles and affiliations.",
        # Add a pydantic validation/restriction to be at most M editors
    )


gen_perspectives_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You need to select a diverse (and distinct) group of Wikipedia editors who will work together to create a comprehensive article on the topic. Each of them represents a different perspective, role, or affiliation related to this topic.\
    You can use other Wikipedia pages of related topics for inspiration. For each editor, add a description of what they will focus on.

    Wiki page outlines of related topics for inspiration:
    {examples}""",
        ),
        ("user", "Topic of interest: {topic}"),
    ]
)

gen_perspectives_chain = gen_perspectives_prompt | ChatOpenAI(
    model="gpt-4o-mini"
).with_structured_output(Perspectives)

In [16]:
from langchain_community.retrievers import WikipediaRetriever
from langchain_core.runnables import RunnableLambda
from langchain_core.runnables import chain as as_runnable

wikipedia_retriever = WikipediaRetriever(load_all_available_meta=True, top_k_results=1)


def format_doc(doc, max_length=1000):
    related = "- ".join(doc.metadata["categories"])
    return f"### {doc.metadata['title']}\n\nSummary: {doc.page_content}\n\nRelated\n{related}"[
        :max_length
    ]


def format_docs(docs):
    return "\n\n".join(format_doc(doc) for doc in docs)


@as_runnable
async def survey_subjects(topic: str):
    related_subjects = await expand_chain.ainvoke({"topic": topic})
    retrieved_docs = await wikipedia_retriever.abatch(
        related_subjects.topics, return_exceptions=True
    )
    all_docs = []
    for docs in retrieved_docs:
        if isinstance(docs, BaseException):
            continue
        all_docs.extend(docs)
    formatted = format_docs(all_docs)
    return await gen_perspectives_chain.ainvoke({"examples": formatted, "topic": topic})

In [17]:
perspectives = await survey_subjects.ainvoke(example_topic)

ValidationError: 7 validation errors for Perspectives
editors.0.name
  String should match pattern '^[a-zA-Z0-9_-]{1,64}$' [type=string_pattern_mismatch, input_value='Dr. Emily Chen', input_type=str]
    For further information visit https://errors.pydantic.dev/2.8/v/string_pattern_mismatch
editors.1.name
  String should match pattern '^[a-zA-Z0-9_-]{1,64}$' [type=string_pattern_mismatch, input_value='Prof. Mark Thompson', input_type=str]
    For further information visit https://errors.pydantic.dev/2.8/v/string_pattern_mismatch
editors.2.name
  String should match pattern '^[a-zA-Z0-9_-]{1,64}$' [type=string_pattern_mismatch, input_value='Dr. Lisa Nguyen', input_type=str]
    For further information visit https://errors.pydantic.dev/2.8/v/string_pattern_mismatch
editors.3.name
  String should match pattern '^[a-zA-Z0-9_-]{1,64}$' [type=string_pattern_mismatch, input_value='Dr. Rahul Patel', input_type=str]
    For further information visit https://errors.pydantic.dev/2.8/v/string_pattern_mismatch
editors.4.name
  String should match pattern '^[a-zA-Z0-9_-]{1,64}$' [type=string_pattern_mismatch, input_value='Sara Jones', input_type=str]
    For further information visit https://errors.pydantic.dev/2.8/v/string_pattern_mismatch
editors.5.name
  String should match pattern '^[a-zA-Z0-9_-]{1,64}$' [type=string_pattern_mismatch, input_value='Dr. Andrew Kim', input_type=str]
    For further information visit https://errors.pydantic.dev/2.8/v/string_pattern_mismatch
editors.6.name
  String should match pattern '^[a-zA-Z0-9_-]{1,64}$' [type=string_pattern_mismatch, input_value='Prof. Sarah Mitchell', input_type=str]
    For further information visit https://errors.pydantic.dev/2.8/v/string_pattern_mismatch