In [4]:
from llama_index.core.memory import ChatSummaryMemoryBuffer
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.llms.ollama import Ollama
from transformers import AutoTokenizer
#from llama_index.llms.openai import OpenAI as OpenAiLlm
#import tiktoken

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed conversation: two user/assistant exchanges about LlamaIndex. This list
# is what the summary memory buffer is initialised with later in the notebook.
# Roles use the imported MessageRole enum instead of raw strings.
chat_history = [
    ChatMessage(role=MessageRole.USER, content="What is LlamaIndex?"),
    ChatMessage(
        role=MessageRole.ASSISTANT,
        # Product name spelled correctly here (the original text had a doubled "a").
        content="LlamaIndex is the leading data framework for building LLM applications",
    ),
    ChatMessage(role=MessageRole.USER, content="Can you give me some more details?"),
    ChatMessage(
        role=MessageRole.ASSISTANT,
        # Multi-line literal: the leading whitespace on the continuation lines
        # is part of the message content and is kept as-is.
        content="""LlamaIndex is a framework for building context-augmented LLM applications. Context augmentation refers to any use case that applies LLMs on top of your private or domain-specific data. Some popular use cases include the following: 
        Question-Answering Chatbots (commonly referred to as RAG systems, which stands for "Retrieval-Augmented Generation"), Document Understanding and Extraction, Autonomous Agents that can perform research and take actions
        LlamaIndex provides the tools to build any of these above use cases from prototype to production. The tools allow you to both ingest/process this data and implement complex query workflows combining data access with LLM prompting.""",
    ),
]

In [5]:
# Model configuration.
#   model           - Ollama model tag passed to the Ollama client below.
#   tokenizer_model - Hugging Face repo id whose tokenizer is loaded below.
#   context_window  - value forwarded to the Ollama client's context_window.
model = "qwen3:14b"
tokenizer_model = "Qwen/Qwen3-14B"
context_window = 1000

# Chat model client; thinking is switched off via thinking=False.
llm = Ollama(
    model=model,
    context_window=context_window,
    thinking=False,
    request_timeout=120.0,
)

# Tokenizer loaded from the Hugging Face repo named above.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

In [6]:
def tokenizer_fn(text):
    """Encode ``text`` with the loaded tokenizer, without adding special tokens."""
    # Alternative noted in the original cell for OpenAI models:
    # tiktoken.encoding_for_model(model).encode
    return tokenizer.encode(text, add_special_tokens=False)


# The same LLM instance is reused as the summarizer.
summarizer_llm = llm

# Build the summary buffer over the seed conversation with a token limit of 2.
memory = ChatSummaryMemoryBuffer.from_defaults(
    chat_history=chat_history,
    tokenizer_fn=tokenizer_fn,
    token_limit=2,
    llm=summarizer_llm,
)

# Read the buffer's current view of the conversation.
history = memory.get()

In [10]:
# Display the message list that memory.get() returned for the seed conversation.
history

[ChatMessage(role=<MessageRole.SYSTEM: 'system'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='The conversation discusses LlamaIndex, a framework for building context-augmented LLM applications. It explains that LlamaIndex enables the use of large language models (LLMs) with private or domain-specific data, supporting use cases like question-answering chatbots (RAG systems), document understanding, and autonomous agents. The framework provides tools for data ingestion, processing, and complex query workflows that combine data access with LLM prompting.')])]

In [11]:
# Exploration aid: list every attribute name available on the memory object.
dir(memory)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_pydantic_core_schema__',
 '__get_pydantic_json_schema__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pretty__',
 '__private_attributes__',
 '__pydantic_complete__',
 '__pydantic_computed_fields__',
 '__pydantic_core_schema__',
 '__pydantic_custom_init__',
 '__pydantic_decorators__',
 '__pydantic_extra__',
 '__pydantic_fields__',
 '__pydantic_fields_set__',
 '__pydantic_generic_metadata__',
 '__pydantic_init_subclass__',
 '__pydantic_parent_namespace__',
 '__pydantic_post_init__',
 '__pydantic_private__',
 '__pydantic_root_model__',
 '__pydantic_serializer__',
 '__pydantic_setattr_handl

In [12]:
# Show the value of the buffer's summarize_prompt attribute.
memory.summarize_prompt

'The following is a conversation between the user and assistant. Write a concise summary about the contents of this conversation.'

In [13]:
# Follow-up turns: two more user questions, each with the assistant's answer.
new_chat_history = [
    ChatMessage(role="user", content="Why context augmentation?"),
    ChatMessage(
        role="assistant",
        content="LLMs offer a natural language interface between humans and data. Widely available models come pre-trained on huge amounts of publicly available data. However, they are not trained on your data, which may be private or specific to the problem you're trying to solve. It's behind APIs, in SQL databases, or trapped in PDFs and slide decks. LlamaIndex provides tooling to enable context augmentation. A popular example is Retrieval-Augmented Generation (RAG) which combines context with LLMs at inference time. Another is finetuning.",
    ),
    ChatMessage(role="user", content="Who is LlamaIndex for?"),
    ChatMessage(
        role="assistant",
        content="LlamaIndex provides tools for beginners, advanced users, and everyone in between. Our high-level API allows beginner users to use LlamaIndex to ingest and query their data in 5 lines of code. For more complex applications, our lower-level APIs allow advanced users to customize and extend any module—data connectors, indices, retrievers, query engines, reranking modules—to fit their needs.",
    ),
]

# Put the follow-up messages into the existing buffer one at a time, in
# order, then read the buffer back.
for message in new_chat_history:
    memory.put(message)
history = memory.get()

In [14]:
# Display the history read back after the four follow-up messages were put into memory.
history

[ChatMessage(role=<MessageRole.SYSTEM: 'system'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='The conversation discusses LlamaIndex, a framework that enables the integration of large language models (LLMs) with private or domain-specific data through context augmentation. It explains the importance of context augmentation to address the limitations of pre-trained LLMs, which are not trained on user-specific data. The framework supports various use cases, such as RAG systems and autonomous agents, and provides tools for data ingestion, processing, and querying. LlamaIndex is designed for users of all skill levels, offering both high-level and low-level APIs to accommodate different needs.')])]

In [17]:
# Rebuild the buffer from scratch over the seed conversation followed by the
# follow-up turns, this time with a token limit of 256, and print what it returns.
memory = ChatSummaryMemoryBuffer.from_defaults(
    chat_history=[*chat_history, *new_chat_history],
    tokenizer_fn=tokenizer_fn,
    token_limit=256,
    llm=summarizer_llm,
)
print(memory.get())

[ChatMessage(role=<MessageRole.SYSTEM: 'system'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='The conversation discusses LlamaIndex, a framework for building context-augmented LLM applications. It explains that LlamaIndex enables the use of large language models (LLMs) in conjunction with private or domain-specific data, with use cases such as question-answering chatbots (RAG systems), document understanding, and autonomous agents. The framework provides tools for data ingestion, processing, and implementing complex query workflows that combine data access with LLM prompting.')]), ChatMessage(role=<MessageRole.USER: 'user'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='Why context augmentation?')]), ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text="LLMs offer a natural language interface between humans and data. Widely available models come pre-trained on huge amounts of publi

In [20]:
# Count how many messages the buffer currently returns.
len(memory.get())

5

In [22]:
# Print each message returned by the buffer, followed by a "=====" rule on
# its own line so consecutive messages are easy to tell apart.
for message in memory.get():
    print(message, "=" * 5, sep="\n")

system: The conversation discusses LlamaIndex, a framework for building context-augmented LLM applications. It explains that LlamaIndex enables the use of large language models (LLMs) in conjunction with private or domain-specific data, with use cases such as question-answering chatbots (RAG systems), document understanding, and autonomous agents. The framework provides tools for data ingestion, processing, and implementing complex query workflows that combine data access with LLM prompting.
=====
user: Why context augmentation?
=====
assistant: LLMs offer a natural language interface between humans and data. Widely available models come pre-trained on huge amounts of publicly available data. However, they are not trained on your data, which may be private or specific to the problem you're trying to solve. It's behind APIs, in SQL databases, or trapped in PDFs and slide decks. LlamaIndex provides tooling to enable context augmentation. A popular example is Retrieval-Augmented Generatio