https://docs.llamaindex.ai/en/stable/examples/memory/ChatSummaryMemoryBuffer/

In [8]:
from llama_index.core.memory import ChatSummaryMemoryBuffer
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.llms.ollama import Ollama
import tiktoken

In [9]:
# Inspect which memory classes and submodules the installed llama_index
# version exposes (exploration aid; nothing later depends on `m`).
import llama_index.core.memory  as m
dir(m)

['BaseMemory',
 'BaseMemoryBlock',
 'ChatMemoryBuffer',
 'ChatSummaryMemoryBuffer',
 'FactExtractionMemoryBlock',
 'InsertMethod',
 'Memory',
 'SimpleComposableMemory',
 'StaticMemoryBlock',
 'VectorMemory',
 'VectorMemoryBlock',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'chat_memory_buffer',
 'chat_summary_memory_buffer',
 'memory',
 'memory_blocks',
 'simple_composable_memory',
 'types',
 'vector_memory']

In [29]:
# Ollama model tag handed to the Ollama client.
model = "qwen3:14b"
# Identifier passed to AutoTokenizer.from_pretrained to load the matching tokenizer.
tokenizer_model = "Qwen/Qwen3-14B"
# Context window size forwarded to the Ollama client.
context_window = 1000

In [30]:
# Client for the local Ollama model; reused later as the summarizer.
llm = Ollama(
    model=model,
    context_window=context_window,
    # With thinking enabled, responses carry the model's reasoning trace
    # under additional_kwargs['thinking'].
    thinking=True,
    request_timeout=120.0,
)

In [31]:
# Smoke test: send one completion request to confirm the model responds.
llm.complete('Hi, how are you ?')

CompletionResponse(text="Hello! I'm just a chatbot, so I don't have feelings, but I'm here and ready to chat! 😊 How are you today? Let me know if there's anything I can help with!", additional_kwargs={'tool_calls': None, 'thinking': 'Okay, the user greeted me with "Hi, how are you?" I need to respond appropriately. Since I\'m an AI, I don\'t have feelings, but I should acknowledge their greeting warmly. I should keep the response friendly and open-ended to encourage further conversation. Maybe ask them how they\'re doing and offer help. Let me check the previous examples to ensure consistency. Yep, that\'s right. Make sure to use an emoji to keep it approachable. Alright, time to put it all together.\n'}, raw={'model': 'qwen3:14b', 'created_at': '2025-06-21T18:16:06.720886073Z', 'done': True, 'done_reason': 'stop', 'total_duration': 10741171427, 'load_duration': 5876469963, 'prompt_eval_count': 16, 'prompt_eval_duration': 222075134, 'eval_count': 151, 'eval_duration': 4641725696, 'm

In [32]:
# Seed conversation handed to the summary memory buffer.
# Roles use the MessageRole enum (imported at the top) instead of bare strings,
# so a misspelled role fails at attribute lookup rather than at validation.
chat_history = [
    ChatMessage(role=MessageRole.USER, content="What is LlamaIndex?"),
    ChatMessage(
        role=MessageRole.ASSISTANT,
        content="LlamaIndex is the leading data framework for building LLM applications",
    ),
    ChatMessage(role=MessageRole.USER, content="Can you give me some more details?"),
    ChatMessage(
        role=MessageRole.ASSISTANT,
        # Adjacent literals are concatenated, so the source can be indented
        # without that indentation (or trailing spaces) leaking into the
        # message text that is tokenized and summarized.
        content=(
            "LlamaIndex is a framework for building context-augmented LLM applications. "
            "Context augmentation refers to any use case that applies LLMs on top of your "
            "private or domain-specific data. Some popular use cases include the following:\n"
            'Question-Answering Chatbots (commonly referred to as RAG systems, which stands for '
            '"Retrieval-Augmented Generation"), Document Understanding and Extraction, '
            "Autonomous Agents that can perform research and take actions\n"
            "LlamaIndex provides the tools to build any of these above use cases from prototype "
            "to production. The tools allow you to both ingest/process this data and implement "
            "complex query workflows combining data access with LLM prompting."
        ),
    ),
]

In [33]:
from transformers import AutoTokenizer
# Load the tokenizer named by `tokenizer_model`; it is used below to inspect
# tokenization and to count tokens for the memory buffer.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

In [34]:
# Encode a sample sentence to see the token ids it maps to.
tokenizer.encode('Hi, This is a sentence')

[13048, 11, 1096, 374, 264, 11652]

In [38]:
# Decode a single token id (the first id from the encoding above) back to text.
tokenizer.decode(13048)

'Hi'

In [41]:
# Tokenize a hand-written prompt and print every token id beside the text it
# decodes to, one token per line.
prompt = "|system|: You are a  helpful assistant"
for token_id in tokenizer.encode(prompt):
    piece = tokenizer.decode(token_id)
    print(f"{token_id} {piece}")

91 |
8948 system
91 |
25 :
1446  You
525  are
264  a
220  
10950  helpful
17847  assistant


In [42]:
# Run a single user turn through the tokenizer's chat template.
# Because tokenize=True, `text` ends up holding a list of token ids
# (not a string), including the generation prompt for the assistant turn.
messages = [
    {"role": "user", "content": 'What is the capital of France?'},
]
text = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)

text

[151644,
 872,
 198,
 3838,
 374,
 279,
 6722,
 315,
 9625,
 30,
 151645,
 198,
 151644,
 77091,
 198]

In [43]:
# Show each templated token id next to its decoded text to reveal the
# special tokens the chat template inserted.
for token_id in text:
    piece = tokenizer.decode(token_id)
    print(f"{token_id} {piece}")

151644 <|im_start|>
872 user
198 

3838 What
374  is
279  the
6722  capital
315  of
9625  France
30 ?
151645 <|im_end|>
198 

151644 <|im_start|>
77091 assistant
198 



In [35]:
# The chat model doubles as the summarizer for older messages.
summarizer_llm = llm


def tokenizer_fn(text: str) -> list:
    """Return the token ids of ``text`` for the memory buffer's length accounting.

    Special tokens are excluded so only the message content itself is counted.
    """
    return tokenizer.encode(text, add_special_tokens=False)


# NOTE: ChatSummaryMemoryBuffer's own class documentation marks it as
# deprecated in favour of llama_index.core.memory.Memory.
# token_limit=2 leaves essentially no room for verbatim messages, so the
# history comes back as a summary (presumably intentional for this demo).
memory = ChatSummaryMemoryBuffer.from_defaults(
    chat_history=chat_history,
    llm=summarizer_llm,
    token_limit=2,
    tokenizer_fn=tokenizer_fn,
)

history = memory.get()

In [44]:
# Print the class documentation (signature, deprecation notice, behaviour).
help(ChatSummaryMemoryBuffer)

Help on class ChatSummaryMemoryBuffer in module llama_index.core.memory.chat_summary_memory_buffer:

class ChatSummaryMemoryBuffer(llama_index.core.memory.types.BaseMemory)
 |  ChatSummaryMemoryBuffer(*, token_limit: int, count_initial_tokens: bool = False, llm: Optional[Annotated[llama_index.core.llms.llm.LLM, SerializeAsAny()]] = None, summarize_prompt: Optional[str] = None, tokenizer_fn: Callable[[str], List] = <factory>, chat_store: typing.Annotated[llama_index.core.storage.chat_store.base.BaseChatStore, SerializeAsAny()] = <factory>, chat_store_key: str = 'chat_history') -> None
 |  
 |  Deprecated: Please use `llama_index.core.memory.Memory` instead.
 |  
 |  Buffer for storing chat history that uses the full text for the latest
 |  {token_limit}.
 |  
 |  All older messages are iteratively summarized using the {llm} provided, with
 |  the max number of tokens defined by the {llm}.
 |  
 |  User can specify whether initial tokens (usually a system prompt)
 |  should be counted as

In [36]:
# Display the messages returned by memory.get().
history

[ChatMessage(role=<MessageRole.SYSTEM: 'system'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='The conversation explains that LlamaIndex is a framework for building context-augmented LLM applications, enabling the use of private or domain-specific data with LLMs. Key use cases include Retrieval-Augmented Generation (RAG) systems, document understanding, and autonomous agents. The framework provides tools for data ingestion, processing, and complex query workflows integrating data access with LLM prompting.')])]