In [21]:
import llama_cpp

class LLMWithAutoSummarization:
    """
    Chat wrapper around llama-cpp that keeps a single running summary 
    of the whole conversation. The raw dialogue is **never** sent back 
    to the model once it has been summarized..
    """
    
    SYSTEM_PROMPT = (
        "You are a friendly and knowledgeable AI assistant. "
        "IMPORTANT: You have access to a summary of your previous conversation with the user. "
        "This summary contains facts you learned about the user (like their name, preferences, etc.) "
        "and topics you discussed. When answering questions, USE THIS SUMMARY as if it is your memory. "
        "If the user asks about something mentioned in the summary, answer confidently based on that information. "
        "Speak in first person ('I') and address the user directly ('you')."
    )
    
    CONTEXT_WINDOW = 10240
    SUMMARY_TOKEN_BUDGET = 1024
    GENERATION_MAX_TOKENS = 2048
    
    def __init__(self, model_path: str = "/home/ubuntu/ai-engineering/models/qwen2.5-7b-instruct-q5_k_m-00001-of-00002.gguf"):
        self.llm = llama_cpp.Llama(
            model_path=model_path,
            n_gpu_layers=-1,
            n_ctx=self.CONTEXT_WINDOW,
            verbose=False,
        )
        self.summary: str = ""
        self.generation_params = {
            "temperature": 0.7,
            "top_p": 0.9,
            "stop": ["<|im_end|>", "<|endoftext|>"],
            "max_tokens": self.GENERATION_MAX_TOKENS,
        }
    
    def answer(self, user_text: str) -> str:
        """Generate a reply and immediately update the running summary."""
        # --- Phase 1: Answer ---
        prompt_msgs = [{"role": "system", "content": self.SYSTEM_PROMPT}]
        
        if self.summary:
            prompt_msgs.append({
                "role": "system",
                "content": (
                    "HERE IS WHAT YOU KNOW FROM YOUR PREVIOUS CONVERSATION:\n\n"
                    f"{self.summary}\n\n"
                    "Use this information to answer the user's questions. "
                    "If they ask about their name, location, preferences, or anything else "
                    "mentioned above, provide that information confidently."
                )
            })
        
        prompt_msgs.append({"role": "user", "content": user_text})
        
        reply = self.llm.create_chat_completion(
            messages=prompt_msgs,
            **self.generation_params
        )["choices"][0]["message"]["content"]
        
        # --- Phase 2: Update memory ---
        self._update_summary(user_text, reply)
        
        return reply
    
    def _update_summary(self, user_text: str, assistant_text: str) -> None:
        """
        Roll the latest exchange into the running summary.
        Uses the same model but lower temperature and token limit.
        """
        SUMMARIZATION_SYSTEM_PROMPT = """
        You are creating a memory summary. Output ONLY the summary itself with no extra commentary. 
        Format:
        USER PROFILE: 
        - Name: [if mentioned] 
        - Location: [if mentioned] 
        - Other details: [list any other personal info] 
        
        CONVERSATION TOPICS: 
        - [topic 1] 
        - [topic 2] 
        
        PREFERENCES/REQUESTS: 
        - [any stated preferences] 
        
        Rules: 
        1. ALWAYS keep the user's name if it was ever mentioned 
        2. Keep it concise but never lose important facts 
        3. NO explanations or notes - output ONLY the formatted summary 
        4. If old topics become irrelevant, you can remove them 
        5. Consolidate similar information
        """
        
        text_to_summarize = f"""
        Previous summary:
        {self.summary or 'No previous conversation'}
        
        New exchange:
        User: {user_text}
        Assistant: {assistant_text}
        
        Output the updated summary using the format above. Remember: NO extra commentary, just the summary.
        """
        
        summarization_messages = [
        {"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT},
        {"role": "user", "content": text_to_summarize}
        ]
        
        new_summary = llm.create_chat_completion(
        messages=summarization_messages,
        temperature=0.1, # Low temp for determined output
        max_tokens=SUMMARY_TOKEN_BUDGET,
        stop=["<|im_end|>", "<|endoftext|>"],
        )["choices"][0]["message"]["content"]
        
        self.summary = new_summary.strip()
    
    def print_memory(self) -> None:
        """Inspect the current summary."""
        print("\n=== Running Summary ===")
        print(self.summary or "(empty)")
        print("=======================\n")

In [18]:
llm_with_memory = LLMWithAutoSummarization()
print(llm_with_memory.answer("My name is Ali.")) # Turn 1 

# Simulate 10 more turns of conversation about something else 
for i in range(10):
 print(f"--- Turn {i+2} ---")
 print(llm_with_memory.answer(f"Tell me a random fact about the number {i}."))
 # Now, ask the critical question again 
 print("--- The Moment of Truth ---")

print(llm_with_memory.answer("What is my name?"))

llama_context: n_ctx_per_seq (10240) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Hello Alice! It's nice to meet you again. How can I assist you today?
--- Turn 2 ---
Hello Alice! Did you know that the number 0 is considered an even number? It's unique because it's the only integer that is neither positive nor negative, but it plays a crucial role in the system of even and odd numbers. Isn't that interesting?
--- The Moment of Truth ---
Your name is Alice. It's great to chat with you again, Alice!
--- Turn 3 ---
Hello Alice! Did you know that the number 1 is considered neither prime nor composite? It's a unique number with its own special properties in mathematics.
--- The Moment of Truth ---
Your name is Alice. It was nice to greet you earlier, Alice!
--- Turn 4 ---
Hello Alice! Did you know that the number 2 is considered the first prime number and the only even prime number? Isn't that interesting?
--- The Moment of Truth ---
Your name is Alice. It was nice to greet you earlier, Alice!
--- Turn 5 ---
Hello Alice! Did you know that the number 3 is considered a luc

In [22]:
print(llm.answer("My name is Ali. I am a cloud engineer.")) 
llm_with_memory.print_memory() 
print(llm.answer("I'm working on a weapon detection project.")) 
llm_with_memory.print_memory() 
print(llm.answer("I prefer Python over R becuase python has much larger ecosystem and is relatively easier for me to understand."))
llm_with_memory.print_memory() 
print(llm.answer("What programming language should I use for deep learning?"))
llm_with_memory.print_memory()

AttributeError: 'LLMWithAutoSummarization' object has no attribute 'create_chat_completion'

In [None]:
setup = """
Here are some things to remember:
- My deployment bucket is s3://proj-847-staging-west
- The build flag is --env=qa-cluster-3  
- My SSH alias is devbox-7b
- The port I always use is 9473
- My teammate's code review tag is @chen-review-squad
"""

print(llm.answer(setup))
llm.print_memory()

# Turn 2-6: Distractor chat
# We need to push the first instruction out of the immediate context
distractors = [
    "How do I center a div?",
    "What is the capital of France?",
    "Explain the difference between TCP and UDP.",
    "Write a haiku about coding.",
    "What is 12 * 12?"
]

for d in distractors:
    print(f"User: {d}")
    print(f"Bot: {llm.answer(d)}")

# The Moment of Truth
print("--- RETRIEVAL TEST ---")
print(llm.answer("What port do I always use?"))
llm.print_memory()