My project explores how different types of memory affect the performance of intelligent code-generating agents in a collaborative software development environment. The environment simulates a real-world development pipeline with three key agent roles: (1) Code Agent, which writes and iterates on code until tests pass, (2) Review Agent, which generates new test cases to ensure acceptance criteria are met, and (3) Orchestrator/Product Manager, which converts user requests into structured user stories, writes test cases, and defines acceptance criteria.

The central research question is: How does the type of memory used by agents impact the quality and efficiency of code development?

We will compare three memory setups: (1) basic memory (tracking past actions), (2) summarised "learnings" generated by a language model from past interactions, and (3) agentic memory (a-mem), which allows more advanced, context-aware recall. Agents will be evaluated across metrics such as the number of bugs found, number of errors encountered, iteration counts, and overall rate of improvement.

Experiments will start with two benchmarks, Google's Mostly Basic Python Problems (MBPP) dataset and OpenAI's HumanEval, to measure how performance varies across tasks. The results could provide insights into how memory mechanisms can be optimized for multi-agent collaboration in intelligent systems.

--- 

CODE AGENT 
- Writes only the code for the given task

REVIEW AGENT
- Writes tests for the given task and checks whether the code passes or fails

ORCHESTRATOR / PM
- converts user requests into structured prompts
- Performs final tests using acceptance criteria

Note: Acceptance criteria are derived from the MBPP `test_list` field

---



In [1]:
import os, io, datetime
from contextlib import redirect_stdout
from dotenv import load_dotenv
import pandas as pd
from datasets import load_dataset
from google import genai
from ollama import chat, ChatResponse
from pydantic import BaseModel
from typing import List, Callable

# Backend configuration. load_dotenv() is called first so that values from a
# .env file (if one is found) are visible through os.environ below.
load_dotenv()
MODEL_BACKEND = os.environ.get("MODEL_BACKEND", "gemini")  # "gemini" or "ollama"
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.5-pro-exp-03-25")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "gemma3:1b")

# The Gemini client is built unconditionally, even when MODEL_BACKEND is "ollama".
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))

In [2]:
# Turn off pandas' display truncation (None = no limit) so long requirements
# and generated code are shown in full.
for _option in ("display.max_colwidth", "display.max_rows", "display.max_columns"):
    pd.set_option(_option, None)

In [3]:
# Load the MBPP (Mostly Basic Python Problems) benchmark via `datasets.load_dataset`.
mbpp_dataset = load_dataset("Muennighoff/mbpp")

# Bare expression: the notebook renders the DatasetDict summary (a single `test` split).
mbpp_dataset

DatasetDict({
    test: Dataset({
        features: ['task_id', 'text', 'code', 'test_list', 'test_setup_code', 'challenge_test_list'],
        num_rows: 974
    })
})

In [4]:
# This copy of MBPP exposes only a `test` split (974 rows); use it as the task pool.
mbpp_data = mbpp_dataset['test']

In [5]:
# Materialise the split as a DataFrame and isolate MBPP task 1 as the worked example.
mbpp_df = pd.DataFrame(mbpp_data)
# Renumber the filtered frame from 0: the following cells read the row with
# `[0]`, which is a *label* lookup and would raise KeyError for any task whose
# original row label is not 0.
task_1_df = mbpp_df[mbpp_df['task_id'] == 1].reset_index(drop=True)

In [6]:
# Peek at the reference asserts for this task. `[0]` is a lookup by index
# label; the task-1 row carries label 0.
task_1_df['test_list'][0]

['assert min_cost([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 2, 2) == 8',
 'assert min_cost([[2, 3, 4], [5, 9, 3], [2, 6, 4]], 2, 2) == 12',
 'assert min_cost([[3, 4, 5], [6, 10, 4], [3, 7, 5]], 2, 2) == 16']

In [7]:
# ORCHESTRATOR / PM

# `task` is the record handed to the agents in the following cells.
task = task_1_df

# Show its natural-language requirement.
print(task.text)

0    Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix cost[][] and a position (m, n) in cost[][].
Name: text, dtype: object


In [8]:
def memory():
    """Placeholder for the agent memory component; does nothing and returns None."""
    return None

In [9]:
# The requirement string for the selected task (label lookup on index label 0).
task.text[0]

'Write a function to find the minimum cost path to reach (m, n) from (0, 0) for the given cost matrix cost[][] and a position (m, n) in cost[][].'

In [10]:

# Structured reply expected from the code agent: the name of the function it
# wrote plus the full source text that defines it.
class CodeAgentResponse(BaseModel):
    function_name: str
    code: str

    @property
    def function(self) -> Callable:
        """Execute ``code`` and return the callable named ``function_name``.

        The source is exec'd into a fresh, empty namespace on every access,
        so each access yields a newly created function object.

        WARNING: this runs model-generated source in-process with no
        sandboxing; only use it in an environment where that is acceptable.

        Raises:
            ValueError: if, after executing the source, ``function_name`` is
                missing from the namespace or is bound to a non-callable.
        """
        scope: dict = {}
        exec(self.code, scope)
        candidate = scope.get(self.function_name)
        if callable(candidate):
            return candidate
        raise ValueError(f"Function {self.function_name} not found after exec.")

# Structured reply expected from the review agent.
class ReviewAgentResponse(BaseModel):
    # Test cases kept as source text; the review prompt asks for Python
    # `assert` statements, which are later run with exec().
    test_cases: List[str]

In [11]:
def code_agent(requirement: str) -> CodeAgentResponse:
    """Ask the configured LLM backend to implement ``requirement``.

    The backend is chosen by the module-level ``MODEL_BACKEND`` setting. Both
    backends are asked for structured JSON matching ``CodeAgentResponse``.

    Args:
        requirement: Natural-language task description, used as the prompt.

    Returns:
        The model's reply parsed into a ``CodeAgentResponse``.

    Raises:
        ValueError: if ``MODEL_BACKEND`` is not "gemini" or "ollama", or if
            Gemini returned nothing that could be parsed into the schema.
        pydantic.ValidationError: if the Ollama reply does not validate
            against ``CodeAgentResponse``.
    """
    if MODEL_BACKEND == "gemini":
        resp = client.models.generate_content(
            model=GEMINI_MODEL,
            contents=requirement,
            config={
                "response_mime_type": "application/json",
                "response_schema": CodeAgentResponse,
            },
        )
        parsed = resp.parsed
        # `parsed` is None when the reply could not be fitted to the schema;
        # fail here rather than with an AttributeError at the call site.
        if parsed is None:
            raise ValueError(
                "Gemini returned no response matching the CodeAgentResponse schema."
            )
        return parsed

    if MODEL_BACKEND == "ollama":
        prompt = (
            requirement + "\n\nUse this JSON schema:\n"
            "CodeAgent = {'function_name': str, 'code': str}\nReturn: CodeAgent"
        )
        resp: ChatResponse = chat(
            model=OLLAMA_MODEL,
            messages=[{"role": "user", "content": prompt}],
            # Constrain decoding to the schema; without this a small model
            # tends to wrap the JSON in prose or markdown fences and parsing fails.
            format=CodeAgentResponse.model_json_schema(),
        )
        # model_validate_json is the pydantic v2 replacement for the
        # deprecated parse_raw.
        return CodeAgentResponse.model_validate_json(resp.message.content)

    raise ValueError(f"Unknown MODEL_BACKEND: {MODEL_BACKEND}")

In [12]:
# Ask the code agent to implement the selected task's requirement (one model call).
code_agent_response = code_agent(task.text[0])

In [13]:
# Turn the generated source into a callable. Accessing `.function` exec()s the
# model-written code in this process.
min_cost_fn = code_agent_response.function

In [16]:
# Inspect the raw source string returned by the code agent.
code_agent_response.code

'def min_cost_path(cost, m, n):\n    """Finds the minimum cost path to reach (m, n) from (0, 0) in the given cost matrix.\n\n    Args:\n        cost: A 2D list representing the cost matrix.\n        m: The row index of the destination.\n        n: The column index of the destination.\n\n    Returns:\n        The minimum cost to reach (m, n) from (0, 0).\n    """\n    dp = [[0 for _ in range(n + 1)] for _ in range(m + 1)]\n\n    dp[0][0] = cost[0][0]\n\n    # Initialize first column\n    for i in range(1, m + 1):\n        dp[i][0] = dp[i-1][0] + cost[i][0]\n\n    # Initialize first row\n    for j in range(1, n + 1):\n        dp[0][j] = dp[0][j-1] + cost[0][j]\n\n    # Construct the rest of the DP table\n    for i in range(1, m + 1):\n        for j in range(1, n + 1):\n            dp[i][j] = cost[i][j] + min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1])\n\n    return dp[m][n]'

In [14]:
# Show the callable built from the agent's code. Note the agent named it
# `min_cost_path`, whereas the MBPP reference asserts for this task call `min_cost`.
min_cost_fn

<function min_cost_path(cost, m, n)>

In [17]:
def review_agent(requirement: str, code: str) -> ReviewAgentResponse:
    """Ask the configured LLM backend to write assert-style tests for ``code``.

    Both backends receive the same instruction, the requirement and the
    function source, and are asked for JSON matching ``ReviewAgentResponse``.

    Args:
        requirement: Natural-language task description the code should satisfy.
        code: Source text of the function under review.

    Returns:
        The model's reply parsed into a ``ReviewAgentResponse``.

    Raises:
        ValueError: if ``MODEL_BACKEND`` is not "gemini" or "ollama", or if
            Gemini returned nothing that could be parsed into the schema.
        pydantic.ValidationError: if the Ollama reply does not validate
            against ``ReviewAgentResponse``.
    """
    # Built once so both backends are asked the same thing. Previously the
    # Ollama branch sent only the requirement, never showing the model the
    # code it was supposed to review.
    prompt = (
        "You are a review agent. ONLY respond with JSON containing 'test_cases' as a list of Python assert statements. NO commentary."
        f"\nRequirement: {requirement}"
        f"\nFunction code:\n{code}"
    )

    if MODEL_BACKEND == "gemini":
        resp = client.models.generate_content(
            model=GEMINI_MODEL,
            contents=prompt,
            config={
                "response_mime_type": "application/json",
                "response_schema": ReviewAgentResponse,
            },
        )
        parsed = resp.parsed
        # `parsed` is None when the reply could not be fitted to the schema;
        # fail here rather than with an AttributeError at the call site.
        if parsed is None:
            raise ValueError(
                "Gemini returned no response matching the ReviewAgentResponse schema."
            )
        return parsed

    if MODEL_BACKEND == "ollama":
        ollama_prompt = (
            prompt + "\n\nUse this JSON schema:\n"
            "ReviewAgent = {'test_cases': list[str]}\nReturn: ReviewAgent"
        )
        resp: ChatResponse = chat(
            model=OLLAMA_MODEL,
            messages=[{"role": "user", "content": ollama_prompt}],
            # Constrain decoding to the schema so the reply is parseable JSON.
            format=ReviewAgentResponse.model_json_schema(),
        )
        # model_validate_json is the pydantic v2 replacement for the
        # deprecated parse_raw.
        return ReviewAgentResponse.model_validate_json(resp.message.content)

    raise ValueError(f"Unknown MODEL_BACKEND: {MODEL_BACKEND}")

In [18]:
# Have the review agent write asserts for the same requirement, passing it the
# source the code agent generated.
requirement = task.text[0]
review_agent_response = review_agent(requirement, code_agent_response.code)

In [19]:
# Display the full parsed review-agent reply.
review_agent_response

ReviewAgentResponse(test_cases=['assert min_cost_path([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 2, 2) == 8', 'assert min_cost_path([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 1, 1) == 9', 'assert min_cost_path([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 0, 0) == 1', 'assert min_cost_path([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 0, 1) == 3', 'assert min_cost_path([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 1, 0) == 5', 'assert min_cost_path([[5]], 0, 0) == 5', 'assert min_cost_path([[1,2],[3,4]],1,1) == 8', 'assert min_cost_path([[4,5,6],[1,2,3],[7,8,9]],2,2) == 21'])

In [20]:
# Display just the generated assert statements, one per list entry.
review_agent_response.test_cases

['assert min_cost_path([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 2, 2) == 8',
 'assert min_cost_path([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 1, 1) == 9',
 'assert min_cost_path([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 0, 0) == 1',
 'assert min_cost_path([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 0, 1) == 3',
 'assert min_cost_path([[1, 2, 3], [4, 8, 2], [1, 5, 3]], 1, 0) == 5',
 'assert min_cost_path([[5]], 0, 0) == 5',
 'assert min_cost_path([[1,2],[3,4]],1,1) == 8',
 'assert min_cost_path([[4,5,6],[1,2,3],[7,8,9]],2,2) == 21']

In [21]:
# Run each review-agent assert against the generated function and collect one
# log row per test. All tests share the namespace `ns`, which initially holds
# only the generated function under the name the agent gave it.
results = []
ns = {code_agent_response.function_name: code_agent_response.function}
for tc in review_agent_response.test_cases:
    buf = io.StringIO()
    # Capture anything the test or the function prints so it lands in the log
    # instead of the notebook output.
    with redirect_stdout(buf):
        try:
            exec(tc, ns)
        except AssertionError as exc:
            # The assert ran but its condition was false.
            status, err = "fail", str(exc)
        except Exception as exc:
            # The test could not be evaluated at all (NameError, TypeError, ...).
            status, err = "error", str(exc)
        else:
            status, err = "pass", ""
    results.append(dict(
        requirement=requirement,
        fn_name=code_agent_response.function_name,
        test=tc,
        status=status,
        error=err,
        stdout=buf.getvalue(),
        timestamp=datetime.datetime.now(),
    ))

# Append to the CSV log; the header row is written only when the file is created.
df_logs = pd.DataFrame(results)
log_file = "test_logs.csv"
_log_exists = os.path.exists(log_file)
df_logs.to_csv(
    log_file,
    mode="a" if _log_exists else "w",
    header=not _log_exists,
    index=False,
)

print(f"Logged {len(results)} tests to {log_file}")

Logged 8 tests to test_logs.csv


In [None]:
# Orchestrator evaluation loop: for each benchmark task, have the code agent
# write a solution, have the review agent write tests, then score the solution
# against the dataset's own acceptance tests and append the tallies to a CSV.

# parameters (in-notebook instead of argparse)
eval_type = "mbpp"      # or "humaneval"
num_tasks = 10          # or None for all

if eval_type == "mbpp":
    ds = load_dataset("Muennighoff/mbpp")["test"]
    # MBPP ships its acceptance tests as a list of standalone assert statements.
    get_tests = lambda r: r["test_list"]
    get_req   = lambda r: r["text"]
else:
    ds = load_dataset("openai/openai_humaneval")["test"]
    # HumanEval has no `test_list` field (reading it raised KeyError). Its
    # `test` field is source that defines `check(candidate)`, so the whole
    # suite is run as one program that calls check() on the generated
    # function, bound as `candidate` in the test namespace below. Each
    # HumanEval task therefore contributes a single pass or fail.
    get_tests = lambda r: [r["test"] + "\ncheck(candidate)"]
    get_req   = lambda r: r["prompt"]

logs = []
for idx, rec in enumerate(ds):
    if num_tasks and idx >= num_tasks:
        break
    req = get_req(rec)
    code_r = code_agent(req)
    # NOTE(review): the review agent's tests are generated but not used in the
    # scoring below; confirm whether they should be run or logged here.
    rev_r = review_agent(req, code_r.code)
    tests = get_tests(rec)

    ds_pass = ds_fail = 0
    try:
        fn = code_r.function
    except Exception:
        # The generated source failed to exec or did not define the named
        # function. Count every dataset test as failed instead of aborting
        # the whole run on one bad generation.
        ds_fail = len(tests)
    else:
        ns = {code_r.function_name: fn, "candidate": fn}
        # NOTE(review): MBPP asserts call the reference function name (e.g.
        # `min_cost`), while the agent picks its own name (e.g.
        # `min_cost_path`); on a mismatch the test fails with NameError.
        # MBPP's `test_setup_code` field is also not executed before the
        # asserts; confirm no selected task depends on it.
        for tc in tests:
            try:
                exec(tc, ns)
            except Exception:
                # Exception rather than a bare except, so Ctrl-C
                # (KeyboardInterrupt) still stops the loop.
                ds_fail += 1
            else:
                ds_pass += 1

    logs.append({
        "eval_type": eval_type,
        "task_id": rec.get("task_id", idx),
        "requirement": req,
        "dataset_pass": ds_pass,
        "dataset_fail": ds_fail,
        "timestamp": datetime.datetime.now(),
    })

# Append to the CSV log; the header row is written only when the file is created.
df = pd.DataFrame(logs)
fname = "orchestrator_logs.csv"
df.to_csv(fname, index=False, mode="a", header=not os.path.exists(fname))
print(f"Saved {len(logs)} rows to {fname}")