In [11]:
import json
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model

from langchain_core.messages import (
    AIMessage,
    AnyMessage,
    HumanMessage,
    RemoveMessage,
    SystemMessage,
    ToolMessage,
)

In [12]:
# Load settings from a dotenv file into the process environment; the cells below
# read PROCESSED_DATA_FILE and QUESTIONS_FILE through os.getenv.
# NOTE(review): override=True presumably makes dotenv values win over variables
# already set in the environment — confirm against the python-dotenv docs.
load_dotenv(override=True)

True

In [13]:
import io
from dataframe_to_dict import parse_dataframe_info_to_dict

# Read the processed dataset; the CSV path comes from PROCESSED_DATA_FILE.
# os.environ[...] (rather than os.getenv) makes a missing variable fail right here
# with a KeyError that names it, instead of handing None to pd.read_csv.
df = pd.read_csv(os.environ["PROCESSED_DATA_FILE"])

# Capture the text of df.info() in an in-memory buffer so it can be parsed
# rather than just displayed.
buffer = io.StringIO()
df.info(buf=buffer, show_counts=True)

# Project helper that turns the captured info text into `df_json`.
# NOTE(review): presumably returns a JSON-serialisable dict (it is passed to
# json.dumps when the human message is built) — confirm in dataframe_to_dict.
df_json = parse_dataframe_info_to_dict(buffer.getvalue())

In [14]:
# System prompt for the planner LLM. It instructs the model to output an ordered
# task plan (never code) for a downstream Coding Agent, restricts the allowed
# libraries, and tells it to state assumptions instead of asking the user.
# The text is sent verbatim as the SystemMessage content.
planner_system_prompt = """
You are the Planner Agent for a Python Data Science and Machine Learning coding assistant. 
Your job is to create a structured coding plan to ensure no part of the user request is overlooked. 

You do NOT write code. You ONLY produce a plan for the Coding Agent to implement.

Allowed libraries: Python standard library, NumPy, Pandas, Matplotlib, Scikit-Learn.

PLAN REQUIREMENTS
-----------------
1. Each plan must decompose the user’s request into discrete, ordered tasks.
2. Tasks must be sequential, unambiguous, and self-contained.
3. If any data is needed but not clearly provided:
   - Assume it is in a CSV file.
   - Include a **first task** to load it using Pandas.
   - Assign a reasonable default filename like `data.csv` unless one is specified.
4. If something is ambiguous, do NOT ask the user.
   - State your assumptions clearly in the task’s description.

You must not skip or merge tasks unless explicitly redundant.

"""

In [15]:
from pydantic import BaseModel, Field
from typing import List

class Task(BaseModel):
    # One step of the plan returned by the planner LLM. Every field is a required
    # string. Documented with comments instead of a docstring so the JSON schema
    # generated from this model stays exactly as it was.

    # Short label for the step.
    task_name: str = Field(
        description="Short description of the coding task.",
    )
    # Full instructions for the step.
    details: str = Field(
        description="Step-by-step description of what must be done, including any transformations or conditions.",
    )
    # What the step needs before it can run.
    dependencies: str = Field(
        description="Data or previous tasks this step depends on.",
    )
    # What the step yields.
    output: str = Field(
        description="What this task should produce.",
    )
    # Assumptions the planner made for this step.
    assumptions: str = Field(
        description="Any assumptions made to proceed with the task, especially if the user request was unclear.",
    )

class Plan(BaseModel):
    """Ordered coding plan produced by the Planner Agent for the Coding Agent."""

    # The description (like those on the Task fields) becomes part of the schema
    # passed to the model through with_structured_output. The field name and type
    # are unchanged, so resp.model_dump() keeps the same shape.
    task_list: List[Task] = Field(
        description="Discrete, sequential tasks in the order they must be executed."
    )

In [16]:
# Load the questions from the JSON Lines file named by QUESTIONS_FILE.
# - os.environ[...] fails with a KeyError naming the variable when it is unset,
#   instead of open(None) raising an opaque TypeError.
# - encoding is pinned to UTF-8 so the result does not depend on the platform's
#   default locale encoding.
with open(os.environ["QUESTIONS_FILE"], encoding="utf-8") as f:
    # One JSON object per line, each with a "question" key. Iterate the file
    # lazily and skip blank lines (e.g. a trailing newline at EOF), which
    # json.loads would otherwise reject.
    questions = [json.loads(line)["question"] for line in f if line.strip()]

In [17]:
# Preview the first question; it is the one sent to the planner further down.
questions[0]

"Perform a distribution analysis on the 'Fare' column for each passenger class ('Pclass') separately. Calculate the mean, median, and standard deviation of the fare for each class. Interpret the results in terms of the different passenger classes."

In [18]:
# Text rendering of the DataFrame summary, pretty-printed for the prompt.
df_structure = "DataFrame Structure:\n" + json.dumps(df_json, indent=2)

# System turn: the planner instructions.
system_message = SystemMessage(content=planner_system_prompt)

# User turn: the first question, a blank line, then the DataFrame structure.
human_message = HumanMessage(content="\n\n".join([questions[0], df_structure]))

# Conversation handed to the model, system message first.
messages = [system_message, human_message]

In [19]:
# Chat model used as the planner.
llm = init_chat_model(
    "openai:gpt-4.1",
    temperature=0.7,
    max_retries=3,
    output_version="responses/v1",
)

# Wrap the model with the Plan schema, then run it on the prepared conversation;
# the result is kept in `resp` for the next cell.
structured_llm = llm.with_structured_output(schema=Plan)
resp = structured_llm.invoke(messages)

In [20]:
# Save the structured response to planner_output.json as indented JSON.
with open("planner_output.json", "w") as out_file:
    serialized_plan = json.dumps(resp.model_dump(), indent=2)
    out_file.write(serialized_plan)