In [1]:
import json
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model

from langchain_core.messages import (
    AIMessage,
    AnyMessage,
    HumanMessage,
    RemoveMessage,
    SystemMessage,
    ToolMessage,
)

In [2]:
load_dotenv(override=True)

True

In [3]:
coder_system_prompt = """
You are the Coding Agent in a two-stage pipeline (Planner ➜ Coder).

INPUT
------
You will receive:
1. A “Coding Plan” produced by the Planner Agent.
   • It is an ordered list of numbered tasks.  
   • Each task contains: Task Name, Details, Dependencies, Output.
2. The original user request (for reference only).
3. The structure of the DataFrame to be used in the tasks. This is the output of Pandas' `df.info()` method

OBJECTIVE
---------
Write a **single, fully-runnable Python 3 script** that accomplishes *all* tasks in the Coding Plan, in order, without omission.

STRICT RULES
------------
- **Return only code** – no prose, comments outside the script, or explanations.
- The script must be PEP 8 compliant, self-contained, and ready to run.
- Allowed libraries: Python standard library, NumPy, Pandas, Matplotlib, Scikit-Learn, PyTorch.
- If a task requires plotting, save figures to files (do not display).
- Insert clear inline comments and complete docstrings for every function, class, or complex section.
- If the plan specifies an output file name (e.g., “top_10_customers.png”), save exactly that name.
- Respect all user constraints from the original request.
- **Never ignore or reorder tasks** unless an explicit dependency forces you to combine steps.
- If the plan references data that is undefined (e.g., missing column names), raise a clear
  `ValueError` in the code rather than guessing.
- If any task is impossible with the permitted libraries, stop and raise
  `NotImplementedError` inside the script, citing the task name.

IMPLEMENTATION GUIDELINES
-------------------------
- Begin with all necessary imports.
- Encapsulate each task in a well-named function whose docstring mirrors the task description.
- Provide a `main()` function that calls task-functions in the correct order and writes/prints
  the final results as specified.
- Use type hints where helpful for readability.
- Place the customary `if __name__ == "__main__": main()` guard at the end.

FAIL-SAFE
---------
If you detect that the Coding Plan itself is ambiguous or missing critical information,
raise a `ValueError` at the top of the script explaining which task needs clarification.

OUTPUT FORMAT
-------------
Return the complete Python script **and nothing else**.

"""

In [4]:
tools = (
    [
        {"type": "code_interpreter", "container": {"type": "auto"}},
        {"type": "web_search_preview"},
    ],
)

In [5]:
import io
import os
from dataframe_to_dict import parse_dataframe_info_to_dict

df = pd.read_csv(os.getenv("PROCESSED_DATA_FILE"))
buffer = io.StringIO()
df.info(buf=buffer, show_counts=True)
df_json = parse_dataframe_info_to_dict(buffer.getvalue())


In [6]:
plan = ""
with open("planner_output.json", "r") as f:
    plan = f.read()  # Save the parsed DataFrame info to a file

In [7]:
questions = []

with open(os.getenv('QUESTIONS_FILE')) as f:
    questions = [json.loads(line)['question'] for line in f.readlines()]

In [8]:
system_message = SystemMessage(
    content=coder_system_prompt,
)

df_structure = "DataFrame Structure:\n" + json.dumps(df_json, indent=2)

plan = "Plan: "
with open("planner_output.json", "r") as f:
    plan += f.read()

original_request = questions[0]  # Assuming the original request is the 5th question

human_message = HumanMessage(
    content=plan + "\n\n"
    + "Human Request:\n" + original_request + "\n\n"
    + df_structure)
    
messages = [system_message, human_message]

In [9]:
from pydantic import BaseModel, Field

class CodeResponse(BaseModel):
    code: str = Field(description="The Python code to execute the task.")


In [10]:
llm = init_chat_model("openai:gpt-4.1", temperature=0.7, max_retries=3, output_version="responses/v1")
structured_llm = llm.with_structured_output(schema=CodeResponse)

In [11]:
resp = structured_llm.invoke(messages)

In [12]:
print(resp.code)

import pandas as pd
from typing import Tuple


def load_data(filepath: str = 'data.csv') -> pd.DataFrame:
    """
    Load the dataset from a CSV file into a Pandas DataFrame.
    Args:
        filepath (str): Path to the CSV file.
    Returns:
        pd.DataFrame: DataFrame containing the data.
    """
    df = pd.read_csv(filepath)
    return df


def group_data_by_passenger_class(df: pd.DataFrame) -> pd.core.groupby.DataFrameGroupBy:
    """
    Group the DataFrame by the 'Pclass' column to analyze fares for each class separately.
    Args:
        df (pd.DataFrame): Input DataFrame.
    Returns:
        pd.core.groupby.DataFrameGroupBy: Grouped DataFrame object by 'Pclass'.
    """
    if 'Pclass' not in df.columns or 'Fare' not in df.columns:
        raise ValueError("DataFrame must contain 'Pclass' and 'Fare' columns.")
    grouped = df.groupby('Pclass')
    return grouped


def calculate_fare_statistics(grouped: pd.core.groupby.DataFrameGroupBy) -> pd.DataFrame:
    """
    For

In [13]:
import pandas as pd
from typing import Tuple


def load_data(filepath: str = 'data.csv') -> pd.DataFrame:
    """
    Load the dataset from a CSV file into a Pandas DataFrame.
    Args:
        filepath (str): Path to the CSV file.
    Returns:
        pd.DataFrame: DataFrame containing the data.
    """
    df = pd.read_csv(filepath)
    return df


def group_data_by_passenger_class(df: pd.DataFrame) -> pd.core.groupby.DataFrameGroupBy:
    """
    Group the DataFrame by the 'Pclass' column to analyze fares for each class separately.
    Args:
        df (pd.DataFrame): Input DataFrame.
    Returns:
        pd.core.groupby.DataFrameGroupBy: Grouped DataFrame object by 'Pclass'.
    """
    if 'Pclass' not in df.columns or 'Fare' not in df.columns:
        raise ValueError("DataFrame must contain 'Pclass' and 'Fare' columns.")
    grouped = df.groupby('Pclass')
    return grouped


def calculate_fare_statistics(grouped: pd.core.groupby.DataFrameGroupBy) -> pd.DataFrame:
    """
    For each passenger class, calculate the mean, median, and standard deviation of the 'Fare' column.
    Args:
        grouped (pd.core.groupby.DataFrameGroupBy): Grouped DataFrame by 'Pclass'.
    Returns:
        pd.DataFrame: DataFrame with 'Pclass', mean, median, and std of 'Fare'.
    """
    stats = grouped['Fare'].agg(['mean', 'median', 'std']).reset_index()
    stats.rename(columns={'mean': 'Mean Fare', 'median': 'Median Fare', 'std': 'Fare Std'}, inplace=True)
    return stats


def present_distribution_analysis_results(stats: pd.DataFrame) -> None:
    """
    Present the calculated statistics in a clear, tabular format for comparison across classes.
    Args:
        stats (pd.DataFrame): DataFrame with fare statistics by class.
    Returns:
        None
    """
    print("Fare Distribution Analysis by Passenger Class:\n")
    print(stats.to_string(index=False))


def interpret_results(stats: pd.DataFrame) -> None:
    """
    Interpret the differences in mean, median, and standard deviation of fares between the passenger classes.
    Discuss what these differences suggest about fare distribution and passenger class distinctions.
    Args:
        stats (pd.DataFrame): DataFrame with fare statistics by class.
    Returns:
        None
    """
    print("\nInterpretation of Fare Distribution by Passenger Class:")
    for _, row in stats.iterrows():
        pclass = row['Pclass']
        mean_fare = row['Mean Fare']
        median_fare = row['Median Fare']
        std_fare = row['Fare Std']
        print(f"\nPassenger Class {pclass}:")
        print(f"  - Mean Fare: {mean_fare:.2f}")
        print(f"  - Median Fare: {median_fare:.2f}")
        print(f"  - Standard Deviation: {std_fare:.2f}")
    
    max_mean_class = stats.loc[stats['Mean Fare'].idxmax(), 'Pclass']
    min_mean_class = stats.loc[stats['Mean Fare'].idxmin(), 'Pclass']
    print(f"\nThe mean and median fares decrease as passenger class increases (1st to 3rd), indicating that higher classes paid more for their tickets. ")
    print(f"Class {int(max_mean_class)} has the highest mean fare, reflecting the premium nature of first class. Class {int(min_mean_class)} has the lowest, typical of more affordable accommodation.")
    print("Standard deviation is also higher in higher classes, suggesting a wider range of fares paid, possibly due to different types of cabins or services within those classes.")
    print("Overall, the statistics reflect clear distinctions in fare distribution between the passenger classes, with first class passengers generally paying substantially more.")


def main() -> None:
    df = load_data()
    grouped = group_data_by_passenger_class(df)
    stats = calculate_fare_statistics(grouped)
    present_distribution_analysis_results(stats)
    interpret_results(stats)


main()


Fare Distribution Analysis by Passenger Class:

 Pclass  Mean Fare  Median Fare  Fare Std
      0   0.000000       0.0000       NaN
      1  87.961582      69.3000 80.857189
      2  21.471556      15.0458 13.187429
      3  13.229435       8.0500 10.043158

Interpretation of Fare Distribution by Passenger Class:

Passenger Class 0.0:
  - Mean Fare: 0.00
  - Median Fare: 0.00
  - Standard Deviation: nan

Passenger Class 1.0:
  - Mean Fare: 87.96
  - Median Fare: 69.30
  - Standard Deviation: 80.86

Passenger Class 2.0:
  - Mean Fare: 21.47
  - Median Fare: 15.05
  - Standard Deviation: 13.19

Passenger Class 3.0:
  - Mean Fare: 13.23
  - Median Fare: 8.05
  - Standard Deviation: 10.04

The mean and median fares decrease as passenger class increases (1st to 3rd), indicating that higher classes paid more for their tickets. 
Class 1 has the highest mean fare, reflecting the premium nature of first class. Class 0 has the lowest, typical of more affordable accommodation.
Standard deviation 

In [None]:
["median_fare_class1", "69.30"], ["median_fare_class2", "15.05"], ["std_dev_fare_class1", "80.86"], ["mean_fare_class3", "13.23"], ["std_dev_fare_class2", "13.19"], ["mean_fare_class2", "21.47"], ["std_dev_fare_class3", "10.04"], ["mean_fare_class1", "87.96"]