In [12]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model

from langchain_core.messages import (
    AIMessage,
    AnyMessage,
    HumanMessage,
    RemoveMessage,
    SystemMessage,
    ToolMessage,
)

In [13]:
# Load variables from a local .env file into the process environment.
# override=True lets values from the file replace variables that are already set.
# Presumably this supplies the API credentials used by the chat model below — confirm.
load_dotenv(override=True)

True

In [14]:
import json
import re
from typing import Dict, Any


def parse_dataframe_info(info_output: str) -> str:
    """
    Parse the string output of DataFrame.info() into a JSON string.

    Only the per-column table is parsed. The first three lines (class line,
    index summary, "Data columns ...") are skipped and parsing stops at the
    "dtypes:" / "memory usage:" footer. A row is recognised only if it carries
    the "<count> non-null" field, so the text should come from
    ``df.info(show_counts=True)``; rows without that field are ignored.

    Column names containing whitespace (e.g. "speed avg") and dtypes whose
    text contains a space (e.g. "datetime64[ns, UTC]") are supported.

    Args:
        info_output (str): The string output from DataFrame.info()

    Returns:
        str: JSON string (indent=2) shaped as
            {"columns": [{"index": int, "column_name": str,
                          "non_null_count": int, "dtype": str}, ...],
             "total_columns": int}

    Note:
        Lines that do not look like column rows are skipped silently. No
        exception is raised for unrecognised input; it simply yields an empty
        "columns" list and "total_columns" of 0.
    """
    lines = info_output.strip().split('\n')

    # DataFrame.info() always starts with three summary lines before the table.
    header_line_count = 3

    # The column table ends where the footer (dtypes / memory usage) begins.
    column_end_index = len(lines)
    for i, line in enumerate(lines[header_line_count:], header_line_count):
        if line.strip().startswith(('dtypes:', 'memory usage:')):
            column_end_index = i
            break

    # Row layout: <index> <column name> <count> non-null <dtype>.
    # The name is matched lazily so it may contain spaces and still stops at the
    # first "<digits> non-null"; the dtype takes the rest of the (stripped) line.
    row_pattern = re.compile(r'^(\d+)\s+(.+?)\s+(\d+)\s+non-null\s+(.+)$')

    columns_data = []
    for line in lines[header_line_count:column_end_index]:
        line = line.strip()

        # Skip blank lines and the "---  ------" separator under the table header.
        if not line or re.match(r'^[-\s]+$', line):
            continue

        match = row_pattern.match(line)
        if match is None:
            # Table header (" #   Column ...") or any other non-row line.
            continue

        columns_data.append({
            'index': int(match.group(1)),
            'column_name': match.group(2),
            'non_null_count': int(match.group(3)),
            'dtype': match.group(4),
        })

    result = {
        'columns': columns_data,
        'total_columns': len(columns_data),
    }

    return json.dumps(result, indent=2)


def parse_dataframe_info_to_dict(info_output: str) -> Dict[str, Any]:
    """
    Dictionary-returning companion to parse_dataframe_info().

    Delegates the parsing to parse_dataframe_info() and decodes the JSON text
    it produces, so callers get a plain Python dict they can index directly
    instead of a string.

    Args:
        info_output (str): The string output from DataFrame.info()

    Returns:
        Dict[str, Any]: The decoded result, with a "columns" list and a
            "total_columns" count.

    Raises:
        ValueError: Only if the intermediate JSON text cannot be decoded
            (json.JSONDecodeError is a ValueError subclass).
    """
    return json.loads(parse_dataframe_info(info_output))

In [15]:
import io

# Load the curated dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("curated/DEVRT-DACIA-SPRING.csv")
# DataFrame.info() writes its report to a stream rather than returning it,
# so an in-memory text buffer is passed to capture the output.
buffer = io.StringIO()
# show_counts=True forces the "N non-null" field, which the row regex in
# parse_dataframe_info requires; rows without it would be skipped.
df.info(buf=buffer, show_counts=True)
# Column summary as a dict with "columns" and "total_columns" keys (used in the prompt below).
df_json = parse_dataframe_info_to_dict(buffer.getvalue())

In [16]:
# System prompt for the planner model: it must emit a task plan only (no code)
# for a separate coding agent. This text is passed verbatim as the SystemMessage
# content, so any edit to it changes what the model is told.
planner_system_prompt = """
You are the Planner Agent for a Python Data Science and Machine Learning coding assistant. 
Your job is to create a structured coding plan to ensure no part of the user request is overlooked. 

- You do not write code. 
- You only produce a plan for the Coding Agent to implement.
- Each plan must cover every part of the user request as discrete tasks.
- Tasks should be sequential and unambiguous.
- If the user request is unclear or ambiguous, flag which tasks require clarification.
- Include input requirements and expected outputs for each task if applicable.
- Your output should be in a structured format that is easy for a Coding Agent to follow.

Do not skip any part of the user’s request.
Do not make assumptions that are not explicitly stated.
"""

In [17]:
from pydantic import BaseModel, Field
from typing import List

# One step of the plan returned by the planner model (all four fields are required strings).
# NOTE: documented with `#` comments on purpose — pydantic copies a class docstring into
# the model's JSON schema description, which would alter the schema handed to the LLM.
# The Field descriptions below are part of that schema, i.e. they are runtime text.
class Task(BaseModel):
    task_name: str = Field(description="Short description of the coding task.")
    details: str = Field(description="Step-by-step description of what must be done, including any transformations or conditions.")
    dependencies: str = Field(description="Data or previous tasks this step depends on.")
    output: str = Field(description="What this task should produce.")

# Top-level structured-output schema: the list of Task entries that make up the plan.
# This is the schema passed to with_structured_output(), and the saved JSON mirrors it.
# Kept as `#` comments because a class docstring would become part of the JSON schema.
class Plan(BaseModel):
    task_list : List[Task]

In [18]:
# Planner chat model; the "openai:" prefix selects the provider for init_chat_model.
llm = init_chat_model(
    "openai:gpt-4.1",
    temperature=0.7,
    max_retries=3,
    output_version="responses/v1",  # NOTE(review): output-format selector — confirm against the installed langchain-openai version
)
# Wrap the model so replies are parsed against the Plan schema instead of returned as raw messages.
structured_llm = llm.with_structured_output(schema=Plan)

In [19]:
# Planner instructions go in the system message.
system_message = SystemMessage(
    content=planner_system_prompt,
)

# User request plus the DataFrame's column summary.
# Adjacent string literals are joined with no separator, so every fragment carries
# its own trailing space/newline; otherwise sentences run together in the prompt
# (e.g. "mean.Calculate"). The literals are concatenated before the `+` is applied.
human_message = HumanMessage(
    content=(
        "What is the correlation coefficient between altitude and average speed? "
        "Fill missing values with column mean. "
        "Calculate Pearson correlation between 'altitude' and 'speedAvg' columns. "
        "Round to 3 decimal places. Expect a single numerical value. "
        "Assume the dataframe has been loaded and is available as `df`.\n"
        "Structure of the DataFrame:\n"
        + json.dumps(df_json, indent=2)
    )
)

# Conversation sent to the planner: system prompt first, then the user request.
messages = [system_message, human_message]

In [20]:
# Run the planner on the system + human messages. Because structured_llm was built with
# with_structured_output(schema=Plan), resp is expected to be a Plan instance (it is later
# serialized via resp.model_dump()) — presumably a remote API call; confirm credentials are loaded.
resp = structured_llm.invoke(messages)

In [28]:
# Serialize before opening the file: open(..., "w") truncates immediately, so a failure in
# model_dump() / json.dumps() would otherwise leave an empty or partial planner_output.json.
plan_json = json.dumps(resp.model_dump(), indent=2)
# Explicit encoding keeps the written bytes independent of the platform default.
with open("planner_output.json", "w", encoding="utf-8") as f:
    f.write(plan_json)  # Save the structured response to a file