In [21]:
# Windows handles forward slashes correctly in file paths
import json
from pathlib import Path

# Simulate what happens when you read from JSON
json_data = '{"file_name": "data/InfiAgent-DABench/da-dev-tables/abalone.csv"}'
loaded = json.loads(json_data)
file_path_from_json = loaded["file_name"]

print(f"Path from JSON: {file_path_from_json}")

# Convert to Path object - works fine with forward slashes
path_obj = Path(file_path_from_json)
print(f"Path object: {path_obj}")
print(f"Path exists: {path_obj.exists()}")

# If you need native Windows separators, you can get them
print(f"With native separators: {path_obj.as_posix().replace('/', '\\\\')}")

print("\\nKey points:")
print("✓ Windows accepts forward slashes")
print("✓ Path() works with forward slashes") 
print("✓ No conversion needed when reading from JSON")

Path from JSON: data/InfiAgent-DABench/da-dev-tables/abalone.csv
Path object: data\InfiAgent-DABench\da-dev-tables\abalone.csv
Path exists: True
With native separators: data\\InfiAgent-DABench\\da-dev-tables\\abalone.csv
\nKey points:
✓ Windows accepts forward slashes
✓ Path() works with forward slashes
✓ No conversion needed when reading from JSON


In [1]:
import os
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv

load_dotenv(override=True)

questions = Path(os.getenv("QUESTIONS_FILE"))
answers = Path(os.getenv("ANSWERS_FILE"))
# df_json = pd.read_json(questions, lines=True).to_dict(orient='records')
df_questions = pd.read_json(questions, lines=True)
df_answers = pd.read_json(answers, lines=True)
df_merged = df_answers.merge(df_questions, left_on="id", right_on="id", how='inner')

In [2]:
import io
from dataframe_to_dict import parse_dataframe_info

def df_info_to_json(df):
    buffer = io.StringIO()
    df.info(buf=buffer, show_counts=True)
    df_json = parse_dataframe_info(buffer.getvalue())
    return df_json    

In [3]:
from planner import create_plan

path_prefix = Path("data/InfiAgent-DABench/da-dev-tables/")
for index, row in df_merged.iterrows():
    file_name = path_prefix / row['file_name']
    df = pd.read_csv(file_name)
    df_json = df_info_to_json(df)
    plan = create_plan(row['question'], df_json, file_name.as_posix())
    df_merged.at[index, 'plan'] = plan.model_dump_json()


In [4]:
df_merged.loc[14, 'plan']



In [5]:
df_merged.to_csv("data/merged_with_plans.csv", index=False)

In [None]:
from coder import create_code

path_prefix = Path("data/InfiAgent-DABench/da-dev-tables/")
for index, row in df_merged.iterrows():
    df = pd.read_csv(path_prefix / row['file_name'])
    df_json = df_info_to_json(df)
    code = create_code(row['plan'], row['question'], df_json, row['file_name'])
    df_merged.at[index, 'code'] = code.model_dump_json()

In [7]:
df_merged.to_csv("data/merged_with_code.csv", index=False)

In [35]:
import json
print(json.loads(df_merged.loc[14, 'plan']))

{'task_list': [{'task_name': 'Load the dataset', 'details': "Read the CSV file 'data/InfiAgent-DABench/da-dev-tables/my_test_01.csv' into a Pandas DataFrame.", 'dependencies': 'None', 'output': 'Loaded DataFrame containing all columns.', 'assumptions': 'CSV file and all columns exist as described in the data_frame_structure.'}, {'task_name': 'Replace missing values in MedInc column with mean', 'details': "Check the 'MedInc' column for missing values and replace any missing entries with the mean of the column.", 'dependencies': 'Loaded DataFrame', 'output': "DataFrame with missing values in 'MedInc' replaced by the column mean.", 'assumptions': 'Missing values are represented as NaN.'}, {'task_name': 'Standardize AveOccup column using z-scores', 'details': "Calculate the z-score for each value in the 'AveOccup' column and replace the column with its standardized version.", 'dependencies': "DataFrame with missing values in 'MedInc' handled", 'output': "DataFrame with the 'AveOccup' colum

In [34]:
import pandas as pd
import numpy as np
from typing import Tuple

def load_dataset(filepath: str) -> pd.DataFrame:
    """
    Read the CSV file 'data/InfiAgent-DABench/da-dev-tables/my_test_01.csv' into a Pandas DataFrame.
    """
    df = pd.read_csv(filepath)
    return df

def replace_missing_medinc_with_mean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Check the 'MedInc' column for missing values and replace any missing entries with the mean of the column.
    """
    if 'MedInc' not in df.columns:
        raise ValueError("Column 'MedInc' does not exist in the DataFrame.")
    mean_value = df['MedInc'].mean()
    df['MedInc'] = df['MedInc'].fillna(mean_value)
    return df

def standardize_aveoccup(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the z-score for each value in the 'AveOccup' column and replace the column with its standardized version.
    """
    if 'AveOccup' not in df.columns:
        raise ValueError("Column 'AveOccup' does not exist in the DataFrame.")
    aveoccup_mean = df['AveOccup'].mean()
    aveoccup_std = df['AveOccup'].std()
    if aveoccup_std == 0:
        raise ValueError("Standard deviation of 'AveOccup' is zero, cannot standardize.")
    df['AveOccup'] = (df['AveOccup'] - aveoccup_mean) / aveoccup_std
    return df

def create_rooms_per_person(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a new column 'RoomsPerPerson' to the DataFrame by dividing 'AveRooms' by 'Population' for each row.
    """
    if 'AveRooms' not in df.columns or 'Population' not in df.columns:
        raise ValueError("Required columns 'AveRooms' or 'Population' are missing in the DataFrame.")
    if (df['Population'] == 0).any():
        raise ValueError("'Population' column contains zero(s), cannot divide by zero.")
    df['RoomsPerPerson'] = df['AveRooms'] / df['Population']
    return df

def calculate_pearson_corr(df: pd.DataFrame) -> float:
    """
    Compute the Pearson correlation coefficient between the 'MedianHouseValue' and 'RoomsPerPerson' columns.
    """
    if 'MedianHouseValue' not in df.columns or 'RoomsPerPerson' not in df.columns:
        raise ValueError("Required columns 'MedianHouseValue' or 'RoomsPerPerson' are missing in the DataFrame.")
    corr = df['MedianHouseValue'].corr(df['RoomsPerPerson'])
    return corr

def calculate_mean_std_medianhousevalue(df: pd.DataFrame) -> Tuple[float, float]:
    """
    Compute the mean and standard deviation of the 'MedianHouseValue' column.
    """
    if 'MedianHouseValue' not in df.columns:
        raise ValueError("Column 'MedianHouseValue' does not exist in the DataFrame.")
    mean_val = df['MedianHouseValue'].mean()
    std_val = df['MedianHouseValue'].std()
    return mean_val, std_val

def main() -> None:
    # Task 1: Load the dataset
    df = load_dataset('data/InfiAgent-DABench/da-dev-tables/my_test_01.csv')
    # Task 2: Replace missing values in MedInc column with mean
    df = replace_missing_medinc_with_mean(df)
    # Task 3: Standardize AveOccup column using z-scores
    df = standardize_aveoccup(df)
    # Task 4: Create RoomsPerPerson feature
    df = create_rooms_per_person(df)
    # Task 5: Calculate Pearson correlation coefficient between MedianHouseValue and RoomsPerPerson
    pearson_corr = calculate_pearson_corr(df)
    print(f"Pearson correlation coefficient between MedianHouseValue and RoomsPerPerson: {pearson_corr}")
    # Task 6: Calculate mean and standard deviation of MedianHouseValue
    mean_val, std_val = calculate_mean_std_medianhousevalue(df)
    print(f"Mean of MedianHouseValue: {mean_val}")
    print(f"Standard deviation of MedianHouseValue: {std_val}")

if __name__ == "__main__":
    main()

Pearson correlation coefficient between MedianHouseValue and RoomsPerPerson: 0.0382234007362979
Mean of MedianHouseValue: 2.1225820930232557
Standard deviation of MedianHouseValue: 1.2209690905546158


In [14]:
df_merged.loc[14, 'common_answers']

[['pearson_coefficient', '0.0382'], ['mean_value', '2.1226']]

In [15]:
df_merged.loc[14, 'file_name']

'my_test_01.csv'