In [21]:
# Windows handles forward slashes correctly in file paths
import json
from pathlib import Path

# Simulate what happens when you read from JSON
json_data = '{"file_name": "data/InfiAgent-DABench/da-dev-tables/abalone.csv"}'
loaded = json.loads(json_data)
file_path_from_json = loaded["file_name"]

print(f"Path from JSON: {file_path_from_json}")

# Convert to Path object - works fine with forward slashes
path_obj = Path(file_path_from_json)
print(f"Path object: {path_obj}")
print(f"Path exists: {path_obj.exists()}")

# If you need native Windows separators, you can get them
print(f"With native separators: {path_obj.as_posix().replace('/', '\\\\')}")

print("\\nKey points:")
print("✓ Windows accepts forward slashes")
print("✓ Path() works with forward slashes") 
print("✓ No conversion needed when reading from JSON")

Path from JSON: data/InfiAgent-DABench/da-dev-tables/abalone.csv
Path object: data\InfiAgent-DABench\da-dev-tables\abalone.csv
Path exists: True
With native separators: data\\InfiAgent-DABench\\da-dev-tables\\abalone.csv
\nKey points:
✓ Windows accepts forward slashes
✓ Path() works with forward slashes
✓ No conversion needed when reading from JSON


In [22]:
import os
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv

load_dotenv(override=True)

questions = Path(os.getenv("QUESTIONS_FILE"))
answers = Path(os.getenv("ANSWERS_FILE"))
# df_json = pd.read_json(questions, lines=True).to_dict(orient='records')
df_questions = pd.read_json(questions, lines=True)
df_answers = pd.read_json(answers, lines=True)
df_merged = df_answers.merge(df_questions, left_on="id", right_on="id", how='inner')

In [23]:
import io
from dataframe_to_dict import parse_dataframe_info

def df_info_to_json(df):
    buffer = io.StringIO()
    df.info(buf=buffer, show_counts=True)
    df_json = parse_dataframe_info(buffer.getvalue())
    return df_json    

In [24]:
from planner import create_plan

path_prefix = Path("data/InfiAgent-DABench/da-dev-tables/")
for index, row in df_merged.iterrows():
    file_name = path_prefix / row['file_name']
    df = pd.read_csv(file_name)
    df_json = df_info_to_json(df)
    plan = create_plan(row['question'], df_json, file_name.as_posix())
    df_merged.at[index, 'plan'] = plan.model_dump_json()


In [25]:
df_merged.to_csv("data/merged_with_plans.csv", index=False)

In [None]:
from coder import create_code

path_prefix = Path("data/InfiAgent-DABench/da-dev-tables/")
for index, row in df_merged.iterrows():
    file_name = path_prefix / row['file_name']
    df = pd.read_csv(path_prefix / row['file_name'])
    df_json = df_info_to_json(df)    
    code = create_code(row['plan'], row['question'], df_json)
    df_merged.at[index, 'code'] = code.model_dump_json()

In [7]:
df_merged.to_csv("data/merged_with_code.csv", index=False)

In [8]:
import json
print(json.loads(df_merged.loc[14, 'code'])['code'])

import pandas as pd
import numpy as np
from typing import Tuple

def load_dataset(filename: str = 'data.csv') -> pd.DataFrame:
    """
    Read the dataset into a Pandas DataFrame from a CSV file.
    """
    df = pd.read_csv(filename)
    return df

def replace_missing_medinc(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace any missing values in the 'MedInc' column with the mean of the non-missing values.
    """
    if 'MedInc' not in df.columns:
        raise ValueError("Column 'MedInc' not found in DataFrame.")
    mean_medinc = df['MedInc'].mean()
    df['MedInc'] = df['MedInc'].fillna(mean_medinc)
    return df

def standardize_aveoccup(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize the 'AveOccup' column using z-scores.
    """
    if 'AveOccup' not in df.columns:
        raise ValueError("Column 'AveOccup' not found in DataFrame.")
    mean_aveoccup = df['AveOccup'].mean()
    std_aveoccup = df['AveOccup'].std()
    if std_aveoccup == 0:
        raise ValueErro

In [None]:
import pandas as pd
import numpy as np
from typing import Tuple

def load_dataset(filename: str = 'data.csv') -> pd.DataFrame:
    """
    Read the dataset into a Pandas DataFrame from a CSV file.
    """
    df = pd.read_csv(filename)
    return df

def replace_missing_medinc(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace any missing values in the 'MedInc' column with the mean of the non-missing values.
    """
    if 'MedInc' not in df.columns:
        raise ValueError("Column 'MedInc' not found in DataFrame.")
    mean_medinc = df['MedInc'].mean()
    df['MedInc'] = df['MedInc'].fillna(mean_medinc)
    return df

def standardize_aveoccup(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize the 'AveOccup' column using z-scores.
    """
    if 'AveOccup' not in df.columns:
        raise ValueError("Column 'AveOccup' not found in DataFrame.")
    mean_aveoccup = df['AveOccup'].mean()
    std_aveoccup = df['AveOccup'].std()
    if std_aveoccup == 0:
        raise ValueError("Standard deviation of 'AveOccup' is zero, cannot standardize.")
    df['AveOccup'] = (df['AveOccup'] - mean_aveoccup) / std_aveoccup
    return df

def create_rooms_per_person(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create a new column 'RoomsPerPerson' by dividing 'AveRooms' by 'Population'.
    """
    if 'AveRooms' not in df.columns or 'Population' not in df.columns:
        raise ValueError("Columns 'AveRooms' and/or 'Population' not found in DataFrame.")
    if (df['Population'] == 0).any():
        raise ValueError("Zero value(s) found in 'Population', cannot divide by zero.")
    df['RoomsPerPerson'] = df['AveRooms'] / df['Population']
    return df

def calculate_pearson_corr(df: pd.DataFrame) -> float:
    """
    Calculate the Pearson correlation coefficient between 'MedianHouseValue' and 'RoomsPerPerson'.
    """
    if 'MedianHouseValue' not in df.columns or 'RoomsPerPerson' not in df.columns:
        raise ValueError("Columns 'MedianHouseValue' and/or 'RoomsPerPerson' not found in DataFrame.")
    col1 = df['MedianHouseValue']
    col2 = df['RoomsPerPerson']
    corr = col1.corr(col2)
    return corr

def mean_std_medianhousevalue(df: pd.DataFrame) -> Tuple[float, float]:
    """
    Compute the mean and standard deviation of the 'MedianHouseValue' column.
    """
    if 'MedianHouseValue' not in df.columns:
        raise ValueError("Column 'MedianHouseValue' not found in DataFrame.")
    mean_val = df['MedianHouseValue'].mean()
    std_val = df['MedianHouseValue'].std()
    return mean_val, std_val

def main() -> None:
    df = load_dataset('my_test_01.csv')
    df = replace_missing_medinc(df)
    df = standardize_aveoccup(df)
    df = create_rooms_per_person(df)
    pearson_corr = calculate_pearson_corr(df)
    mean_mhv, std_mhv = mean_std_medianhousevalue(df)
    print(f"Pearson correlation between MedianHouseValue and RoomsPerPerson: {pearson_corr}")
    print(f"Mean of MedianHouseValue: {mean_mhv}")
    print(f"Standard deviation of MedianHouseValue: {std_mhv}")

if __name__ == "__main__":
    main()

Pearson correlation coefficient between MedianHouseValue and RoomsPerPerson: 0.0382234007362979
Mean of MedianHouseValue: 2.1225820930232557
Standard deviation of MedianHouseValue: 1.2209690905546158


In [14]:
df_merged.loc[14, 'common_answers']

[['pearson_coefficient', '0.0382'], ['mean_value', '2.1226']]

In [15]:
df_merged.loc[14, 'file_name']

'my_test_01.csv'