In [1]:
import os
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv

load_dotenv(override=True)

# Both input locations come from the environment (optionally populated by the
# .env file loaded above). Check them up front: Path(None) would otherwise
# fail with an opaque TypeError that does not name the missing variable.
_missing_vars = [name for name in ("QUESTIONS_FILE", "ANSWERS_FILE") if not os.getenv(name)]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_vars)
    )

questions = Path(os.environ["QUESTIONS_FILE"])
answers = Path(os.environ["ANSWERS_FILE"])

# Both files are parsed as JSON Lines (lines=True: one record per line).
df_questions = pd.read_json(questions, lines=True)
df_answers = pd.read_json(answers, lines=True)

# Inner join on the shared "id" column: only ids present in both frames are kept.
df_merged = df_answers.merge(df_questions, on="id", how="inner")

In [2]:
import io
from dataframe_to_dict import parse_dataframe_info

def df_info_to_json(df):
    """Capture the text report of ``df.info()`` and pass it to ``parse_dataframe_info``.

    The report is written into an in-memory text buffer (with
    ``show_counts=True``) rather than to stdout. The return value is whatever
    ``parse_dataframe_info`` produces for that text.
    """
    info_text = io.StringIO()
    df.info(buf=info_text, show_counts=True)
    return parse_dataframe_info(info_text.getvalue())

In [3]:

from planner import create_plan

path_prefix = Path("data/InfiAgent-DABench/da-dev-tables/")

# Build one plan per merged row, in row order. The results are collected in a
# list and written as a single column afterwards, instead of enlarging the
# frame cell-by-cell (via .at on a not-yet-existing column) inside the loop.
plans = []
for file_name, question in zip(df_merged['file_name'], df_merged['question']):
    df = pd.read_csv(path_prefix / file_name)
    df_json = df_info_to_json(df)
    plan = create_plan(question, df_json)
    # Keep the plan as the JSON string returned by model_dump_json().
    plans.append(plan.model_dump_json())

df_merged['plan'] = plans


In [4]:
# Write the merged frame to CSV, leaving out the row index.
plans_csv = "data/merged_with_plans.csv"
df_merged.to_csv(plans_csv, index=False)

In [5]:
from coder import create_code

path_prefix = Path("data/InfiAgent-DABench/da-dev-tables/")

# Generate code for every merged row, in row order, from its stored plan,
# its question and the schema summary of its table. The results are collected
# in a list and assigned as one column afterwards, instead of enlarging the
# frame cell-by-cell (via .at on a not-yet-existing column) inside the loop.
codes = []
for file_name, plan_json, question in zip(
    df_merged['file_name'], df_merged['plan'], df_merged['question']
):
    df = pd.read_csv(path_prefix / file_name)
    df_json = df_info_to_json(df)
    code = create_code(plan_json, question, df_json)
    # Keep the result as the JSON string returned by model_dump_json().
    codes.append(code.model_dump_json())

df_merged['code'] = codes



In [6]:
# Write the merged frame to CSV, leaving out the row index.
code_csv = "data/merged_with_code.csv"
df_merged.to_csv(code_csv, index=False)

In [12]:
import json
# Decode the JSON stored in row 14's 'code' cell and print its 'code' field.
generated = json.loads(df_merged.loc[14, 'code'])
print(generated['code'])

import pandas as pd
import numpy as np
from typing import Tuple

def load_dataset(filepath: str = "data.csv") -> pd.DataFrame:
    """
    Load the dataset from a CSV file into a Pandas DataFrame.
    """
    df = pd.read_csv(filepath)
    expected_columns = [
        "MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population",
        "AveOccup", "Latitude", "Longitude", "MedianHouseValue"
    ]
    if not all(col in df.columns for col in expected_columns):
        raise ValueError("CSV file missing one or more required columns.")
    return df

def replace_missing_medinc_with_mean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Identify missing values (NaN) in 'MedInc' and replace them with the mean value of 'MedInc'.
    """
    if 'MedInc' not in df.columns:
        raise ValueError("Column 'MedInc' not found in DataFrame.")
    mean_medinc = df['MedInc'].mean()
    df['MedInc'] = df['MedInc'].fillna(mean_medinc)
    return df

def standardize_aveoccup(df: pd.DataFrame) -> pd.DataFr

In [None]:
import pandas as pd
import numpy as np
from typing import Tuple

def load_dataset(filepath: str = "data.csv") -> pd.DataFrame:
    """
    Load the dataset from a CSV file into a Pandas DataFrame.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read (defaults to "data.csv").

    Returns
    -------
    pd.DataFrame
        The parsed file contents.

    Raises
    ------
    ValueError
        If any expected column is absent; the message lists the missing names.
    """
    df = pd.read_csv(filepath)
    expected_columns = [
        "MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population",
        "AveOccup", "Latitude", "Longitude", "MedianHouseValue"
    ]
    # Collect the absent names so the error says exactly what is missing.
    missing_columns = [col for col in expected_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(
            "CSV file missing one or more required columns: "
            + ", ".join(missing_columns)
        )
    return df

def replace_missing_medinc_with_mean(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill NaN entries of the 'MedInc' column with that column's mean.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned. Raises ValueError when 'MedInc' is absent.
    """
    column = 'MedInc'
    if column not in df.columns:
        raise ValueError("Column 'MedInc' not found in DataFrame.")
    med_inc = df[column]
    df[column] = med_inc.fillna(med_inc.mean())
    return df

def standardize_aveoccup(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize the 'AveOccup' column using z-scores (mean=0, std=1).

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned. The standard deviation is the one returned by
    ``Series.std()``.

    Raises
    ------
    ValueError
        If 'AveOccup' is absent, or if its standard deviation is zero or
        undefined (NaN), since dividing by either would not yield usable
        z-scores.
    """
    if 'AveOccup' not in df.columns:
        raise ValueError("Column 'AveOccup' not found in DataFrame.")
    occupancy = df['AveOccup']
    mean = occupancy.mean()
    std = occupancy.std()
    # A NaN std (e.g. a single-row frame) is not caught by `std == 0` and
    # would silently turn the whole column into NaN, so reject it explicitly.
    if pd.isna(std):
        raise ValueError("Standard deviation of 'AveOccup' is undefined (NaN), cannot standardize.")
    if std == 0:
        raise ValueError("Standard deviation of 'AveOccup' is zero, cannot standardize.")
    df['AveOccup'] = (occupancy - mean) / std
    return df

def create_rooms_per_person(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a 'RoomsPerPerson' column equal to 'AveRooms' divided by 'Population'.

    The new column is written onto the DataFrame that was passed in, and that
    same DataFrame is returned. Raises ValueError when either input column is
    absent or when any 'Population' value equals zero.
    """
    required = ('AveRooms', 'Population')
    if any(name not in df.columns for name in required):
        raise ValueError("Columns 'AveRooms' and/or 'Population' not found in DataFrame.")
    population = df['Population']
    if population.eq(0).any():
        raise ValueError("Zero values found in 'Population', cannot divide by zero.")
    df['RoomsPerPerson'] = df['AveRooms'] / population
    return df

def calculate_pearson_corr(df: pd.DataFrame) -> float:
    """
    Return the correlation of 'MedianHouseValue' with 'RoomsPerPerson'.

    Uses ``Series.corr`` with its default method (Pearson). Raises ValueError
    when either column is absent from the DataFrame.
    """
    needed = ('MedianHouseValue', 'RoomsPerPerson')
    if any(name not in df.columns for name in needed):
        raise ValueError("Columns 'MedianHouseValue' and/or 'RoomsPerPerson' not found in DataFrame.")
    house_value = df['MedianHouseValue']
    rooms_per_person = df['RoomsPerPerson']
    return house_value.corr(rooms_per_person)

def calculate_mean_std_medianhousevalue(df: pd.DataFrame) -> Tuple[float, float]:
    """
    Return ``(mean, std)`` of the 'MedianHouseValue' column.

    Both values come from the pandas ``Series.mean()`` / ``Series.std()``
    methods. Raises ValueError when the column is absent.
    """
    target = 'MedianHouseValue'
    if target not in df.columns:
        raise ValueError("Column 'MedianHouseValue' not found in DataFrame.")
    house_value = df[target]
    return house_value.mean(), house_value.std()

def main(file_path: str = "my_test_01.csv") -> None:
    """
    Execute the data preprocessing and analysis pipeline as described in the plan.

    Parameters
    ----------
    file_path : str
        CSV file to analyse. Defaults to "my_test_01.csv", the path this
        function previously had hard-coded.

    Prints the Pearson correlation between MedianHouseValue and
    RoomsPerPerson, then the mean and standard deviation of MedianHouseValue
    taken from the data as loaded (before any preprocessing step).
    """
    # Task 1: Load the dataset once and keep the untouched frame for Task 6,
    # so the file does not have to be read from disk a second time.
    original_df = load_dataset(file_path)
    # The preprocessing steps write into the frame they are given, so run
    # them on a copy to leave original_df as loaded.
    df = original_df.copy()

    # Task 2: Replace missing values in MedInc with the mean
    df = replace_missing_medinc_with_mean(df)

    # Task 3: Standardize the AveOccup column using z-scores
    df = standardize_aveoccup(df)

    # Task 4: Create RoomsPerPerson feature
    df = create_rooms_per_person(df)

    # Task 5: Calculate Pearson correlation between MedianHouseValue and RoomsPerPerson
    pearson_corr = calculate_pearson_corr(df)
    print(f"Pearson correlation coefficient between MedianHouseValue and RoomsPerPerson: {pearson_corr}")

    # Task 6: Calculate mean and standard deviation of MedianHouseValue from the original data
    mean_mhv, std_mhv = calculate_mean_std_medianhousevalue(original_df)
    print(f"Mean of MedianHouseValue: {mean_mhv}")
    print(f"Standard deviation of MedianHouseValue: {std_mhv}")

# Run the pipeline; its three result lines are printed to the cell output.
main()


Pearson correlation coefficient between MedianHouseValue and RoomsPerPerson: 0.0382234007362979
Mean of MedianHouseValue: 2.1225820930232557
Standard deviation of MedianHouseValue: 1.2209690905546158


In [14]:
# Display row 14's 'common_answers' entry (presumably the benchmark's
# reference answers for this question; confirm against the dataset docs).
df_merged.loc[14, 'common_answers']

[['pearson_coefficient', '0.0382'], ['mean_value', '2.1226']]

In [15]:
# Display the 'file_name' value stored for row 14.
df_merged.loc[14, 'file_name']

'my_test_01.csv'