# **SETUP**

In [1]:
# Load dotenv
import os, pathlib
from dotenv import load_dotenv, set_key
# Read the .env file so its variables become visible through os.environ.
load_dotenv()

# The working directory is the default project root, and the .env file is
# expected to sit directly inside it.
PROJECT_ROOT = pathlib.Path().resolve()
ENV_PATH = PROJECT_ROOT / ".env"
if not ENV_PATH.exists():
    raise FileNotFoundError(".env file not found.")

# A PROJECT_ROOT already present in the environment takes precedence;
# otherwise the working directory is written to .env as PROJECT_ROOT.
if "PROJECT_ROOT" in os.environ:
    PROJECT_ROOT = pathlib.Path(os.environ["PROJECT_ROOT"])
else:
    set_key(str(ENV_PATH), "PROJECT_ROOT", str(PROJECT_ROOT))

# The OpenAI key is optional at this point: a missing key is only reported.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print("OpenAI API Key not found")

In [2]:
# Import libraries
import pandas as pd
import numpy as np

# Import route_query function
import import_ipynb
from route_query import route_query

# **USER ACTIONS**
<hr>

- `user_load_dataset()`: prompts the user to pick a CSV file from the `datasets` folder and loads it into a DataFrame
- `user_actions()`: repeatedly prompts the user for a query to run against the DataFrame until they exit

In [3]:
def user_load_dataset(datasets_path="datasets"):
    """Interactively pick a CSV file from a folder and load it into a DataFrame.

    Lists the CSV files found in ``datasets_path``, asks the user for either the
    menu number or the file name, and reads the chosen file with pandas.

    Args:
        datasets_path: Folder that is searched for ``.csv`` files. Defaults to
            ``"datasets"`` (relative to the current working directory).

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when nothing was loaded:
        the folder is missing, it holds no CSV files, the user typed ``x``,
        the selection was invalid, or the file could not be read.
    """
    # isdir rather than exists: a regular file with this name would otherwise
    # pass the check and make os.listdir raise.
    if not os.path.isdir(datasets_path):
        print(f"'{datasets_path}' folder not found.")
        return None

    print("LOAD YOUR DATASET TO GET STARTED!")
    # os.listdir returns entries in arbitrary order; sorting keeps the menu
    # numbering stable between calls.
    file_paths = sorted(
        f for f in os.listdir(datasets_path) if f.lower().endswith(".csv")
    )
    if not file_paths:
        print(f"No CSV files found in '{datasets_path}' folder.")
        return None

    print(f"CSV files in '{datasets_path}':")
    for number, fname in enumerate(file_paths, start=1):
        print(f"{number}. {fname}")

    print(f"\nNot seeing your data? Make sure your CSV file is in '{datasets_path}' folder")
    print("Type x to exit.")
    # Strip so that stray spaces around the answer do not break the selection.
    user_file = input(
        "To load your dataset, input either the number or name of your desired CSV: "
    ).strip()

    if user_file.lower() == "x":
        print("Exiting.")
        return None

    # isdecimal, not isnumeric: isnumeric also accepts characters such as
    # superscripts and fractions, which int() rejects with ValueError.
    if user_file.isdecimal():
        idx = int(user_file) - 1
        if not 0 <= idx < len(file_paths):
            print("Invalid number.")
            return None
        selected_file = file_paths[idx]
    elif user_file in file_paths:
        selected_file = user_file
    else:
        # Fall back to a case-insensitive match that also tolerates a missing
        # ".csv" extension; only accept it when it is unambiguous.
        wanted = user_file.lower()
        loose_matches = [
            f for f in file_paths if f.lower() in (wanted, wanted + ".csv")
        ]
        if len(loose_matches) != 1:
            print("File not found.")
            return None
        selected_file = loose_matches[0]

    try:
        df = pd.read_csv(os.path.join(datasets_path, selected_file))
    except (
        pd.errors.EmptyDataError,
        pd.errors.ParserError,
        UnicodeDecodeError,
        OSError,
    ) as err:
        # Report the problem and return None, like the other failure paths,
        # instead of aborting the whole interactive session.
        print(f"Could not load '{selected_file}': {err}")
        return None

    print(f"\nLoaded '{selected_file}' with shape {df.shape}")
    print(df.head(5))
    return df

In [None]:
# Sample natural-language queries, printed verbatim by user_actions() when the
# user picks menu option "1".
example_query = """Here is the query example that you could use:
"Give me a summary of all numerical columns"
"Find cardinality of categorical columns"
"Show me descriptive statistics for 'price' and 'age' columns"
"Handle missing values in my dataset"
"Suggest ways to impute missing data"
"Fill missing values with appropriate methods"
"Find duplicate rows"
"Remove duplicate records keeping the first occurrence"
"Show me similar records in customer data"
"Analyze duplicate patterns in the dataset"
"Flag duplicates for review before removal"
"Optimize data types to save memory"
"Convert date columns automatically"
"Fix percentage columns"
"Analyze current data types"
"Convert to categorical types"
"Detect boolean columns"
"Show memory usage comparison"
"Find outliers in price column"
"Remove extreme values using IQR method"
"Show me outlier patterns in the dataset"
"Cap outliers using Z-score method"
"Analyze outliers across all numeric columns"
"Handle outliers with log transformation"
"Clean text columns"
"Standardize city names"
"Extract numbers from product codes"
"Convert text to title case"
"Remove special characters from names"
"Validate email format patterns"
"Validate email addresses"
"Check if ages are in reasonable range"
"Find data quality issues"
"Validate phone number formats"
"Check categorical values against allowed list"
"Generate data validation report"
"Create age groups in 10-year bins"
"Calculate price per square foot ratio"
"Generate interaction features between income and education"
"Add 1 and 2 period lag features for sales"
"Create 7-day rolling average"
"Generate quadratic polynomial features"
"Normalize numeric columns"
"Scale features for machine learning"
"One-hot encode categorical variables"
"Handle rare categories in city column"
"Create dummy variables without multicollinearity"
"Prepare dataset for machine learning"
"Apply robust scaling for outlier-resistant standardization"
"Export to Excel with proper formatting"
"Create a comprehensive data dictionary"
"Generate detailed summary report"
"Export to CSV with custom encoding"
"Create codebook for documentation"
"Export to JSON with indentation"
"Save data to multiple Excel sheets"
"Generate a comprehensive data profile report"
"Calculate data quality metrics"
"Analyze correlations in the data"
"Detect patterns in the data"
"Find multicollinearity issues"
"Assess overall data quality score"
"""
def user_actions(df):
    """Run the interactive query loop against a DataFrame.

    Shows a small menu, reads one line from the user and either prints the
    example queries, exits, or forwards the text to ``route_query``.

    Args:
        df: The DataFrame the queries operate on.

    Returns:
        The DataFrame as it stands when the user exits with ``x``.
    """
    divider = "-----------------------------------------------------------------"
    while True:
        print(divider)
        print("What do you want the agent to perform?")
        print("1. View example queries")
        print("x. Exit")
        print(divider)
        # Strip so that "x " or " 1" are still recognised as menu choices.
        user_query = input().strip()

        if not user_query:
            # Nothing typed: show the menu again instead of sending an empty
            # query to the router.
            continue
        if user_query.lower() == "x":
            print("Exiting.")
            return df
        if user_query == "1":
            print(example_query)
            continue

        # route_query is imported from another notebook; presumably it returns
        # the updated DataFrame -- TODO confirm. Guard against a None result so
        # the working DataFrame is never lost.
        result = route_query(user_query, df)
        if result is None:
            print("The agent returned no dataset; keeping the previous one.")
        else:
            df = result

**MAIN FUNCTION**
-

In [5]:
def main():
    """Load a dataset interactively, run the query loop, and print the result.

    ``user_load_dataset()`` returns ``None`` for every failure, including the
    user typing ``x`` and the cases where it never asks for input (missing
    folder, no CSV files). Retrying unconditionally would therefore either
    trap the user or spin forever, so the user is asked before each retry.

    Returns:
        None.
    """
    df = user_load_dataset()
    while df is None:
        # input() blocks here, which prevents a busy loop and gives the user a
        # way out.
        retry = input("No dataset loaded. Press Enter to try again, or type x to quit: ")
        if retry.strip().lower() == "x":
            print("Exiting.")
            return
        df = user_load_dataset()

    df = user_actions(df)
    print("========== FINAL DF ==========")
    print(df)

**Testing**

In [6]:
# Start the interactive session only when this notebook is run directly, so
# that importing it as a module does not trigger the blocking input() loop.
if __name__ == "__main__":
    main()

LOAD YOUR DATASET TO GET STARTED!
CSV files in 'datasets':
1. Life Expectancy Data.csv
2. household_vista_2023_2024.csv
3. smoke.csv

Not seeing your data? Make sure your CSV file is in 'datasets' folder
Type x to exit.

Loaded 'Life Expectancy Data.csv' with shape (2938, 22)
       Country  Year      Status  Life expectancy   Adult Mortality  \
0  Afghanistan  2015  Developing              65.0            263.0   
1  Afghanistan  2014  Developing              59.9            271.0   
2  Afghanistan  2013  Developing              59.9            268.0   
3  Afghanistan  2012  Developing              59.5            272.0   
4  Afghanistan  2011  Developing              59.2            275.0   

   infant deaths  Alcohol  percentage expenditure  Hepatitis B  Measles   ...  \
0             62     0.01               71.279624         65.0      1154  ...   
1             64     0.01               73.523582         62.0       492  ...   
2             66     0.01               73.219243    