# **SETUP**

In [1]:
# Load dotenv
import os, pathlib
from dotenv import load_dotenv, set_key
load_dotenv()

# Store project root in .env
# The current working directory is taken as the project root; a .env file
# must already exist there.
PROJECT_ROOT = pathlib.Path().resolve()
ENV_PATH = PROJECT_ROOT / ".env"
if not ENV_PATH.exists():
    # Report the exact location that was checked so the error is actionable.
    raise FileNotFoundError(f".env file not found at '{ENV_PATH}'.")

if "PROJECT_ROOT" not in os.environ:
    # No PROJECT_ROOT in the environment yet: write it into the .env file.
    set_key(str(ENV_PATH), "PROJECT_ROOT", str(PROJECT_ROOT))
else:
    # A PROJECT_ROOT value is already in the environment; it takes precedence
    # over the working directory.
    PROJECT_ROOT = pathlib.Path(os.environ["PROJECT_ROOT"])

# Get API Key
# A missing key is only reported here, not raised.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print("OpenAI API Key not found")

In [2]:
# Import libraries
import pandas as pd
import numpy as np

# Import route_query function
import import_ipynb
from route_query import route_query

# **USER ACTIONS**
<hr>

- `user_load_dataset()`: prompts the user to pick a CSV file from the `datasets` folder and loads it into a DataFrame
- `user_actions()`: prompts the user for commands to run on the loaded DataFrame

In [3]:
def user_load_dataset():
    """Interactively pick a CSV from the 'datasets' folder and load it.

    The CSV files in the folder are listed (sorted by name) and the user is
    asked for either the 1-based list number or the exact file name.
    Typing ``x`` cancels.

    Returns:
        pandas.DataFrame | None: the loaded data, or None when the folder is
        missing, holds no CSV files, the user exits, or the selection does
        not match a listed file.
    """
    # Check dataset folder
    datasets_path = "datasets"
    # isdir rather than exists: a plain file called 'datasets' would pass an
    # exists() check and then make os.listdir raise.
    if not os.path.isdir(datasets_path):
        print(f"'{datasets_path}' folder not found.")
        return None

    print("LOAD YOUR DATASET TO GET STARTED!")
    # List existing CSVs in dataset folder. os.listdir returns entries in
    # arbitrary order, so sort to keep the menu numbering stable between runs.
    file_paths = sorted(
        f for f in os.listdir(datasets_path) if f.lower().endswith('.csv')
    )
    if not file_paths:
        print(f"No CSV files found in '{datasets_path}' folder.")
        return None

    print(f"CSV files in '{datasets_path}':")
    for i, fname in enumerate(file_paths, start=1):
        print(f"{i}. {fname}")

    print("\nNot seeing your data? Make sure your CSV file is in 'datasets' folder")
    print("Type x to exit.")
    # Strip so stray spaces around the answer do not defeat the matching below.
    user_file = input(
        "To load your dataset, input either the number or name of your desired CSV: "
    ).strip()

    if user_file.lower() == "x":
        print("Exiting.")
        return None

    # Try to load by number. isdecimal() (not isnumeric()) because isnumeric()
    # is also True for characters like '²' or '½', which int() cannot parse.
    if user_file.isdecimal():
        idx = int(user_file) - 1
        if 0 <= idx < len(file_paths):
            selected_file = file_paths[idx]
        else:
            print("Invalid number.")
            return None
    # Not numeric: try to load by exact file name
    elif user_file in file_paths:
        selected_file = user_file
    else:
        print("File not found.")
        return None

    df = pd.read_csv(os.path.join(datasets_path, selected_file))
    print(f"\nLoaded '{selected_file}' with shape {df.shape}")
    print(df.head(5))
    return df

In [4]:
def user_actions(df):
    """Run the interactive command loop on ``df`` until the user exits.

    Menu commands: ``1`` prints the head of the frame, ``2`` writes it to
    ``datasets_out/output.csv``, ``x`` exits. Any other non-empty input is
    passed to ``route_query`` and ``df`` is rebound to its return value.

    Args:
        df: the DataFrame to operate on.

    Returns:
        The DataFrame as it stands when the user exits.
    """
    out_path = "datasets_out"
    while True:
        print("\nCommands:")
        print("1. View dataframe (head)")
        print("2. Output dataframe to CSV")
        print("x. Exit")
        # Strip so that e.g. "x " or " 1" are still recognised as menu
        # commands instead of being forwarded to route_query.
        user_query = input("\nEnter a command number or a data cleaning/query command: ").strip()
        if not user_query:
            # Nothing typed: show the menu again rather than routing an empty query.
            continue
        if user_query.lower() == "x":
            print("Exiting.")
            return df
        elif user_query == "1":
            print(df.head())
            _ = input("enter anything to continue")
        elif user_query == "2":
            # exist_ok avoids the separate exists() check.
            os.makedirs(out_path, exist_ok=True)
            out_file = os.path.join(out_path, "output.csv")
            df.to_csv(out_file, index=False)
            print(f"Dataframe saved to {out_file}")
            _ = input("enter anything to continue")
        else:
            # Calls route_query
            # NOTE(review): assumes route_query always returns a DataFrame;
            # confirm it cannot return None on failure.
            df = route_query(user_query, df)

# **MAIN FUNCTION**

In [5]:
def main():
    """Entry point: load a dataset, run the command loop, print the result.

    user_load_dataset() returns None both for an invalid selection and when
    the user asks to exit (or the datasets folder is missing). Retrying
    unconditionally on None would therefore never terminate, so each failed
    attempt is followed by an explicit retry-or-quit prompt.

    Returns:
        None. The final DataFrame is printed, not returned.
    """
    df = user_load_dataset()
    while df is None:
        retry = input("\nNo dataset loaded. Press Enter to try again or type x to quit: ")
        if retry.strip().lower() == "x":
            print("Exiting.")
            return
        df = user_load_dataset()
    df = user_actions(df)
    print("========== FINAL DF ==========")
    print(df)

**Testing**

In [6]:
# Start the interactive session; this cell waits on input() prompts.
main()

LOAD YOUR DATASET TO GET STARTED!
CSV files in 'datasets':
1. Life Expectancy Data.csv
2. household_vista_2023_2024.csv
3. smoke.csv

Not seeing your data? Make sure your CSV file is in 'datasets' folder
Type x to exit.

Loaded 'Life Expectancy Data.csv' with shape (2938, 22)
       Country  Year      Status  Life expectancy   Adult Mortality  \
0  Afghanistan  2015  Developing              65.0            263.0   
1  Afghanistan  2014  Developing              59.9            271.0   
2  Afghanistan  2013  Developing              59.9            268.0   
3  Afghanistan  2012  Developing              59.5            272.0   
4  Afghanistan  2011  Developing              59.2            275.0   

   infant deaths  Alcohol  percentage expenditure  Hepatitis B  Measles   ...  \
0             62     0.01               71.279624         65.0      1154  ...   
1             64     0.01               73.523582         62.0       492  ...   
2             66     0.01               73.219243    

Failed to multipart ingest runs: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')


df = missing_vals(df, "detect missing values")
                                   dtype  missing_pct  \
Life expectancy                  float64        0.003   
Adult Mortality                  float64        0.003   
Alcohol                          float64        0.066   
Hepatitis B                      float64        0.188   
 BMI                             float64        0.012   
Polio                            float64        0.006   
Total expenditure                float64        0.077   
Diphtheria                       float64        0.006   
GDP                              float64        0.152   
Population                       float64        0.222   
 thinness  1-19 years            float64        0.012   
 thinness 5-9 years              float64        0.012   
Income composition of resources  float64        0.057   
Schooling                        float64        0.055   

                                                 suggestion  
Life expectancy                    

Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')
Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')
Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')


df = missing_vals(df, "impute missing values")


Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')


Imputed 'Year' with mean
Status has low Cardinality, imputed with mode
Imputed 'Life expectancy ' with mean
'Adult Mortality' is skewed, imputed with median
'infant deaths' is skewed, imputed with median
Imputed 'Alcohol' with mean
'percentage expenditure' is skewed, imputed with median
'Hepatitis B' is skewed, imputed with median
'Measles ' is skewed, imputed with median
Imputed ' BMI ' with mean
'under-five deaths ' is skewed, imputed with median
'Polio' is skewed, imputed with median
Imputed 'Total expenditure' with mean
'Diphtheria ' is skewed, imputed with median
' HIV/AIDS' is skewed, imputed with median
'GDP' is skewed, imputed with median
'Population' is skewed, imputed with median
' thinness  1-19 years' is skewed, imputed with median
' thinness 5-9 years' is skewed, imputed with median
'Income composition of resources' is skewed, imputed with median
Imputed 'Schooling' with mean
Automatic imputation has been performed on the DataFrame. All missing values have been handled acc

Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')


print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2938 non-null   float64
 4   Adult Mortality                  2938 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2938 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2938 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2938 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Pol

Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')
Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')
Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')


df = duplicates(df, "find duplicate rows")


Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')


No exact duplicates found.
Duplicate rows have been identified. Here are the details:
 Empty DataFrame
Columns: []
Index: []

Commands:
1. View dataframe (head)
2. Output dataframe to CSV
x. Exit


Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Forbidden"}\n')


Exiting.
          Country  Year      Status  Life expectancy   Adult Mortality  \
0     Afghanistan  2015  Developing              65.0            263.0   
1     Afghanistan  2014  Developing              59.9            271.0   
2     Afghanistan  2013  Developing              59.9            268.0   
3     Afghanistan  2012  Developing              59.5            272.0   
4     Afghanistan  2011  Developing              59.2            275.0   
...           ...   ...         ...               ...              ...   
2933     Zimbabwe  2004  Developing              44.3            723.0   
2934     Zimbabwe  2003  Developing              44.5            715.0   
2935     Zimbabwe  2002  Developing              44.8             73.0   
2936     Zimbabwe  2001  Developing              45.3            686.0   
2937     Zimbabwe  2000  Developing              46.0            665.0   

      infant deaths  Alcohol  percentage expenditure  Hepatitis B  Measles   \
0                62    