In [None]:
import pandas as pd
import sys
from pathlib import Path

# Make the project's "src" directory importable so that the
# `income_predict` package can be imported further down.
# The search covers the current working directory and its two nearest
# ancestors, so the notebook works from the project root or a subfolder.
current_dir = Path.cwd()
search_roots = [current_dir, *list(current_dir.parents)[:2]]

# Only accept a real directory: a regular file that merely happens to be
# called "src" is of no use on sys.path and must not end the search early.
src_directory = next(
    (root / "src" for root in search_roots if (root / "src").is_dir()),
    None,
)

if src_directory is not None:
    # Avoid piling up duplicate entries when the cell is re-run.
    if str(src_directory) not in sys.path:
        sys.path.append(str(src_directory))
    print(f"Successfully added '{src_directory}' to sys.path")
else:
    # Best effort only: the package may still be importable if it is installed.
    print(f"Error: Could not find 'src' directory starting from {current_dir}")

from income_predict import preprocessing, cleaning

In [None]:
# Load the raw census income data from the CSV file.
# The path is relative to the notebook's current working directory.
df_raw = pd.read_csv('data/census_income.csv')

# Preview the first few rows. Leaving the frame as the cell's last expression
# gives the rich table rendering rather than the plain-text output of print().
df_raw.head()


In [None]:
# Structural overview of the raw frame: info() prints its summary directly,
# while describe() is the cell's last expression and is shown as its output.
df_raw.info()
df_raw.describe()

In [None]:
# 1. Describe your data.
# Delegates to the project helper in income_predict.preprocessing; what it
# reports is defined there and is not visible in this notebook.
preprocessing.get_data_description(df_raw)

In [None]:
# 2. What is the distribution of the target variable?
# Runs the project helper on the raw frame, passing 'income' as the name of
# the target column.
preprocessing.get_target_distribution(df_raw, 'income')


In [None]:
# 3. Do we face outliers and missing values?
# Runs the project helper on the raw (uncleaned) frame.
# NOTE(review): presumably this covers missing values as well as outliers,
# as the question suggests — confirm against income_predict.preprocessing.
preprocessing.get_outliers_summary(df_raw)

In [None]:
# Cleaning pipeline. As described here, cleaning.full_clean is expected to:
# - clean column names (e.g. 'capital-gain' -> 'capital_gain')
# - remove redundant columns
# - clean and binarize the income data (1 for '>50K', 0 for '<=50K')
# - replace '?' with NaN
# - trim whitespace from all str values
# NOTE(review): these steps live in income_predict.cleaning and are not
# visible in this notebook — confirm the list still matches the implementation.
df_clean = cleaning.full_clean(df_raw)
# Show the resulting column names so the renaming/removal can be checked.
df_clean.columns.tolist()

In [None]:
# 4. How do specific features correlate with the target variable?
# Runs on the cleaned frame, with 'high_income' as the target column.
# NOTE(review): the raw frame's target is called 'income'; presumably
# full_clean produces the 'high_income' column — confirm.
preprocessing.get_feature_correlations(df_clean, 'high_income')