# Imports

In [None]:
from pathlib import Path
import sys
import yaml
import joblib
import pandas as pd

# Resolve the project root (the parent of the current working directory) and
# the directory holding the saved artifacts.
ROOT = Path().resolve().parent
SAVE_DIR = ROOT / "saved"

# Make the project root importable. The membership check keeps sys.path from
# accumulating duplicate entries every time this cell is re-run.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

# Load the YAML config from the project root. The path is built from ROOT so
# it is resolved the same way as SAVE_DIR, and the encoding is pinned so the
# file is read identically regardless of the platform's default locale.
with open(ROOT / "config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Extract the preprocessing settings and the label name from the config
drop_features = config["preprocessing"]["drop_features"]
label = config["general"]["label"]
value_mappings = config["preprocessing"]["value_mappings"]

# Load the persisted datasets from SAVE_DIR.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
X, y, X_train = (
    joblib.load(SAVE_DIR / filename)
    for filename in ("X.pkl", "y.pkl", "X_train.pkl")
)

# Functions
from utils.utils import (
    show_categorical_uniques,
    show_missing_data,
    cleaning_pipeline,
    preprocessor
)

# Visualizations
from shared.visualizations import (
    plot_bar,
    plot_box,
    plot_correlation_heatmap,
    plot_histogram,
    plot_line,
    plot_pie,
    plot_scatter,
)

# Clean the raw training features, then run them through the preprocessor.
X_train_cleaned = cleaning_pipeline.fit_transform(X_train)
_transformed = preprocessor.fit_transform(X_train_cleaned)

# A transformer may return a SciPy sparse matrix (e.g. for one-hot output);
# densify it so the data lines up with the column labels passed below.
if hasattr(_transformed, "toarray"):
    _transformed = _transformed.toarray()

X_train_preprocessed = pd.DataFrame(
    _transformed,
    columns=preprocessor.get_feature_names_out(),
    # Keep the cleaned data's row labels when it has them, so rows can still
    # be aligned with y; otherwise fall back to the default RangeIndex.
    index=(
        X_train_cleaned.index
        if isinstance(X_train_cleaned, (pd.DataFrame, pd.Series))
        else None
    ),
)

# EDA

In [None]:
# Preview the first 20 rows of the preprocessed training data
X_train_preprocessed.head(n=20)

In [None]:
# List the name of every column in the preprocessed training data
print(list(X_train_preprocessed.columns))

In [None]:
# See unique values for all categorical features
# (helper imported from utils.utils; which columns count as categorical and
# how the result is displayed are decided there)
show_categorical_uniques(X_train_preprocessed)

In [None]:
# Run the utils.utils missing-data helper on the preprocessed training data
# (presumably reports missing values per feature — confirm in utils.utils)
show_missing_data(X_train_preprocessed)

In [None]:
# Dummy-encode the preprocessed features, compute their pairwise correlations,
# and hand the matrix to the heatmap helper (abs=True is forwarded to it).
dummy_encoded = pd.get_dummies(X_train_preprocessed)
corr_matrix = dummy_encoded.corr()
plot_correlation_heatmap(corr_matrix, abs=True)

In [None]:
# Usage templates for the plotting helpers imported from shared.visualizations.
# Uncomment a line and replace the placeholder arguments to use it:
# plot_pie(df, 'Feature', 'Title')
# plot_histogram(df, 'Feature', 'Title', 'xLabel', 'yLabel', bins=10)
# plot_bar(df, 'Feature', 'Title', 'xLabel', 'yLabel')
# plot_line(df, 'xColumn', 'yColumn', 'Title', 'xLabel', 'yLabel')
# plot_box(df, 'Feature', 'catColumn', 'Title', 'xLabel', 'yLabel')
# plot_scatter(df, 'xColumn', 'yColumn', 'Title', 'xLabel', 'yLabel')
# df['Feature'].unique()