# Imports

In [None]:
from pathlib import Path
import sys
import yaml
import joblib
import pandas as pd
# Set each of these pandas display options to None so wide frames are not
# truncated when rendered in the notebook.
for _display_option in ("display.max_columns", "display.width", "display.max_colwidth"):
    pd.set_option(_display_option, None)

# The parent of the current working directory is treated as the project root:
# it is appended to sys.path, and SAVE_DIR points at its "saved" subfolder.
ROOT = Path.cwd().resolve().parent
sys.path.append(str(ROOT))
SAVE_DIR = ROOT.joinpath("saved")

def find_project_root(marker="requirements.txt"):
    """Return the nearest directory containing a file named *marker*.

    The search starts at the resolved current working directory and then
    checks each of its ancestors in order, closest first.

    Raises:
        FileNotFoundError: if no directory on that path contains the file.
    """
    start = Path().resolve()
    candidates = (start, *start.parents)
    found = next((folder for folder in candidates if (folder / marker).is_file()), None)
    if found is None:
        raise FileNotFoundError(f"Project root with '{marker}' not found.")
    return found
# Make the "shared" folder next to requirements.txt importable.
SHARED_ROOT = find_project_root("requirements.txt") / "shared"
sys.path.append(str(SHARED_ROOT))

# Load the project configuration. The encoding is pinned to UTF-8 so the file
# is decoded the same way on every platform instead of with the locale's
# default codec, which can mis-decode non-ASCII values.
CONFIG_PATH = ROOT / "config.yaml"
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Load data
# NOTE: joblib.load unpickles its input, so SAVE_DIR should only ever hold
# artifacts produced by this project.
X, y, X_train = (
    joblib.load(SAVE_DIR / artifact)
    for artifact in ("X.pkl", "y.pkl", "X_train.pkl")
)

from utils.utils import auto_config_from_data

# Visualizations
from visualizations import (
    plot_bar,
    plot_box,
    plot_histogram,
    plot_line,
    plot_pie,
    plot_scatter,
)

CONFIG_PATH = ROOT / "config.yaml"
# Optional step, disabled by default. Presumably it regenerates config.yaml
# from X before the file is read below -- TODO confirm against utils.utils.
# auto_config_from_data(X, CONFIG_PATH)
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of with the locale's default codec.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Convenience aliases for individual config entries.
drop_features = config["preprocessing"]["drop_features"]
label = config["general"]["label"]
value_mappings = config["preprocessing"]["value_mappings"]

from utils.utils import show_categorical_uniques, show_missing_data, cleaning_pipeline, preprocessor

# Fit the cleaning pipeline on the training split, then fit the preprocessor on
# the cleaned result and wrap its output in a DataFrame labelled with the
# preprocessor's own output feature names.
X_train_cleaned = cleaning_pipeline.fit_transform(X_train)
_transformed = preprocessor.fit_transform(X_train_cleaned)
X_train_preprocessed = pd.DataFrame(
    _transformed, columns=preprocessor.get_feature_names_out()
)

# Rename columns for EDA: drop the "remainder__" tag and abbreviate the
# "onehot__" / "ordinal__" tags. Replacements are applied in this order.
_EDA_NAME_REPLACEMENTS = (
    ("remainder__", ""),
    ("onehot__", "oh__"),
    ("ordinal__", "or__"),
)


def _eda_column_name(name):
    """Apply every replacement in order, as a chained str.replace would."""
    for old, new in _EDA_NAME_REPLACEMENTS:
        name = name.replace(old, new)
    return name


X_train_preprocessed.columns = [
    _eda_column_name(col) for col in X_train_preprocessed.columns
]

# EDA

In [None]:
# Preview the first 10 rows of the preprocessed training features
X_train_preprocessed.head(10)

In [None]:
# Print every feature name as one plain Python list
print(X_train_preprocessed.columns.tolist())

In [None]:
# show_missing_data is imported from utils.utils; presumably it reports the
# missing values in the given frame -- TODO confirm against the helper.
show_missing_data(X_train_preprocessed)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Signed pairwise correlations between all features (the sign is kept so the
# heatmap can distinguish positive from negative relationships).
corr_matrix = X_train_preprocessed.corr()

# Keep only strong correlations (|r| > 0.5) and remove self-correlations by
# masking the diagonal by position. A value test such as `abs() < 1.0` would
# also hide perfectly (anti-)correlated pairs of *different* features and
# could let a diagonal entry through if rounding left it just below 1.0.
is_strong = (corr_matrix.abs() > 0.5).to_numpy()
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
filtered_corr = (
    corr_matrix.where(is_strong & off_diagonal)
    .dropna(how='all', axis=0)
    .dropna(how='all', axis=1)
)

# Plot (skipped when no pair passes the filter, so no empty heatmap is drawn)
if filtered_corr.empty:
    print('No feature pairs with |correlation| > 0.5')
else:
    plt.figure(figsize=(10, 8))
    sns.heatmap(filtered_corr, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Strong Feature Correlations (> 0.5)')
    plt.show()

In [None]:
# plot_pie(X_train_preprocessed, 'Feature', 'Title')
# plot_histogram(X_train_preprocessed, 'Feature', 'Title', 'xLabel', 'yLabel', bins=10)
# plot_bar(X_train_preprocessed, 'Feature', 'Title', 'xLabel', 'yLabel')
# plot_line(X_train_preprocessed, 'xColumn', 'yColumn', 'Title', 'xLabel', 'yLabel')
# plot_box(X_train_preprocessed, 'Feature', 'catColumn', 'Title', 'xLabel', 'yLabel')
# plot_scatter(X_train_preprocessed, 'xColumn', 'yColumn', 'Title', 'xLabel', 'yLabel')
# X_train_preprocessed['Feature'].value_counts()