# Imports

In [None]:
from pathlib import Path
import sys
import yaml
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Make the project root (the parent of the notebook's working directory)
# importable so the local `utils` package resolves further down.
ROOT = Path().resolve().parent
SAVE_DIR = ROOT / "saved"
if str(ROOT) not in sys.path:  # re-running this cell must not stack duplicates
    sys.path.append(str(ROOT))

# Load the YAML config from the project root. Anchoring on ROOT keeps this
# consistent with SAVE_DIR and the sys.path entry above; the explicit encoding
# stops the platform's default codec from mangling non-ASCII values.
with open(ROOT / "config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Pull the settings this notebook reads out of the config
label = config["general"]["label"]
mode = config["general"]["mode"]
drop_features = config["preprocessing"]["drop_features"]
value_mappings = config["preprocessing"]["value_mappings"]

# Load the saved data artifacts.
# NOTE: joblib.load unpickles arbitrary objects — only point it at files that
# this project wrote itself.
X = joblib.load(SAVE_DIR / "X.pkl")
y = joblib.load(SAVE_DIR / "y.pkl")
X_train = joblib.load(SAVE_DIR / "X_train.pkl")

# Functions (project-local; must come after ROOT is on sys.path)
from utils.utils import show_categorical_uniques, show_missing_data

# Visualizations (project-local; must come after ROOT is on sys.path)
from utils.visualizations import plot_correlation_heatmap

# EDA

In [None]:
# Preview the first 10 rows of X_train (rendered as the cell's output).
X_train.head(10)

In [None]:
# Print every column name of X_train as a plain Python list
print(X_train.columns.tolist())

In [None]:
# Inspect X_train with the project helpers imported from utils.utils.
# NOTE(review): behaviour is inferred from the helper names only — presumably
# they report the unique values of each categorical feature and a summary of
# missing values; confirm against utils/utils.py.
show_categorical_uniques(X_train)
show_missing_data(X_train)

In [None]:
# Text mode only: show the most frequent non-stop-word terms in the "text" column.
if mode == "text":
    print("Text mode: showing top terms by frequency")
    # The vocabulary is capped at top_n terms, so every counted term is displayed.
    top_n = 20
    # CountVectorizer raises ValueError on NaN documents, so treat missing text
    # as an empty document and coerce everything else to str.
    docs = X_train["text"].fillna("").astype(str)
    vec = CountVectorizer(stop_words="english", max_features=top_n)
    X_vec = vec.fit_transform(docs)
    # Column sums of the document-term matrix = total occurrences of each term.
    term_freq = pd.Series(X_vec.toarray().sum(axis=0), index=vec.get_feature_names_out())
    display(term_freq.sort_values(ascending=False).head(top_n))

In [None]:
# Correlation heatmap over the one-hot-encoded features that remain after
# removing the configured drop_features.
# An empty `drop_features:` key in the YAML loads as None, and
# DataFrame.drop(columns=None) raises ValueError — fall back to dropping nothing.
# NOTE(review): in text mode a raw text column that is not listed in
# drop_features would be one-hot encoded per distinct value here — confirm
# whether this cell is meant to run in that mode.
features = X_train.drop(columns=drop_features or [])
corr_matrix = pd.get_dummies(features).corr()
plot_correlation_heatmap(corr_matrix)

In [None]:
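# Plot templates — uncomment a line and replace the placeholder arguments.
# NOTE(review): only plot_correlation_heatmap is imported in this notebook and no
# `df` is defined; import the helper you need first (presumably from
# utils.visualizations — confirm) and pass a real DataFrame such as X_train.
# plot_pie(df, 'Feature', 'Title')
# plot_histogram(df, 'Feature', 'Title', 'xLabel', 'yLabel', bins=10)
# plot_bar(df, 'Feature', 'Title', 'xLabel', 'yLabel')
# plot_line(df, 'xColumn', 'yColumn', 'Title', 'xLabel', 'yLabel')
# plot_box(df, 'Feature', 'catColumn', 'Title', 'xLabel', 'yLabel')
# plot_scatter(df, 'xColumn', 'yColumn', 'Title', 'xLabel', 'yLabel')
# df['Feature'].unique()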