In [3]:
import os
import sys
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Optional, List, Dict, Any
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from Model_Utils.feature_sampling import SamplingFactory
from Model_Utils.feature_selection_extraction import FeatureFactory
from Common_Utils import setup_logger, track_performance, CustomException, load_yaml

In [6]:
# Directory holding the six train/validation/test CSV files read below.
# Defined once so the location can be changed in a single place.
PREPROCESSED_DIR = Path("Data/raw_data/preprocessed_data")

# Feature tables, one per split.
X_train = pd.read_csv(PREPROCESSED_DIR / "X_train.csv")
X_val = pd.read_csv(PREPROCESSED_DIR / "X_val.csv")
X_test = pd.read_csv(PREPROCESSED_DIR / "X_test.csv")

# Target tables, one per split. read_csv returns DataFrames, not Series.
y_train = pd.read_csv(PREPROCESSED_DIR / "y_train.csv")
y_val = pd.read_csv(PREPROCESSED_DIR / "y_val.csv")
y_test = pd.read_csv(PREPROCESSED_DIR / "y_test.csv")

# X_* and y_* come from separate files, so verify each pair has the same number
# of rows before they are used together; a mismatch here would otherwise only
# surface later as a confusing shape error.
assert len(X_train) == len(y_train), (
    f"train split is misaligned: {len(X_train)} feature rows vs {len(y_train)} target rows"
)
assert len(X_val) == len(y_val), (
    f"val split is misaligned: {len(X_val)} feature rows vs {len(y_val)} target rows"
)
assert len(X_test) == len(y_test), (
    f"test split is misaligned: {len(X_test)} feature rows vs {len(y_test)} target rows"
)

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts that were produced by this project.
encoder_scaler = joblib.load("Tuned_Model/encoder_scaler.joblib")

# Get feature names
feature_names = encoder_scaler.get_feature_names_out()


def _encode_to_frame(features: pd.DataFrame) -> pd.DataFrame:
    """Transform ``features`` with ``encoder_scaler`` and return a labelled DataFrame.

    The result uses ``feature_names`` as column labels and keeps the row index
    of ``features``.

    If ``transform`` returns a SciPy sparse matrix (anything exposing
    ``toarray``), it is converted to a dense array first: pandas expects sparse
    matrices to go through ``DataFrame.sparse.from_spmatrix`` rather than the
    plain ``pd.DataFrame(values, columns=...)`` constructor used here. Dense
    output is passed through unchanged.
    """
    encoded = encoder_scaler.transform(features)
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()
    return pd.DataFrame(encoded, columns=feature_names, index=features.index)


# Convert to DataFrames with same index
X_train_encoded = _encode_to_frame(X_train)
X_val_encoded = _encode_to_frame(X_val)
X_test_encoded = _encode_to_frame(X_test)

# Resample the training split only; the validation and test frames are never
# passed to the sampler.
# NOTE(review): no seed is supplied here. Confirm whether SamplingFactory fixes
# one; otherwise the resampled set may differ between runs.
sampler = SamplingFactory.get_sampler("smote")
resampled_train = sampler.fit_resample(X_train_encoded, y_train)
X_train_sampled, y_train_sampled = resampled_train

# Fit the selector on the resampled training data only, then reuse the fitted
# selector unchanged for the validation and test frames.
# NOTE(review): k=20 presumably means "keep 20 features"; confirm against
# FeatureFactory.
selector = FeatureFactory.get_processor("selection", "selectkbest", k=20)
train_selected_values = selector.fit_transform(
    X_train_sampled,
    y_train_sampled.values.ravel(),  # flatten the target frame to 1-D
)

# selector.get_support() is used as a mask over the resampled frame's columns
# to recover the labels of the retained features.
selected_columns = X_train_sampled.columns[selector.get_support()].tolist()


def _as_selected_frame(values, source: pd.DataFrame) -> pd.DataFrame:
    """Label selector output with ``selected_columns`` and the row index of ``source``."""
    return pd.DataFrame(values, columns=selected_columns, index=source.index)


X_train_selected = _as_selected_frame(train_selected_values, X_train_sampled)
X_val_selected = _as_selected_frame(selector.transform(X_val_encoded), X_val_encoded)
X_test_selected = _as_selected_frame(selector.transform(X_test_encoded), X_test_encoded)

# Fit the extractor on the selected training features, then apply the fitted
# extractor to validation and test (in that order). The results are kept exactly
# as the extractor returns them; they are not wrapped in DataFrames here.
# NOTE(review): n_components=15 is hard-coded; confirm it is the intended size.
extractor = FeatureFactory.get_processor("extraction", "kernelpca", n_components=15)
X_train_extracted = extractor.fit_transform(X_train_selected)
X_val_extracted, X_test_extracted = (
    extractor.transform(split_frame)
    for split_frame in (X_val_selected, X_test_selected)
)