In [1]:
import kagglehub
import os
import shutil

import sys
from pathlib import Path

# Kaggle dataset handle, needed only when the raw data is not yet on the persistent volume.
# A previous run of this notebook used "uciml/red-wine-quality-cortez-et-al-2009"
# (with local_dataset_name = dataset_id.split('/')[-1]).
# NOTE(review): no handle for the space-titanic data is recorded in this notebook —
# set it here before running against an empty volume.
dataset_id = None

local_dataset_name = "space-titanic"

base_data_dir = Path("/home/jovyan/data") # Standard for jovyan user
dataset_specific_base_path = base_data_dir / local_dataset_name
raw_data_target_dir = dataset_specific_base_path / "raw"
print(f"Target dataset base path: {dataset_specific_base_path}")
os.makedirs(raw_data_target_dir, exist_ok=True)

# Kept for any later cell that may read it; same value as the original hard-coded f-string.
destination_path = str(dataset_specific_base_path)

# raw/ was created just above, so testing for its existence (or for a non-empty parent
# directory) is always true. What decides whether a download is needed is whether raw/
# actually contains anything.
if any(raw_data_target_dir.iterdir()):
    print(f"Raw dataset already exists at {raw_data_target_dir}")
    print("Skipping download...")
else:
    if dataset_id is None:
        # Fail with an actionable message instead of a NameError on an undefined handle.
        raise ValueError(
            f"No raw data found in {raw_data_target_dir} and dataset_id is not set; "
            "set dataset_id to the Kaggle handle to download."
        )
    print(f"Dataset not found locally. Downloading {dataset_id}...")
    download_path = kagglehub.dataset_download(dataset_id)
    print(f"Dataset downloaded to temporary path in container: {download_path}")
    print(f"Copying dataset to persistent raw data volume: {raw_data_target_dir}")
    # Recursively copy files and sub-directories into the (already existing) raw/ directory.
    shutil.copytree(download_path, raw_data_target_dir, symlinks=False, dirs_exist_ok=True)

    print(f"Dataset '{dataset_id}' successfully copied to {raw_data_target_dir} in shared volume.")
print(f"Raw dataset in persistent volume at: {raw_data_target_dir}")


Target dataset base path: /home/jovyan/data/space-titanic
Raw dataset already exists at /home/jovyan/data/space-titanic/raw
Skipping download...
Raw dataset in persistent volume at: /home/jovyan/data/space-titanic/raw


In [2]:
# --- Cell 2: Import Libraries and Load Data ---

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import os
import cProfile # For basic profiling
import pstats # For processing profiling results
import io # For capturing profiling output

def reload_utils():
    """Import the project helper modules, reload each one, then print a confirmation line.

    Takes no arguments and returns None.

    NOTE(review): the ETL cell of this notebook imports from `etl_chain`, which is
    not in the list below — confirm whether it should be reloaded as well.
    """
    import importlib

    module_names = (
        "utils.etl",
        "red_wine_quality.etl_chain",
        "utils.eda",
        "utils.eval",
        "utils.submission",
    )
    # Import everything first, then reload in the same order.
    loaded_modules = [importlib.import_module(name) for name in module_names]
    for module in loaded_modules:
        importlib.reload(module)
    print("🔁 Reloaded red_wine_quality.etl_chain, utils.etl, utils.eda, utils.eval, utils.submission")

# Fail early with a clear message. os.walk() ignores listing errors by default, so a
# try/except FileNotFoundError wrapped around it never fires for a missing directory.
if not os.path.isdir(raw_data_target_dir):
    raise FileNotFoundError(
        f"Error: Directory not found: {raw_data_target_dir}. Please ensure the dataset was downloaded and copied correctly."
    )

# Print the directory tree, indenting four spaces per nesting level below raw/.
print(f"Listing files in {raw_data_target_dir}:")
for root, _dirs, files in os.walk(raw_data_target_dir):
    level = root.replace(str(raw_data_target_dir), '').count(os.sep)
    indent = ' ' * 4 * (level)
    print(f'{indent}{os.path.basename(root)}/')
    subindent = ' ' * 4 * (level + 1)
    for f in files:
        print(f'{subindent}{f}')

# os.listdir() returns entries in arbitrary order; sort so that "the first one" is
# the same file on every run and every machine.
csv_files = sorted(f for f in os.listdir(raw_data_target_dir) if f.endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {raw_data_target_dir}")
if len(csv_files) > 1:
    print(f"Warning: Multiple CSV files found. Using the first one: {csv_files[0]}")

# Load the data
data_file_path = os.path.join(raw_data_target_dir, csv_files[0])
print(f"\nLoading data from: {data_file_path}")
df = pd.read_csv(data_file_path)

print("\nAvailable columns in the RAW dataset:")
for i, col in enumerate(df.columns):
    print(f"{i}: {col}")



Listing files in /home/jovyan/data/space-titanic/raw:
raw/
    train.csv

Loading data from: /home/jovyan/data/space-titanic/raw/train.csv

Available columns in the RAW dataset:
0: PassengerId
1: HomePlanet
2: CryoSleep
3: Cabin
4: Destination
5: Age
6: VIP
7: RoomService
8: FoodCourt
9: ShoppingMall
10: Spa
11: VRDeck
12: Name
13: Transported


In [3]:
# Scratch inspection commands, kept commented out; only `df.shape` below is evaluated.
# df.describe()
# df.columns
# df.dtypes
# df.isna().sum()
# df.loc[df.duplicated()]
# df.duplicated().sum()
# NOTE(review): the 'citric acid' snippets below name a column that does not appear in
# this dataset's column listing (PassengerId ... Transported) — presumably left over
# from a different dataset; confirm before re-enabling.
# df.loc[df.duplicated(subset=['citric acid'])].head(5)
# df = df.loc[~df.duplicated(subset=['citric acid'])] \
#     .reset_index(drop=True).copy()
df.shape
# df.head()
# df["HomePlanet"].unique()

(8693, 14)

In [4]:
# Report how many rows would survive a full dropna(), without modifying df.
n_before = df.shape[0]
n_after = df.dropna().shape[0]
# Dropped rows as a positive count (the previous n_after - n_before printed a negative number).
n_dropped = n_before - n_after
# Guard the percentage against an empty frame.
pct_remaining = 100 * n_after / n_before if n_before else 0.0
print(f"🧮 {n_after}/{n_before} rows would remain after dropna removes {n_dropped} rows ({pct_remaining:.2f}%)")

🧮 6606/8693 rows would remain after -2087 rows dropna (75.99%)


In [5]:
# === setting pred target ===
# Make the parent of the current working directory importable so `utils` resolves.
# Guarded so that re-running this cell does not keep appending duplicate entries.
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)
from utils.etl import get_raw_dataset, transform_raw_dataframe
import numpy as np
target_column = "Transported"
# Per-amenity billing columns handed to the raw transform.
service_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
# X is rebuilt from the raw dataset on every run, so overwriting it below stays idempotent.
X, y = get_raw_dataset(dataset_name=local_dataset_name, target_column=target_column, drop_na=True)
X = transform_raw_dataframe(df=X, service_cols=service_cols)
# === Task Type Inference ===
def detect_task_type(y, multiclass_threshold=15):
    """Guess the ML task type from a target series.

    Args:
        y: pandas Series holding the target values.
        multiclass_threshold: numeric targets with more than 2 but fewer than
            this many distinct values are treated as multiclass; at or above
            it they are treated as regression. The default of 15 is the
            arbitrary cut-off this notebook has always used.

    Returns:
        One of "binary_classification", "multiclass_classification",
        "regression", "multilabel_classification" or "unknown".
    """
    if pd.api.types.is_numeric_dtype(y):
        # Count distinct values once; the decision depends only on this number.
        n_unique = y.nunique()
        if n_unique <= 2:
            return "binary_classification"
        if n_unique < multiclass_threshold:
            return "multiclass_classification"
        return "regression"
    # Non-numeric target: multilabel if any entry is itself a list or set of labels.
    if y.apply(lambda value: isinstance(value, (list, set))).any():
        return "multilabel_classification"
    # Everything else (e.g. plain string labels) is not classified by this helper.
    return "unknown"
task_type = detect_task_type(y)

print(f"🧠 Target column: {target_column}")
# Report the dtype of `y`, the series that was actually classified above, rather than
# the column of the separately loaded raw frame `df`.
print(f"📊 Target dtype: {y.dtype}")
print(f"✅ Inferred task type: {task_type}")

[PosixPath('/home/jovyan/data/space-titanic/raw/train.csv')]
Dataset loaded successfully. Only one file loaded
🧠 Target column: Transported
📊 Target dtype: bool
✅ Inferred task type: binary_classification


In [6]:
import sklearn.metrics

# Key into metric_lookup below; one of its dictionary keys.
perf_eval_metric = "accuracy"


def _weighted_average(score_fn):
    """Wrap a scorer so it is always called with average="weighted"."""
    def _scorer(y_true, y_pred):
        return score_fn(y_true, y_pred, average="weighted")
    return _scorer


# Short key -> (display name, callable taking (y_true, y_pred)).
metric_lookup = {
    "accuracy": ("accuracy", sklearn.metrics.accuracy_score),
    "f1": ("f1_score", _weighted_average(sklearn.metrics.f1_score)),
    "precision": ("precision", _weighted_average(sklearn.metrics.precision_score)),
    "recall": ("recall", _weighted_average(sklearn.metrics.recall_score)),
    "mse": ("mean_squared_error", sklearn.metrics.mean_squared_error),
    "mae": ("mean_absolute_error", sklearn.metrics.mean_absolute_error),
}

# Resolve the configured key, rejecting anything that is not in the table.
selected_metric = metric_lookup.get(perf_eval_metric)
if selected_metric is None:
    raise ValueError(f"Unsupported metric: {perf_eval_metric}")

eval_metric_name, eval_metric_fn = selected_metric
print(f"✅ Evaluation metric set to: {eval_metric_name}")


✅ Evaluation metric set to: accuracy


In [None]:
# === Domain knowledge ===
# Any semantic/description of features?
# PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
# HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
# CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
# Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
# Destination - The planet the passenger will be debarking to.
# Age - The age of the passenger.
# VIP - Whether the passenger has paid for special VIP service during the voyage.
# RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
# Name - The first and last names of the passenger.
# Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [None]:
# from utils.eda import eda_vis
# # === prelim EDA on raw dataset ===
# eda_vis(X, y, task_type)
# reload_utils()
# from utils.eda import feature_eda_vis
# # === deep EDA for features on ETL-applied dataset ===
# feature_eda_vis(X, y, task_type)

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# import numpy as np
# from scipy import stats
# from statsmodels.stats.proportion import proportion_confint

# # Assume X and y are pandas DataFrames/Series as described by the user.
# # For demonstration, let's create some sample data if X and y are not defined
# # (Remove or comment this out if you have your actual X and y)
# if 'X' not in locals() or 'y' not in locals():
#     print("Generating sample X and y for demonstration purposes...")
#     n_samples = 200
#     X_data = {
#         'HomePlanet': np.random.choice(['Earth', 'Europa', 'Mars', 'PlanetX', 'PlanetY'], size=n_samples), # Added more categories
#         'CryoSleep': np.random.choice([True, False, np.nan], size=n_samples, p=[0.25, 0.7, 0.05]), # Added NaNs
#         'Age': np.random.normal(loc=30, scale=10, size=n_samples).clip(0, 80),
#         'RoomService': np.random.exponential(scale=100, size=n_samples).clip(0, 2000) * np.random.choice([0,1, np.nan], size=n_samples, p=[0.35,0.55, 0.1]), # Some zeros and NaNs
#         'VIP': np.random.choice([True, False, np.nan], size=n_samples, p=[0.05, 0.85, 0.1]) # With NaNs
#     }
#     X = pd.DataFrame(X_data)
#     X['Age'] = X['Age'].astype(float).fillna(X['Age'].median()).astype(int) # Handle potential NaNs from clip then fill
#     X['RoomService'] = X['RoomService'].astype(float)
    
#     # Simulate some dependency for y
#     y_score = X['Age'] * -0.1 + X['RoomService'].fillna(0) * 0.01 + \
#               (X['HomePlanet'] == 'Europa').astype(int) * 20 + \
#               X['CryoSleep'].fillna(False).astype(int) * 30 # Handle NaNs in CryoSleep for scoring
#     y_prob = 1 / (1 + np.exp(- (y_score - y_score.mean()) / y_score.std() )) # Sigmoid
#     y = pd.Series(np.random.binomial(1, y_prob, size=n_samples).astype(bool), name='Transported')


# # --- Data Preparation ---
# y_named = y.copy()
# if not hasattr(y_named, 'name') or y_named.name is None:
#     y_named.name = 'Transported' # Default name if y has no name

# df_combined = X.copy()
# df_combined['Transported_numeric'] = y_named.astype(int)

# CATEGORICAL_THRESHOLD = 20 
# palette = {0: 'skyblue', 1: 'salmon'} 
# legend_labels = {0: 'Not Transported', 1: 'Transported'}
# feature_columns = X.columns

# # --- Main Loop for Plotting ---
# for feature_col in feature_columns:
#     print(f"--- Analyzing Feature: {feature_col} ---")
    
#     fig, axes = plt.subplots(2, 3, figsize=(22, 13)) # Adjusted figsize slightly for better label spacing
#     fig.suptitle(f'Comprehensive Analysis: {feature_col} | Target: {y_named.name}', fontsize=18, y=0.99) # Adjusted y for suptitle
    
#     # --- Row 1: Distribution Visualizations ---
#     ax_1_1 = axes[0, 0]
#     try:
#         plot_data_1_1 = df_combined[[feature_col, 'Transported_numeric']].dropna(subset=[feature_col])
#         if not plot_data_1_1.empty and plot_data_1_1[feature_col].nunique() > 0 :
#             sns.stripplot(data=plot_data_1_1, x=feature_col, y='Transported_numeric', hue='Transported_numeric',
#                           jitter=0.25, dodge=True, ax=ax_1_1, palette=palette, legend=False, alpha=0.6)
#             ax_1_1.set_title('Point Distribution by Target')
#             ax_1_1.set_ylabel(f'{y_named.name} (0=F, 1=T)')
#             ax_1_1.set_yticks([0, 1])
#             ax_1_1.set_yticklabels(['False', 'True'])
#             ax_1_1.set_xlabel(feature_col)
#             handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=palette[i], markersize=10) for i in palette]
#             ax_1_1.legend(handles, [legend_labels[i] for i in palette], title=y_named.name, loc='best')
            
#             # Rotate x-axis labels if needed
#             if plot_data_1_1[feature_col].nunique() > 5 and plot_data_1_1[feature_col].dtype == 'object': # More categories
#                  ax_1_1.tick_params(axis='x', rotation=45, labelbottom=True)
#                  plt.setp(ax_1_1.get_xticklabels(), ha="right", rotation_mode="anchor")
#             elif plot_data_1_1[feature_col].nunique() > 10 and plot_data_1_1[feature_col].dtype != 'object': # Numerical with many ticks
#                  ax_1_1.tick_params(axis='x', rotation=30, labelbottom=True)
#                  plt.setp(ax_1_1.get_xticklabels(), ha="right", rotation_mode="anchor")
#             else: # Fewer categories or numerical, no rotation or default handling
#                  ax_1_1.tick_params(axis='x', labelbottom=True)

#         else:
#             ax_1_1.text(0.5, 0.5, "No data or no variance\nafter NaN drop", ha='center', va='center', transform=ax_1_1.transAxes)
#             ax_1_1.set_title('Point Distribution (No Data)')
#     except Exception as e:
#         ax_1_1.set_title('Point Distribution (Error)')
#         ax_1_1.text(0.5, 0.5, f"Plot failed: {e}", ha='center', va='center', transform=ax_1_1.transAxes, wrap=True)
#         print(f"  Error in Plot 1.1 for {feature_col}: {e}")

#     ax_1_2 = axes[0, 1]
#     try:
#         plot_data_1_2 = df_combined[[feature_col, 'Transported_numeric']].dropna(subset=[feature_col])
#         if not plot_data_1_2.empty and plot_data_1_2[feature_col].nunique() > 0:
#             sns.histplot(data=plot_data_1_2, x=feature_col, hue='Transported_numeric', 
#                          multiple='layer', kde=False, ax=ax_1_2, palette=palette, 
#                          stat="density", common_norm=False, alpha=0.6, legend=True)
#             ax_1_2.set_title('Normalized Histogram by Target')
#             ax_1_2.set_xlabel(feature_col)
#             handles_hist, labels_hist = ax_1_2.get_legend_handles_labels()
#             try: 
#                 labels_hist_descriptive = [legend_labels[int(float(l))] for l in labels_hist]
#                 ax_1_2.legend(handles_hist, labels_hist_descriptive, title=y_named.name)
#             except (ValueError, KeyError): 
#                  ax_1_2.legend(title=y_named.name)
#         else:
#             ax_1_2.text(0.5, 0.5, "No data or no variance\nafter NaN drop", ha='center', va='center', transform=ax_1_2.transAxes)
#             ax_1_2.set_title('Histogram (No Data)')
#     except Exception as e:
#         ax_1_2.set_title('Histogram (Error)')
#         ax_1_2.text(0.5, 0.5, f"Plot failed: {e}", ha='center', va='center', transform=ax_1_2.transAxes, wrap=True)
#         print(f"  Error in Plot 1.2 for {feature_col}: {e}")

#     ax_1_3 = axes[0, 2]
#     try:
#         temp_df_combined = df_combined.dropna(subset=[feature_col]) 
#         if not temp_df_combined.empty and temp_df_combined[feature_col].nunique() > 0:
#             is_categorical_like = temp_df_combined[feature_col].dtype == 'object' or \
#                                   temp_df_combined[feature_col].nunique() < CATEGORICAL_THRESHOLD
            
#             target_0_data = temp_df_combined[temp_df_combined['Transported_numeric'] == 0][feature_col]
#             target_1_data = temp_df_combined[temp_df_combined['Transported_numeric'] == 1][feature_col]

#             if not target_0_data.empty and not target_1_data.empty:
#                 if is_categorical_like:
#                     props_0 = target_0_data.value_counts(normalize=True)
#                     props_1 = target_1_data.value_counts(normalize=True)
#                     all_categories = sorted(list(set(props_0.index) | set(props_1.index)))
#                     props_0 = props_0.reindex(all_categories, fill_value=0)
#                     props_1 = props_1.reindex(all_categories, fill_value=0)
#                     diff_props = props_1 - props_0
#                     diff_props.plot(kind='bar', ax=ax_1_3, color=['tomato' if x < 0 else 'mediumseagreen' for x in diff_props.values]) # Removed rot here
#                     ax_1_3.tick_params(axis='x', rotation=45) # Apply rotation
#                     plt.setp(ax_1_3.get_xticklabels(), ha='right', rotation_mode='anchor') # Ensure alignment
#                     ax_1_3.set_ylabel('Prop(Y=1) - Prop(Y=0)')
#                 else: 
#                     min_val = min(target_0_data.min(), target_1_data.min())
#                     max_val = max(target_0_data.max(), target_1_data.max())
#                     if pd.isna(min_val) or pd.isna(max_val) or min_val == max_val: # Handle NaN or single value case
#                         bins = np.array([temp_df_combined[feature_col].min() - 0.5, temp_df_combined[feature_col].max() + 0.5]) if temp_df_combined[feature_col].nunique() > 0 else np.array([0,1])
#                     else: 
#                         bins = np.linspace(min_val, max_val, 11)
#                     hist_0, _ = np.histogram(target_0_data.dropna(), bins=bins, density=True) # dropna within histogram
#                     hist_1, _ = np.histogram(target_1_data.dropna(), bins=bins, density=True) # dropna within histogram
#                     diff_hist = hist_1 - hist_0
#                     bin_centers = (bins[:-1] + bins[1:]) / 2
#                     bar_width = bins[1] - bins[0] if len(bins) > 1 else 1
#                     ax_1_3.bar(bin_centers, diff_hist, width=bar_width * 0.9, 
#                                 color=['tomato' if x < 0 else 'mediumseagreen' for x in diff_hist])
#                     ax_1_3.set_ylabel('Density(Y=1) - Density(Y=0)')
#                 ax_1_3.axhline(0, color='black', lw=0.8, linestyle='--')
#                 ax_1_3.set_title('Outcome Difference by Feature Value')
#                 ax_1_3.set_xlabel(feature_col)
#             else:
#                 ax_1_3.text(0.5, 0.5, "Not enough data in one/both target groups", ha='center', va='center', transform=ax_1_3.transAxes)
#                 ax_1_3.set_title('Outcome Difference (Not enough data)')
#         else:
#             ax_1_3.text(0.5, 0.5, "No data or no variance\nafter NaN drop", ha='center', va='center', transform=ax_1_3.transAxes)
#             ax_1_3.set_title('Outcome Difference (No Data)')
#     except Exception as e:
#         ax_1_3.set_title('Outcome Difference (Error)')
#         ax_1_3.text(0.5, 0.5, f"Plot failed: {e}", ha='center', va='center', transform=ax_1_3.transAxes, wrap=True)
#         print(f"  Error in Plot 1.3 for {feature_col}: {e}")

#     # --- Row 2: Statistical Significance ---
#     ax_2_1 = axes[1, 0] 
#     ax_2_2 = axes[1, 1] 
#     ax_2_3 = axes[1, 2] 

#     for ax_text in [ax_2_2, ax_2_3]:
#         ax_text.clear()
#         ax_text.axis('off')

#     current_feature_data = df_combined[[feature_col, 'Transported_numeric']].dropna(subset=[feature_col])
#     if current_feature_data.empty or current_feature_data[feature_col].nunique() == 0: # Also check for no variance
#         ax_2_1.text(0.5, 0.5, "No data or no variance\nfor stats after NaN drop", ha='center', va='center', transform=ax_2_1.transAxes)
#         ax_2_1.set_title('Stats (No Data)')
#         ax_2_2.text(0.05, 0.9, "No data or no variance for stats after NaN drop.", fontsize=9, va='top', wrap=True)
#         ax_2_3.text(0.05, 0.9, "No data or no variance for stats after NaN drop.", fontsize=9, va='top', wrap=True)
#         fig.subplots_adjust(hspace=0.6, wspace=0.35, top=0.93, bottom=0.08, left=0.05, right=0.97)
#         plt.show()
#         continue 

#     is_categorical_feature = current_feature_data[feature_col].dtype == 'object' or \
#                              current_feature_data[feature_col].nunique() < CATEGORICAL_THRESHOLD
    
#     stats_summary_text = ""
#     interpretation_text = ""

#     if is_categorical_feature:
#         ax_2_1.set_title(f'Proportion Transported by {feature_col}\n(with 95% CIs)')
#         # Ensure feature_col is treated as string for crosstab if it's not already object (e.g. boolean, int categories)
#         contingency_table = pd.crosstab(current_feature_data[feature_col].astype(str), current_feature_data['Transported_numeric'])
        
#         if contingency_table.shape[0] < 1 or contingency_table.shape[1] < 2 : # Need at least 1 category and 2 outcome classes
#             stats_summary_text += "Chi-squared test not applicable (table too small or one outcome class missing).\n"
#             ax_2_1.text(0.5,0.5, "Too few categories or\noutcomes for plot/test", ha='center', va='center', transform=ax_2_1.transAxes)
#         elif contingency_table.shape[0] < 2: # Need at least 2 categories for chi2
#              stats_summary_text += "Chi-squared test not applicable (needs at least 2 categories).\n"
#              ax_2_1.text(0.5,0.5, "Needs at least 2 categories for Chi2 test", ha='center', va='center', transform=ax_2_1.transAxes)
#         else:
#             chi2, p_chi2, dof, expected = stats.chi2_contingency(contingency_table)
#             stats_summary_text += f"Chi-squared Test of Independence:\n"
#             stats_summary_text += f"  Chi2 Stat: {chi2:.2f}, P-value: {p_chi2:.3g}\n  DOF: {dof}\n"
#             interpretation_text += f"P-value ({p_chi2:.3g}) for Chi-squared test: "
#             interpretation_text += "Suggests " + ("a significant" if p_chi2 < 0.05 else "no significant") + \
#                                    f" association between {feature_col} and {y_named.name}.\n\n"

#             categories = contingency_table.index
#             proportions_transported = []
#             ci_lows = []
#             ci_highs = []
            
#             for cat in categories:
#                 count_transported = contingency_table.loc[cat, 1] if 1 in contingency_table.columns else 0
#                 count_not_transported = contingency_table.loc[cat, 0] if 0 in contingency_table.columns else 0
#                 n_obs_cat = count_transported + count_not_transported
#                 if n_obs_cat > 0:
#                     prop = count_transported / n_obs_cat
#                     low, high = proportion_confint(count_transported, n_obs_cat, method='wilson')
#                     proportions_transported.append(prop)
#                     ci_lows.append(low)
#                     ci_highs.append(high)
#                 else: 
#                     proportions_transported.append(0)
#                     ci_lows.append(0)
#                     ci_highs.append(0)

#             prop_df = pd.DataFrame({
#                 'category': categories, # Already strings due to .astype(str) in crosstab
#                 'proportion_transported': proportions_transported,
#                 'ci_low': ci_lows,
#                 'ci_high': ci_highs
#             })
            
#             ax_2_1.bar(prop_df['category'], prop_df['proportion_transported'], 
#                        yerr=[prop_df['proportion_transported'] - prop_df['ci_low'], prop_df['ci_high'] - prop_df['proportion_transported']],
#                        capsize=5, color='mediumseagreen', alpha=0.7)
#             ax_2_1.set_ylabel(f'Proportion {y_named.name}')
#             ax_2_1.tick_params(axis='x', rotation=45) # Corrected: Apply rotation
#             plt.setp(ax_2_1.get_xticklabels(), ha='right', rotation_mode='anchor') # Corrected: Ensure alignment
#             ax_2_1.axhline(current_feature_data['Transported_numeric'].mean(), color='grey', linestyle='--', label='Overall Mean')
#             if not prop_df.empty: # Only add legend if there's data to plot
#                 ax_2_1.legend(loc='best')

#             interpretation_text += "Error bars on plot show 95% CIs for proportion transported. "
#             interpretation_text += "If CIs for different categories don't overlap much, "
#             interpretation_text += "it suggests a significant difference in transport rates.\n"

#     else: # Numerical feature
#         group0 = current_feature_data[current_feature_data['Transported_numeric'] == 0][feature_col].dropna() # Ensure NaNs are out for tests
#         group1 = current_feature_data[current_feature_data['Transported_numeric'] == 1][feature_col].dropna() # Ensure NaNs are out for tests

#         ax_2_1.set_title(f'{feature_col} Distribution by Target')
#         # For boxplot, ensure data passed has NaNs handled if seaborn version is older
#         sns.boxplot(x='Transported_numeric', y=feature_col, data=current_feature_data.dropna(subset=[feature_col]), 
#                     ax=ax_2_1, palette=palette, hue='Transported_numeric', legend=False)
#         ax_2_1.set_xticklabels([legend_labels[0], legend_labels[1]])
#         ax_2_1.set_xlabel(y_named.name)

#         stats_summary_text += "Normality (Shapiro-Wilk):\n"
#         norm_p0, norm_p1 = -1.0, -1.0 # Initialize as float
#         if len(group0) >=3 : 
#             shapiro_stat0, norm_p0 = stats.shapiro(group0)
#             stats_summary_text += f"  Group 0 (Not Transported): p={norm_p0:.3g}\n"
#         else: stats_summary_text += "  Group 0: Too few samples for normality test.\n"
#         if len(group1) >=3 :
#             shapiro_stat1, norm_p1 = stats.shapiro(group1)
#             stats_summary_text += f"  Group 1 (Transported): p={norm_p1:.3g}\n"
#         else: stats_summary_text += "  Group 1: Too few samples for normality test.\n"
        
#         # Use the t-test only when each group either passes Shapiro-Wilk (p > 0.05) or has < 3 samples (normality not testable); otherwise fall back to Mann-Whitney U
#         use_ttest = (norm_p0 > 0.05 or len(group0) < 3) and \
#                     (norm_p1 > 0.05 or len(group1) < 3)

#         if use_ttest and len(group0)>1 and len(group1)>1:
#             levene_stat, levene_p = stats.levene(group0, group1)
#             stats_summary_text += f"Homogeneity of Variances (Levene's test): p={levene_p:.3g}\n"
#             equal_var = levene_p > 0.05

#             t_stat, p_ttest = stats.ttest_ind(group0, group1, equal_var=equal_var) # nan_policy='omit' is default in newer scipy
#             stats_summary_text += f"Independent T-test (equal_var={equal_var}):\n"
#             stats_summary_text += f"  T-statistic: {t_stat:.2f}, P-value: {p_ttest:.3g}\n"
#             interpretation_text += f"P-value ({p_ttest:.3g}) from t-test: "
#             interpretation_text += "Suggests " + ("a significant" if p_ttest < 0.05 else "no significant") + \
#                                    f" difference in mean {feature_col} between groups.\n"
#             ax_2_1.text(0.5, 0.95, f"T-test p-value: {p_ttest:.3g}", ha='center', va='top', transform=ax_2_1.transAxes, fontsize=9, color='red', bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

#         elif len(group0)>0 and len(group1)>0: 
#             try:
#                 u_stat, p_mannwhitney = stats.mannwhitneyu(group0, group1, alternative='two-sided') # nan_policy='omit' is default
#                 stats_summary_text += f"Mann-Whitney U Test:\n"
#                 stats_summary_text += f"  U-statistic: {u_stat:.0f}, P-value: {p_mannwhitney:.3g}\n"
#                 interpretation_text += f"P-value ({p_mannwhitney:.3g}) from Mann-Whitney U test: "
#                 interpretation_text += "Suggests " + ("a significant" if p_mannwhitney < 0.05 else "no significant") + \
#                                    f" difference in distributions of {feature_col} between groups.\n"
#                 ax_2_1.text(0.5, 0.95, f"Mann-Whitney p: {p_mannwhitney:.3g}", ha='center', va='top', transform=ax_2_1.transAxes, fontsize=9, color='red', bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))
#             except ValueError as e_mw: 
#                  stats_summary_text += f"Mann-Whitney U Test: Error - {e_mw}\n"
#                  interpretation_text += "Mann-Whitney U test could not be performed (e.g., identical data in groups).\n"
#         else:
#             stats_summary_text += "Not enough data in one or both groups for numerical tests.\n"
#             interpretation_text += "Not enough data to compare groups statistically.\n"

#     interpretation_text += "\nBayesian Perspective:\nA Bayesian approach could provide posterior distributions for parameters "
#     interpretation_text += "(e.g., difference in means/proportions). This offers a richer view of uncertainty "
#     interpretation_text += "and allows direct probability statements about the effect size, rather than just a p-value."

#     ax_2_2.text(0.01, 0.98, stats_summary_text, fontsize=8, va='top', ha='left', wrap=True, family='monospace') # Reduced font size
#     ax_2_2.set_title("Statistical Test Details", fontsize=10)
    
#     ax_2_3.text(0.01, 0.98, interpretation_text, fontsize=8, va='top', ha='left', wrap=True) # Reduced font size
#     ax_2_3.set_title("Interpretation & Bayesian Note", fontsize=10)

#     fig.subplots_adjust(hspace=0.7, wspace=0.4, top=0.93, bottom=0.12, left=0.06, right=0.97) # Adjusted spacing
#     plt.show()

# print("--- Finished generating all plots. ---")


In [7]:
reload_utils()
import pandas as pd
from etl_chain import run_custom_etl_streaming
# --- 1. Run the ETL pipeline from etl_chain.py ---
print(f"Starting ETL process for dataset: {local_dataset_name} with target: {target_column}")
try:
    etl_result = run_custom_etl_streaming(dataset_name=local_dataset_name, target_column=target_column)
    X_train_unprocessed = etl_result.get("X_train_unprocessed")
    X_train = etl_result.get("X_train")
    y_train = etl_result.get("y_train")
    X_val = etl_result.get("X_val")
    y_val = etl_result.get("y_val")    
except Exception as e:
    print(f"An error occurred during the ETL process: {e}")
    raise

from utils.etl import validate_X_schema, validate_y_schema
validate_X_schema(X_train)
validate_X_schema(X_val)
validate_y_schema(y_train)
validate_y_schema(y_val)
# X_train.isna().sum()
print(X_train_unprocessed.shape)
print(X_train.shape)
print(X_val.shape)

from utils.eval import evaluate_feature_feedback

# Settings for the feature-feedback pass on the train/validation split.
feedback_options = dict(
    # Threshold auto-tuning is OFF here, so drop_threshold below is used as given.
    # The original note suggests True enables a random search — TODO confirm,
    # and confirm whether n_trials is only consulted in that mode.
    auto_tune_threshold=False,
    drop_threshold=0.045,
    n_trials=30,
    auto_drop=True,
)
feedback = evaluate_feature_feedback(X_train, y_train, X_val, y_val, **feedback_options)

# The cleaned frames from the feedback result are intentionally NOT adopted;
# uncomment to switch the downstream cells over to them.
# X_train = feedback["X_train_clean"]
# X_val = feedback["X_val_clean"]


🔁 Reloaded red_wine_quality.etl_chain, utils.etl, utils.eda, utils.eval, utils.submission
Starting ETL process for dataset: space-titanic with target: Transported
Attempting to load from: /home/jovyan/data/space-titanic/raw/train.csv
Dataset loaded successfully from /home/jovyan/data/space-titanic/raw/train.csv. Shape: (8693, 14)
Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')
--- Fitting pipeline on Training Data ---
Fitting step in chain: RawTransformer


  X_df[col] = X_df[col].fillna(False).astype(bool)
  X_df[col] = X_df[col].fillna(False).astype(bool)


Fitting step in chain: KNNImputerComponent
Fitting step in chain: NameWordFeatures
Fitting step in chain: FeatureGenerators
Fitting step in chain: ServiceFeaturesComponent

--- Transforming Training Data (Streaming) ---
STREAM_MSG (Train): [ETLChain] Status: starting, Msg: Transformation starting., Progress: N/A
STREAM_MSG (Train): [ETLChain] Status: starting, Msg: Chain transformation starting., Progress: N/A
STREAM_MSG (Train): [ETLChain] Status: in_progress, Msg: Starting sub-step: RawTransformer (1/5), Progress: N/A
STREAM_MSG (Train): [RawTransformer] Status: starting, Msg: [Chain -> RawTransformer] Transformation starting., Progress: N/A
STREAM_MSG (Train): [RawTransformer] Status: in_progress, Msg: [Chain -> RawTransformer] Starting raw transformations., Progress: N/A


  X_df[col] = X_df[col].fillna(False).astype(bool)
  X_df[col] = X_df[col].fillna(False).astype(bool)


STREAM_MSG (Train): [RawTransformer] Status: in_progress, Msg: [Chain -> RawTransformer] Raw transformations applied., Progress: 1.0
STREAM_MSG (Train): [RawTransformer] Status: completed, Msg: [Chain -> RawTransformer] Transformation complete., Progress: N/A
STREAM_MSG (Train): [ETLChain] Status: in_progress, Msg: Completed sub-step: RawTransformer. Shape after: (6954, 16), Progress: N/A
STREAM_MSG (Train): [ETLChain] Status: in_progress, Msg: Starting sub-step: KNNImputerComponent (2/5), Progress: N/A
STREAM_MSG (Train): [KNNImputerComponent] Status: starting, Msg: [Chain -> KNNImputerComponent] Transformation starting., Progress: N/A
STREAM_MSG (Train): [KNNImputerComponent] Status: in_progress, Msg: [Chain -> KNNImputerComponent] Starting KNN imputation., Progress: N/A
STREAM_MSG (Train): [KNNImputerComponent] Status: in_progress, Msg: [Chain -> KNNImputerComponent] KNN imputation applied if necessary., Progress: 1.0
STREAM_MSG (Train): [KNNImputerComponent] Status: completed, Msg:

  X_df[col] = X_df[col].fillna(False).astype(bool)
  X_df[col] = X_df[col].fillna(False).astype(bool)


STREAM_MSG (Val): [RawTransformer] Status: in_progress, Msg: [Chain -> RawTransformer] Raw transformations applied., Progress: 1.0
STREAM_MSG (Val): [RawTransformer] Status: completed, Msg: [Chain -> RawTransformer] Transformation complete., Progress: N/A
STREAM_MSG (Val): [ETLChain] Status: in_progress, Msg: Completed sub-step: RawTransformer. Shape after: (1739, 16), Progress: N/A
STREAM_MSG (Val): [ETLChain] Status: in_progress, Msg: Starting sub-step: KNNImputerComponent (2/5), Progress: N/A
STREAM_MSG (Val): [KNNImputerComponent] Status: starting, Msg: [Chain -> KNNImputerComponent] Transformation starting., Progress: N/A
STREAM_MSG (Val): [KNNImputerComponent] Status: in_progress, Msg: [Chain -> KNNImputerComponent] Starting KNN imputation., Progress: N/A
STREAM_MSG (Val): [KNNImputerComponent] Status: in_progress, Msg: [Chain -> KNNImputerComponent] KNN imputation applied if necessary., Progress: 1.0
STREAM_MSG (Val): [KNNImputerComponent] Status: completed, Msg: [Chain -> KNNIm

  X_df[col] = X_df[col].fillna(False).astype(bool)
  X_df[col] = X_df[col].fillna(False).astype(bool)


STREAM_MSG (Test): [RawTransformer] Status: in_progress, Msg: [Chain -> RawTransformer] Raw transformations applied., Progress: 1.0
STREAM_MSG (Test): [RawTransformer] Status: completed, Msg: [Chain -> RawTransformer] Transformation complete., Progress: N/A
STREAM_MSG (Test): [ETLChain] Status: in_progress, Msg: Completed sub-step: RawTransformer. Shape after: (4277, 16), Progress: N/A
STREAM_MSG (Test): [ETLChain] Status: in_progress, Msg: Starting sub-step: KNNImputerComponent (2/5), Progress: N/A
STREAM_MSG (Test): [KNNImputerComponent] Status: starting, Msg: [Chain -> KNNImputerComponent] Transformation starting., Progress: N/A
STREAM_MSG (Test): [KNNImputerComponent] Status: in_progress, Msg: [Chain -> KNNImputerComponent] Starting KNN imputation., Progress: N/A
STREAM_MSG (Test): [KNNImputerComponent] Status: in_progress, Msg: [Chain -> KNNImputerComponent] KNN imputation applied if necessary., Progress: 1.0
STREAM_MSG (Test): [KNNImputerComponent] Status: completed, Msg: [Chain 

Unnamed: 0,feature,rf_importance,mutual_info,pearson_corr
5,total_spent,0.061795,0.132651,-0.20306
6,max_spend_category,0.053743,0.084929,-0.366842
26,kmeans_dist_4,0.053212,0.102992,-0.402393
31,gmm_cluster_prob_4,0.05081,0.135684,0.312152
1,CryoSleep,0.047056,0.118921,0.462554
18,ica_5,0.044654,0.102559,-0.026516
15,ica_2,0.04233,0.075968,0.340686
25,kmeans_dist_3,0.035811,0.045384,0.277533
14,ica_1,0.034857,0.059122,0.20844
20,ica_7,0.031901,0.065266,0.226743



🧹 Least Informative Features (below threshold):
['ica_5', 'ica_2', 'kmeans_dist_3', 'ica_1', 'ica_7', 'ica_4', 'kmeans_dist_2', 'kmeans_dist_1', 'ica_6', 'ica_3', 'cabin_num', 'kmeans_dist_0', 'gmm_cluster_prob_3', 'gmm_cluster_prob_0', 'gmm_cluster_prob_2', 'Age', 'RoomService_tp25', 'RoomService_tp75', 'name_corr_neg_p_value', 'has_used_RoomService', 'gmm_cluster_prob_1', 'RoomService_tp50', 'name_corr_neg_prob', 'has_used_Spa', 'Spa_tp50', 'deck', 'Spa_tp75', 'Spa_tp25', 'VRDeck_tp25', 'HomePlanet', 'kmeans_cluster', 'has_used_VRDeck', 'name_corr_pos_prob', 'name_corr_pos_p_value', 'side', 'VRDeck_tp50', 'ShoppingMall_tp50', 'has_used_ShoppingMall', 'VRDeck_tp75', 'FoodCourt_tp75', 'ShoppingMall_tp25', 'Destination', 'ShoppingMall_tp75', 'FoodCourt_tp25', 'has_used_FoodCourt', 'FoodCourt_tp50', 'VIP']


Unnamed: 0,feature,rf_importance,mutual_info,pearson_corr
40,has_used_ShoppingMall,0.003929,0.033426,-0.269141
51,VRDeck_tp75,0.003914,0.048045,-0.290732
39,FoodCourt_tp75,0.003681,0.012627,-0.120046
41,ShoppingMall_tp25,0.003445,0.028103,-0.269141
2,Destination,0.003391,0.0106,-0.094455
43,ShoppingMall_tp75,0.003195,0.007285,-0.191316
37,FoodCourt_tp25,0.002359,0.028891,-0.230309
36,has_used_FoodCourt,0.002307,0.021268,-0.230309
38,FoodCourt_tp50,0.001974,0.033053,-0.230309
4,VIP,0.000706,0.0,-0.040546



🔁 RandomForest Accuracy after dropping: 0.7510
🔴 Degraded by -0.0219


In [None]:
# Project-local model factories and training / evaluation / logging helpers.
from utils.models import get_sklearn_model, get_nn_model
from utils.train import train_sklearn_model, train_nn_model, log_model_artifact, log_final_metrics, evaluate_model, \
                        conditionally_encode_labels
import mlflow
# NOTE: `os` is not imported in this cell; it relies on the `import os` in the notebook's first cell.
# W&B configuration through environment variables (base URL, debug flag, debug log path).
# They are set before `import wandb` below — presumably so the client sees them
# from the start; TODO confirm whether that ordering is actually required.
os.environ["WANDB_BASE_URL"] = "http://wandb:8080"
os.environ["WANDB_DEBUG"] = "true"
os.environ["WANDB_DEBUG_LOG_PATH"] = "/tmp/wandb_debug.log"
import wandb
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Define model sweep
print("defining sweep")


def _expand_grid(axes):
    """Expand ``{param: [value, ...]}`` into one dict per value combination.

    The first key varies slowest and the last key varies fastest; every
    produced dict keeps the key order of ``axes``.
    """
    combos = [{}]
    for key, values in axes.items():
        combos = [{**partial, key: value} for partial in combos for value in values]
    return combos


# Hyper-parameter candidates per model family. Entries that are commented out
# are families currently excluded from the sweep.
model_configs = {
    # "decision_tree": _expand_grid({
    #     "criterion": ["gini", "entropy"],
    #     "max_depth": [None, 5, 10],
    #     "min_samples_split": [2, 5],
    #     "min_samples_leaf": [1, 2],
    # }),

    "random_forest": _expand_grid({
        "n_estimators": [50, 200, 350],
        "max_depth": [None, 10, 15],
        "max_features": ["sqrt", "log2"],
        "min_samples_split": [2, 5],
    }),

    # "logistic_regression": _expand_grid({
    #     "penalty": ["l1", "l2"],
    #     "C": [0.01, 0.1, 1.0],
    #     "solver": ["liblinear"],
    #     "max_iter": [200],
    # }),

    "xgboost": _expand_grid({
        "n_estimators": [200],
        "max_depth": [5, 7],
        "learning_rate": [0.005, 0.1],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0],
    }),

    "lightgbm": _expand_grid({
        "n_estimators": [100, 200],
        "max_depth": [-1, 10],
        "learning_rate": [0.01, 0.1],
        "num_leaves": [31, 50],
        "min_child_samples": [10, 20],
    }),

    # "svm": _expand_grid({
    #     "C": [0.1, 1.0, 10.0],
    #     "kernel": ["linear", "rbf", "poly"],
    #     "gamma": ["scale", "auto"],
    # }),

    # "naive_bayes": _expand_grid({
    #     "var_smoothing": [1e-9, 1e-8, 1e-7],
    # }),

    # "knn": _expand_grid({
    #     "n_neighbors": [5, 7],
    #     "weights": ["uniform", "distance"],
    #     "metric": ["euclidean", "manhattan"],
    # }),

    # "neural_net": _expand_grid({
    #     "model_type": ["mlp", "lstm", "cnn"],
    #     "hidden": [32, 64],
    #     "dropout": [0.0, 0.3],
    #     "activation": ["relu", "tanh"],
    #     "batch_norm": [False, True],
    #     "lr": [.005, 0.001, .05, 0.01, .05],
    #     "num_layers": [1, 2, 3, 4],
    # }),
}

# Hyper-parameter sweep: one MLflow run and one W&B run per (model, params) pair.
#
# Reads from the kernel namespace: model_configs, etl_result, X_train, y_train,
# X_val, y_val (set by the cells above) plus task_type, perf_eval_metric,
# eval_metric_name and eval_metric_fn (not defined in the visible cells —
# presumably set earlier in the notebook).
#
# If anything inside a run raises, both trackers are closed with a failure
# status before the exception propagates, so a crashed configuration is not
# left open and later recorded as a successful run.
mlflow.set_experiment(f"kaggle_{local_dataset_name}")
print("starting experiments")
# Loop over each model and its hyperparam
for model_name, config_list in model_configs.items():
    for params in config_list:
        # Terminate any already running experiments (MLflow and W&B),
        # e.g. left over from an interrupted execution of this cell.
        if mlflow.active_run() is not None:
            mlflow.end_run()
        wandb.finish()

        # === Init Experiment | MLflow and W&B ===
        run_name = f"{etl_result['etl_version']}_{model_name}_{params}"
        run = mlflow.start_run(run_name=run_name)
        run_succeeded = False
        try:
            mlflow.log_param("model_name", model_name)
            mlflow.log_params(params)
            mlflow.log_param("task_type", task_type)
            mlflow.log_param("perf_eval_metric", perf_eval_metric)
            mlflow.log_param("etl_version", etl_result['etl_version'])
            mlflow.log_param("etl_description", etl_result['etl_description'])
            print("Started MLflow run:", run.info.run_id)
            wandb.init(
                project=f"kaggle_{local_dataset_name}",
                name=run_name,
                config={
                    **params,
                    "model_name": model_name,
                    "task_type": task_type,
                    "perf_eval_metric": perf_eval_metric,
                    "etl_version": etl_result['etl_version'],
                    "etl_description": etl_result['etl_description']
                }
            )

            # === Train ===
            if model_name == "neural_net":
                # A single output unit unless the task is multiclass.
                output_dim = y_train.nunique() if task_type == "multiclass_classification" else 1
                model, lr = get_nn_model(X_train.shape[1], output_dim=output_dim, **params)
                y_train_nn, y_val_nn, label_encoder, label_encoder_applied = conditionally_encode_labels(y_train, y_val)
                model, y_pred = train_nn_model(
                    model, X_train, y_train_nn, X_val, y_val_nn,
                    epochs=20,
                    lr=lr,
                    task_type=task_type,
                    eval_metric_name=eval_metric_name,
                    eval_metric_fn=eval_metric_fn,
                )
                # Bind the same name the sklearn branch uses, so the artifact
                # logging below refers to this run's model when re-enabled.
                trained_model = model
                if label_encoder_applied:
                    # Map predictions back to the original label values.
                    y_pred = label_encoder.inverse_transform(y_pred)
                print("✅ Finished training neural net")

            else:
                model = get_sklearn_model(model_name, **params)
                trained_model, y_pred = train_sklearn_model(
                    model, X_train, y_train, X_val, y_val,
                    task_type=task_type,
                )
                print("✅ Finished training sklearn model")

            # === Evaluate & log ===
            eval_metrics = evaluate_model(y_val, y_pred, task_type)
            log_final_metrics(eval_metrics)
            # Artifact logging is disabled to reduce writes to disk.
            # log_model_artifact(trained_model, model_name, framework="torch" if model_name == "neural_net" else "sklearn")
            run_succeeded = True
        finally:
            # === End Experiment | MLflow and W&B ===
            if run_succeeded:
                mlflow.end_run()
                wandb.finish()
            else:
                # Record the failure in both trackers; the original exception
                # continues to propagate after this block.
                mlflow.end_run(status="FAILED")
                wandb.finish(exit_code=1)


In [7]:
import pandas as pd
# Refresh project modules before importing from them.
# (reload_utils is not defined in this cell — presumably it comes from an earlier one.)
reload_utils()
from etl_chain import run_custom_etl_streaming
from utils.models import get_sklearn_model
from utils.train import train_sklearn_model
from utils.submission import create_submission_file

# --- 1. Run the ETL pipeline from etl_chain.py ---
# Re-runs the ETL with test_split=0.01 instead of the default used in the
# earlier cell (presumably the held-out fraction — confirm in etl_chain).
print(f"Starting ETL process for dataset: {local_dataset_name} with target: {target_column}")
try:
    etl_result = run_custom_etl_streaming(
        dataset_name=local_dataset_name,
        target_column=target_column,
        test_split=0.01,
    )
    # Absent keys come back as None; X_test and passenger_ids are checked
    # for None before the submission step.
    X_train, y_train, X_val, y_val, X_test, passenger_ids = map(
        etl_result.get,
        ("X_train", "y_train", "X_val", "y_val", "X_test", "passenger_ids"),
    )
except Exception as exc:
    # Report the failure in the cell output, then let it propagate unchanged.
    print(f"An error occurred during the ETL process: {exc}")
    raise

# --- 2. Define Model and Parameters ---
model_name = "lightgbm"
params = dict(
    learning_rate=0.1,
    max_depth=10,
    min_child_samples=10,
    n_estimators=200,
    num_leaves=31,
    random_state=42,
)

# --- 3. Train the model ---
print(f"\n--- Training {model_name} model ---")
model = get_sklearn_model(model_name, **params)

# Hand the trainer pandas Series targets: anything that is not already a
# Series gets wrapped in one (train_sklearn_model may expect that).
y_train = y_train if isinstance(y_train, pd.Series) else pd.Series(y_train)
y_val = y_val if isinstance(y_val, pd.Series) else pd.Series(y_val)

# `model` is rebound to whatever the trainer returns; the validation
# predictions are kept in y_pred_val.
model, y_pred_val = train_sklearn_model(
    model, X_train, y_train, X_val, y_val, task_type=task_type,
)
print(f"✅ Finished training {model_name} model.")

# --- 4. Predict on the test data and write the submission file ---
if X_test is None or passenger_ids is None:
    # Nothing can be written without both inputs; report each missing one.
    if X_test is None:
        print("\n⚠️ X_test is not available. Cannot generate submission file.")
    if passenger_ids is None:
        print("\n⚠️ passenger_ids are not available. Cannot generate submission file.")
else:
    print("\n--- Making predictions on test data ---")
    y_pred_test = model.predict(X_test)
    # The target is flagged as boolean exactly when the task is binary classification.
    target_is_boolean_flag = task_type == "binary_classification"

    create_submission_file(
        passenger_ids=passenger_ids,
        predictions=y_pred_test,
        target_column_name=target_column,
        output_filename="submission.csv",
        target_is_boolean=target_is_boolean_flag,
    )



🔁 Reloaded red_wine_quality.etl_chain, utils.etl, utils.eda, utils.eval, utils.submission
Starting ETL process for dataset: space-titanic with target: Transported
Attempting to load from: /home/jovyan/data/space-titanic/raw/train.csv
Dataset loaded successfully from /home/jovyan/data/space-titanic/raw/train.csv. Shape: (8693, 14)
Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')
--- Fitting pipeline on Training Data ---
Fitting step in chain: RawTransformer


  X_df[col] = X_df[col].fillna(False).astype(bool)
  X_df[col] = X_df[col].fillna(False).astype(bool)


Fitting step in chain: KNNImputerComponent
Fitting step in chain: NameWordFeatures
Fitting step in chain: FeatureGenerators
Fitting step in chain: ServiceFeaturesComponent

--- Transforming Training Data (Streaming) ---
STREAM_MSG (Train): [ETLChain] Status: starting, Msg: Transformation starting., Progress: N/A
STREAM_MSG (Train): [ETLChain] Status: starting, Msg: Chain transformation starting., Progress: N/A
STREAM_MSG (Train): [ETLChain] Status: in_progress, Msg: Starting sub-step: RawTransformer (1/5), Progress: N/A
STREAM_MSG (Train): [RawTransformer] Status: starting, Msg: [Chain -> RawTransformer] Transformation starting., Progress: N/A
STREAM_MSG (Train): [RawTransformer] Status: in_progress, Msg: [Chain -> RawTransformer] Starting raw transformations., Progress: N/A


  X_df[col] = X_df[col].fillna(False).astype(bool)
  X_df[col] = X_df[col].fillna(False).astype(bool)


STREAM_MSG (Train): [RawTransformer] Status: in_progress, Msg: [Chain -> RawTransformer] Raw transformations applied., Progress: 1.0
STREAM_MSG (Train): [RawTransformer] Status: completed, Msg: [Chain -> RawTransformer] Transformation complete., Progress: N/A
STREAM_MSG (Train): [ETLChain] Status: in_progress, Msg: Completed sub-step: RawTransformer. Shape after: (8606, 15), Progress: N/A
STREAM_MSG (Train): [ETLChain] Status: in_progress, Msg: Starting sub-step: KNNImputerComponent (2/5), Progress: N/A
STREAM_MSG (Train): [KNNImputerComponent] Status: starting, Msg: [Chain -> KNNImputerComponent] Transformation starting., Progress: N/A
STREAM_MSG (Train): [KNNImputerComponent] Status: in_progress, Msg: [Chain -> KNNImputerComponent] Starting KNN imputation., Progress: N/A
STREAM_MSG (Train): [KNNImputerComponent] Status: in_progress, Msg: [Chain -> KNNImputerComponent] KNN imputation applied if necessary., Progress: 1.0
STREAM_MSG (Train): [KNNImputerComponent] Status: completed, Msg:

  X_df[col] = X_df[col].fillna(False).astype(bool)
  X_df[col] = X_df[col].fillna(False).astype(bool)
  X_df[col] = X_df[col].fillna(False).astype(bool)
  X_df[col] = X_df[col].fillna(False).astype(bool)


STREAM_MSG (Test): [RawTransformer] Status: in_progress, Msg: [Chain -> RawTransformer] Raw transformations applied., Progress: 1.0
STREAM_MSG (Test): [RawTransformer] Status: completed, Msg: [Chain -> RawTransformer] Transformation complete., Progress: N/A
STREAM_MSG (Test): [ETLChain] Status: in_progress, Msg: Completed sub-step: RawTransformer. Shape after: (4277, 15), Progress: N/A
STREAM_MSG (Test): [ETLChain] Status: in_progress, Msg: Starting sub-step: KNNImputerComponent (2/5), Progress: N/A
STREAM_MSG (Test): [KNNImputerComponent] Status: starting, Msg: [Chain -> KNNImputerComponent] Transformation starting., Progress: N/A
STREAM_MSG (Test): [KNNImputerComponent] Status: in_progress, Msg: [Chain -> KNNImputerComponent] Starting KNN imputation., Progress: N/A
STREAM_MSG (Test): [KNNImputerComponent] Status: in_progress, Msg: [Chain -> KNNImputerComponent] KNN imputation applied if necessary., Progress: 1.0
STREAM_MSG (Test): [KNNImputerComponent] Status: completed, Msg: [Chain 