In [None]:
# Derive the feature frame from train_dat, passing 'sii' as the label column.
# NOTE(review): separate_feature_space_label is defined outside this section; presumably it
# returns train_dat without the label column — confirm against its definition.
# NOTE(review): the shape recorded here (58 columns) disagrees with the 59 columns recorded
# for X_train after dropping 'sii' and 'id' — TODO confirm which figure is current.
feature_space_df = separate_feature_space_label(train_dat, label_col='sii') # (3960, 58)

In [None]:
# Model inputs: every column of train_dat except 'sii' (the target selected below) and 'id'.
X_train = train_dat.drop(columns=['sii', 'id'])  # author-recorded shape: (3960, 59)
# Target: the 'sii' column on its own.
y_train = train_dat.loc[:, 'sii']  # author-recorded shape: (3960, )

In [None]:
# Per-column count of missing values in X_train (displayed as the cell output).
X_train.isna().sum()

In [None]:
# Column names grouped by dtype family. The trailing counts are the author's recorded values.
num_features = list(
    X_train.select_dtypes(include=['int64', 'float64']).columns
)  # 48
cat_features = list(
    X_train.select_dtypes(include=['object', 'category']).columns
)  # 11

In [None]:
def handle_feature_space_missing_values(df, n_neighbors=5, output_path=None, mode=None):
    """
    Impute missing values in a feature DataFrame with KNNImputer, one-hot encoding categoricals.

    Columns of dtype ``object``/``category`` are one-hot encoded; every other column is
    handed to the imputer unchanged. A missing categorical entry is encoded as NaN across
    that column's indicator block (rather than as a category of its own), so the imputer
    fills it from neighbouring rows and the category with the largest imputed weight is
    decoded back. Columns that contain no observed value at all cannot be imputed: they are
    left as NaN and reported with a warning instead of breaking the column alignment.

    Args:
        df (pd.DataFrame): Input DataFrame. It is not modified.
        n_neighbors (int): Number of neighbors to use for KNN imputation.
        output_path (str): Directory to save the output file (optional). Created if absent.
        mode (str): Mode name used as the output file prefix (optional).

    Returns:
        pd.DataFrame: Imputed data with a fresh RangeIndex. Non-categorical columns come
        first (in their original order), followed by the categorical columns; every column
        is cast back to its original dtype.
    """
    # Imports are kept local so the function stays self-contained within the notebook.
    import os
    import warnings

    import numpy as np
    import pandas as pd
    from sklearn.impute import KNNImputer
    from sklearn.preprocessing import OneHotEncoder

    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Everything that is not categorical goes to the imputer as-is.
    design = df.drop(columns=categorical_columns).reset_index(drop=True)
    n_passthrough = design.shape[1]

    # Observed (non-missing) categories per categorical column; empty until encoded.
    real_categories = {col: np.array([], dtype=object) for col in categorical_columns}

    # Skip the encoder when there is nothing to encode: fitting it on zero features or
    # zero rows raises.
    if categorical_columns and len(df) > 0:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded = encoder.fit_transform(df[categorical_columns])

        indicator_blocks = []
        start = 0
        for col, cats in zip(categorical_columns, encoder.categories_):
            # With default encoder settings the output is the per-column indicator
            # blocks laid side by side, one indicator per entry of categories_.
            stop = start + len(cats)
            # The encoder gives NaN/None its own category. Drop that indicator and mark
            # the affected rows as NaN instead, so KNNImputer actually imputes them.
            is_real = ~pd.isna(cats)
            block = encoded[:, start:stop][:, is_real].astype(float)
            block[df[col].isna().to_numpy(), :] = np.nan
            indicator_blocks.append(
                pd.DataFrame(block, columns=[f"{col}_{cat}" for cat in cats[is_real]])
            )
            real_categories[col] = cats[is_real]
            start = stop
        design = pd.concat([design] + indicator_blocks, axis=1)
    print(f"Shape after encoding: {design.shape}")

    # KNNImputer drops features without any observed value, which would shift every
    # later column. Set those aside and leave them as NaN.
    empty_mask = design.isna().all(axis=0).to_numpy()
    fit_mask = ~empty_mask
    imputed_values = np.full(design.shape, np.nan)
    if fit_mask.any():
        knn_imputer = KNNImputer(n_neighbors=n_neighbors)
        imputed_values[:, fit_mask] = np.asarray(
            knn_imputer.fit_transform(design.loc[:, fit_mask])
        )
    print(f"Shape after imputation: {imputed_values.shape}")

    df_imputed = pd.DataFrame(
        imputed_values[:, :n_passthrough],
        columns=design.columns[:n_passthrough],
        index=design.index,
    )

    # Decode each indicator block positionally: the category with the largest imputed
    # weight wins (ties resolve to the first category, as argmax does).
    offset = n_passthrough
    for col in categorical_columns:
        cats = real_categories[col]
        width = len(cats)
        if width:
            block = imputed_values[:, offset:offset + width]
            df_imputed[col] = cats[np.argmax(block, axis=1)]
        else:
            # No observed category to choose from; the column stays missing.
            df_imputed[col] = np.full(len(design), np.nan, dtype=object)
        offset += width
    print(f"Shape after decoding: {df_imputed.shape}")

    unimputable = [str(name) for name in design.columns[empty_mask]]
    unimputable += [col for col in categorical_columns if len(real_categories[col]) == 0]
    if unimputable and len(df) > 0:
        warnings.warn(
            "Columns with no observed values were left as NaN: " + ", ".join(map(str, unimputable)),
            stacklevel=2,
        )

    # Ensure data types match the original DataFrame
    for col in df.columns:
        df_imputed[col] = df_imputed[col].astype(df[col].dtype)

    # Save the output to CSV if output_path is provided
    if output_path is not None:
        os.makedirs(output_path, exist_ok=True)
        csv_path = os.path.join(output_path, f"{mode}.handle.missing.values.feature.space.csv")
        df_imputed.to_csv(csv_path, index=False)

    return df_imputed

In [None]:
# Run the KNN-based imputation on the training feature frame; a CSV prefixed with 'train'
# is written under output_path.
df_imputed = handle_feature_space_missing_values(
    df=feature_space_df,
    n_neighbors=5,
    output_path=output_path,
    mode='train',
)

In [None]:
# Per-column count of values still missing after imputation (displayed as the cell output).
df_imputed.isna().sum()