In [3]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import VarianceThreshold, mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# --- Load California housing dataset ---
# as_frame=True makes the loader expose the data as a pandas DataFrame
# through the `.frame` attribute.
housing = fetch_california_housing(as_frame=True)
df = housing.frame

# --- Introduce missing values for demonstration ---
# Blank out a few cells on an independent copy so that `df` itself keeps
# its original values for the sections that read it directly.
df_missing = df.copy(deep=True)
df_missing.iloc[:10, 0] = np.nan  # rows 0-9 of the column at position 0
df_missing.iloc[20:25, 5] = np.nan  # rows 20-24 of the column at position 5

# --- Imputation ---
# Each strategy below fills the NaNs of `df_missing` and wraps the returned
# array back into a DataFrame. Both the row index and the column labels are
# taken from `df_missing` itself, so every imputed frame stays aligned with
# the frame it was computed from even when that frame does not carry a
# default 0..n-1 index.
# A failure is reported and skipped so that one strategy cannot stop the
# others; in that case the matching `df_*_imputed` name is left unassigned.

# --- Imputation: Mean ---
try:
    mean_imputer = SimpleImputer(strategy='mean')
    df_mean_imputed = pd.DataFrame(
        mean_imputer.fit_transform(df_missing),
        index=df_missing.index,
        columns=df_missing.columns,
    )
except Exception as e:
    print("Mean imputation failed:", e)

# --- Imputation: Median ---
try:
    median_imputer = SimpleImputer(strategy='median')
    df_median_imputed = pd.DataFrame(
        median_imputer.fit_transform(df_missing),
        index=df_missing.index,
        columns=df_missing.columns,
    )
except Exception as e:
    print("Median imputation failed:", e)

# --- Imputation: KNN ---
try:
    # Each missing cell is estimated from the 3 nearest rows.
    knn_imputer = KNNImputer(n_neighbors=3)
    df_knn_imputed = pd.DataFrame(
        knn_imputer.fit_transform(df_missing),
        index=df_missing.index,
        columns=df_missing.columns,
    )
except Exception as e:
    print("KNN imputation failed:", e)

# --- Scaling ---
# Fit every scaler on `df` and collect the scaled frames by scaler name.
scalers = {
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler(),
}
scaled_dfs = {}
for name, scaler in scalers.items():
    try:
        # `df` is passed through unchanged. Replacing NaNs with 0 before
        # fitting would drag the fitted statistics toward zero; these
        # scalers skip NaNs while fitting and keep them in the output.
        scaled_data = scaler.fit_transform(df)
        # Reattach index and column labels so each scaled frame lines up
        # row-for-row with `df`.
        # NOTE(review): every column of `df` is scaled here, including the
        # "MedHouseVal" target if present -- confirm that is intended.
        scaled_dfs[name] = pd.DataFrame(
            scaled_data, index=df.index, columns=df.columns
        )
    except Exception as e:
        # Best effort: report the failing scaler and continue with the rest.
        print(f"{name} failed:", e)

# --- Feature Selection: Correlation ---
# Drop one column out of every pair of predictors whose absolute pairwise
# correlation exceeds 0.9.
try:
    # Full absolute correlation matrix (target included) for inspection.
    corr_matrix = df.corr().abs()
    # The redundancy check itself must look at predictors only: with the
    # target column in the comparison, a feature that correlates strongly
    # with "MedHouseVal" would cause the target itself to be dropped.
    upper = corr_matrix.drop(index="MedHouseVal", columns="MedHouseVal")
    # Keep the strict upper triangle (k=1) so each pair is examined once
    # and the diagonal of self-correlations is ignored.
    upper = upper.where(np.triu(np.ones(upper.shape, dtype=bool), k=1))
    to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]
    # Dropping from `df` keeps the target column in the filtered frame.
    df_uncorrelated = df.drop(columns=to_drop)
    print("Dropped due to high correlation:", to_drop)
except Exception as e:
    print("Correlation filtering failed:", e)

# --- Feature Selection: Mutual Information ---
# Keep the predictors whose estimated mutual information with the target
# "MedHouseVal" is above 0.01.
try:
    X = df.drop(columns=["MedHouseVal"])
    y = df["MedHouseVal"]
    # The estimator uses randomness internally; fixing random_state makes
    # the scores, and therefore the selected feature list, repeatable from
    # one run to the next.
    mi = mutual_info_regression(X.fillna(0), y, random_state=0)
    mi_scores = pd.Series(mi, index=X.columns)
    important_features = mi_scores[mi_scores > 0.01].index.tolist()
    # Re-attach the target so the selected frame is ready for modelling.
    df_mi_selected = df[important_features + ["MedHouseVal"]]
    print("Selected by MI:", important_features)
except Exception as e:
    print("Mutual information selection failed:", e)

# --- Feature Selection: Variance Threshold ---
# Discard every column of `df` whose variance does not exceed 0.01 and
# rebuild a labelled DataFrame from the columns that survive.
try:
    selector = VarianceThreshold(threshold=0.01)
    selector.fit(df.fillna(0))
    # `df_variance_selected` is the raw transformed array;
    # `df_var_selected` below is the same data with column labels attached.
    df_variance_selected = selector.transform(df.fillna(0))
    selected_columns = df.columns[selector.get_support(indices=True)]
    df_var_selected = pd.DataFrame(
        data=df_variance_selected,
        columns=selected_columns,
    )
    print("Selected by VarianceThreshold:", selected_columns.tolist())
except Exception as e:
    print("Variance threshold failed:", e)


Dropped due to high correlation: ['Longitude']
Selected by MI: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Selected by VarianceThreshold: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'MedHouseVal']
