In [1]:
import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler 

# 1. Load a real dataset
# Fetch the dataset once and reuse the returned Bunch for the feature matrix,
# the target and the column names (a second fetch just for the names is redundant).
housing = fetch_california_housing()
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = housing.target

# 2. Artificially introduce missing values for the example
rng = np.random.RandomState(42)  # fixed seed so the same cells are blanked on every run
missing_rate = 0.3
# Create a mask for missing values (True means missing); same shape as X
mask = rng.rand(*X.shape) < missing_rate
X_missing = X.copy()  # keep the ground truth in X untouched
X_missing[mask] = np.nan

print("Number of missing values introduced:", X_missing.isnull().sum().sum())

# Scale the data before imputation to help convergence
scaler = StandardScaler()
# Fit and transform the missing data using the scaler
X_missing_scaled = scaler.fit_transform(X_missing)
# NOTE: We now work with the numpy array X_missing_scaled, not the pandas DataFrame X_missing

# 3. Define and apply the Iterative Imputer
# The IterativeImputer is still experimental, requiring the explicit
# `enable_iterative_imputer` import at the top of this cell.
# max_iter is set to 100 (above the library default) to address the convergence warning.
imputer = IterativeImputer(random_state=42, max_iter=100)

# Apply imputation on the SCALED data
X_imputed_scaled = imputer.fit_transform(X_missing_scaled)

# Convert the imputed, scaled numpy array back to original scale and into a pandas DataFrame
X_imputed_final = scaler.inverse_transform(X_imputed_scaled)  # back to original units
X_imputed = pd.DataFrame(X_imputed_final, columns=X.columns)

# NOTE(review): the imputer's predictions are unconstrained, so imputed cells can fall
# outside a feature's plausible range (the recorded output shows negative AveOccup
# values). Consider bounding or post-clipping if downstream code needs valid ranges.

print("Total missing values remaining:", X_imputed.isnull().sum().sum())
print("\nSnippet of imputed data (first 5 rows):\n", X_imputed.head())



Number of missing values introduced: 49430
Total missing values remaining: 0

Snippet of imputed data (first 5 rows):
      MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup   Latitude  \
0  8.325200      41.0  6.984127   1.023810   538.42184 -1.830732  37.354704   
1  8.301400      21.0  6.871610   0.971880  2401.00000 -0.715959  34.936508   
2  7.257400      52.0  8.288136   1.302720   496.00000 -0.042194  37.461994   
3  5.643100      52.0  6.379317   1.073059   558.00000  0.060120  37.850000   
4  4.622656      52.0  6.281853   1.081081   565.00000  2.919561  37.850000   

    Longitude  
0 -122.230000  
1 -119.591920  
2 -122.240000  
3 -122.352243  
4 -122.250000  


In [2]:
import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler 

# Step 1 - load the iris measurements into a labelled DataFrame.
# NOTE: this cell rebinds X, mask and X_imputed (also defined by the California
# housing cell), so any later cell reading those names sees the iris data.
data = load_iris()
X = pd.DataFrame(data=data.data, columns=data.feature_names)

# Step 2 - knock out roughly 30% of the cells at random (seeded for repeatability).
rng = np.random.RandomState(42)
missing_rate = 0.3
# Boolean array shaped like X; True marks a cell that will be blanked.
mask = rng.random_sample(size=X.shape) < missing_rate
# DataFrame.mask returns a new frame with NaN wherever the condition is True,
# leaving the ground truth in X untouched.
X_missing = X.mask(mask)

print("Number of missing values introduced:", X_missing.isna().sum().sum())

# Step 3 - standardise the features first; this helps the imputer converge.
# StandardScaler returns a plain numpy array, so the column labels are
# re-attached at the very end.
scaler = StandardScaler()
X_missing_scaled = scaler.fit_transform(X_missing)

# Step 4 - iterative imputation on the standardised data.
# IterativeImputer is experimental, hence the enable_iterative_imputer import above.
# max_iter was raised to 200 to address the convergence warning.
imputer = IterativeImputer(max_iter=200, random_state=42)
X_imputed_scaled = imputer.fit_transform(X_missing_scaled)

# Step 5 - undo the scaling and wrap the result in a DataFrame with the original columns.
X_imputed_final = scaler.inverse_transform(X_imputed_scaled)
X_imputed = pd.DataFrame(X_imputed_final, columns=X.columns)


print("Total missing values remaining:", X_imputed.isna().sum().sum())
print("\nSnippet of imputed data (first 5 rows):\n", X_imputed.head(5))



Number of missing values introduced: 181
Total missing values remaining: 0

Snippet of imputed data (first 5 rows):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           5.100000          3.500000           1.400000          0.200000
1           4.897792          3.225044           1.472040          0.200000
2           4.700000          3.200000           1.336796          0.200000
3           4.600000          3.117292           1.319896          0.196317
4           5.000000          3.600000           1.400000          0.308688


In [3]:
from sklearn.metrics import mean_squared_error
import numpy as np

# --- Evaluate the imputation: run AFTER an imputation cell has completed ---
# Reads three names from the notebook state:
#   X         - original DataFrame (ground truth)
#   mask      - boolean array, True where a value was removed
#   X_imputed - DataFrame produced by the imputer, in the original scale
# These come from whichever imputation cell ran most recently.

# Guard against stale or mismatched notebook state before computing any metric.
assert mask.shape == X.shape == X_imputed.shape, (
    f"shape mismatch: mask={mask.shape}, X={X.shape}, X_imputed={X_imputed.shape}"
)
assert mask.any(), "mask selects no cells; nothing to evaluate"

# 1. Isolate the original true values that were removed.
# DataFrame.where keeps values where the condition is True and puts NaN elsewhere.
# Plain `X[mask]` is avoided on purpose: indexing a DataFrame with a 2-D boolean
# ndarray is not a documented element-wise selection and can return whole rows,
# which would mix observed (zero-error) cells into the metric.
true_values = X.where(mask)

# 2. Isolate the imputed values at those exact same locations.
imputed_values = X_imputed.where(mask)

# 3. Calculate the performance metrics (aggregate, then per column).

## Aggregate RMSE across all features
# Flatten both frames so every cell is compared as one long vector.
true_flat = true_values.to_numpy().ravel()
imputed_flat = imputed_values.to_numpy().ravel()

# Drop the NaN placeholders so that only the artificially removed cells remain.
valid_comparison_mask = ~np.isnan(true_flat) & ~np.isnan(imputed_flat)
true_flat = true_flat[valid_comparison_mask]
imputed_flat = imputed_flat[valid_comparison_mask]

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(true_flat, imputed_flat))

print(f"\nOverall RMSE of imputed values vs true values: {rmse:.4f}")

## Per-column MAE over the removed cells.
# Computed for every column of the dataset currently in scope, rather than for one
# hard-coded column name that silently prints nothing when the dataset lacks it.
# .mean() skips NaN, so each column's average covers only its masked cells.
mae_by_column = (true_values - imputed_values).abs().mean()
for col_name, mae_col in mae_by_column.items():
    print(f"Mean Absolute Error for '{col_name}': {mae_col:.4f}")



Overall RMSE of imputed values vs true values: 0.3494
