In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



In [2]:
# --- When to Use Regression Imputation and its Requirements ---
#
# Regression imputation fills in missing values of one variable (the target) with
# predictions from a regression model fitted on the rows where that variable is observed.
# It is best used when you believe there is a linear relationship between the target
# and the other variables in the dataset (the features).
#
# Requirements:
# 1.  The variables used to predict the missing values should not have missing data themselves.
#     If they do, those must be handled first.
# 2.  A linear relationship between the feature variables and the target variable should exist.
#     If the relationship is highly non-linear, this method may not be accurate.
# 3.  The data used for the regression model should be numerical. Categorical variables
#     would need to be encoded.




In [3]:
# 1. Load a sample dataset
# The California Housing dataset is obtained through scikit-learn's fetcher and
# wrapped in a DataFrame whose columns carry the dataset's feature names.
housing = fetch_california_housing()
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)

# The demonstration imputes 'MedInc' (Median Income) from 'HouseAge' and
# 'AveRooms', so only those three columns are previewed.
load_preview_columns = ['HouseAge', 'AveRooms', 'MedInc']
print("Original DataFrame head:")
print(df.loc[:, load_preview_columns].head())
print("\n")



Original DataFrame head:
   HouseAge  AveRooms  MedInc
0      41.0  6.984127  8.3252
1      21.0  6.238137  8.3014
2      52.0  8.288136  7.2574
3      52.0  5.817352  5.6431
4      52.0  6.281853  3.8462




In [4]:
# 2. Artificially create missing values in the 'MedInc' column
# Knocking out known values lets us demonstrate the technique and later compare
# the imputed values against the true ones.
missing_fraction = 0.1
df_missing = df.copy()  # work on a copy so the original frame keeps the true values

# Seed NumPy's global generator so the same rows are selected on every run.
np.random.seed(42)
n_rows_to_blank = int(len(df_missing) * missing_fraction)
missing_indices = np.random.choice(
    df_missing.index,
    size=n_rows_to_blank,
    replace=False,  # each row label is picked at most once
)
df_missing.loc[missing_indices, 'MedInc'] = np.nan

n_blank_medinc = df_missing['MedInc'].isnull().sum()
print(f"Number of missing values created in 'MedInc': {n_blank_medinc}")
print("DataFrame head with missing values:")
print(df_missing.loc[:, ['HouseAge', 'AveRooms', 'MedInc']].head())
print("\n")



Number of missing values created in 'MedInc': 2064
DataFrame head with missing values:
   HouseAge  AveRooms  MedInc
0      41.0  6.984127  8.3252
1      21.0  6.238137  8.3014
2      52.0  8.288136  7.2574
3      52.0  5.817352     NaN
4      52.0  6.281853  3.8462




In [5]:
# 3. Apply Regression Imputation
# 'MedInc' is modelled as a linear function of 'HouseAge' and 'AveRooms'.
features = ['HouseAge', 'AveRooms']

# Split the rows by whether 'MedInc' is observed: observed rows train the model,
# rows with a missing 'MedInc' are the ones to be predicted.
medinc_observed = df_missing['MedInc'].notna()
df_with_medinc = df_missing[medinc_observed]
df_to_impute = df_missing[~medinc_observed]

# Feature matrix / target for fitting, and the feature matrix to predict on
X_train, y_train = df_with_medinc[features], df_with_medinc['MedInc']
X_to_impute = df_to_impute[features]

# Fit ordinary linear regression on the complete rows (fit() returns the estimator)
lr = LinearRegression().fit(X_train, y_train)

# One prediction per row of X_to_impute, in the same row order
predicted_medinc = lr.predict(X_to_impute)



In [6]:
# 4. Fill the missing values with the predictions
# A copy keeps df_missing intact; the boolean mask selects exactly the rows whose
# 'MedInc' is NaN, and the predictions are written into them in row order.
df_imputed = df_missing.copy()
medinc_is_missing = df_imputed['MedInc'].isna()
df_imputed.loc[medinc_is_missing, 'MedInc'] = predicted_medinc



In [7]:
# 5. Show the result
result_preview = df_imputed.loc[:, ['HouseAge', 'AveRooms', 'MedInc']].head()
print("DataFrame head after regression imputation:")
print(result_preview)
print("\n")

# Sanity check: the imputed column should contain no NaN any more
n_still_missing = df_imputed['MedInc'].isnull().sum()
print(f"Number of missing values in 'MedInc' after imputation: {n_still_missing}")
print("\n")

# Optional: Evaluate the imputation quality
# The true values are still available in the untouched `df`, so the imputed
# values can be scored against them on exactly the rows that were blanked out.
original_values = df.loc[missing_indices, 'MedInc']
imputed_values = df_imputed.loc[missing_indices, 'MedInc']

mse = mean_squared_error(original_values, imputed_values)
print(f"Mean Squared Error of the imputation: {mse:.4f}")

# Side-by-side view of original vs. imputed values for the first 5 blanked-out rows
first_original = original_values.head(5)
first_imputed = imputed_values.head(5)
comparison_df = pd.DataFrame(
    {
        'Original MedInc': first_original,
        'Imputed MedInc': first_imputed,
    }
)
print("\nComparison of Original vs. Imputed values:")
print(comparison_df)

DataFrame head after regression imputation:
   HouseAge  AveRooms    MedInc
0      41.0  6.984127  8.325200
1      21.0  6.238137  8.301400
2      52.0  8.288136  7.257400
3      52.0  5.817352  3.701051
4      52.0  6.281853  3.846200


Number of missing values in 'MedInc' after imputation: 0


Mean Squared Error of the imputation: 3.3367

Comparison of Original vs. Imputed values:
       Original MedInc  Imputed MedInc
20046           1.6812        3.624473
3024            2.5313        3.765791
15663           3.4801        3.272739
20484           5.7376        4.172719
9814            3.7250        3.826676
