In [1]:
# Question: Predictive Imputation Using Machine Learning
# Description: Use a simple predictive model to impute missing values in a column.

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Sample data: the None entries in 'A' are the values to be imputed
data = {
    'A': [1, 2, None, 4, 5, None, 7],
    'B': [10, 20, 30, 40, 50, 60, 70],
    'C': [5, 4, 3, 2, 1, 0, -1],
}

# Create a DataFrame
df = pd.DataFrame(data)

# Step 1: Identify rows where column 'A' has missing values.
# The mask is computed once and reused for the final assignment so the rows
# that are predicted and the rows that are written back are the same.
missing_mask = df['A'].isnull()
missing_values = df[missing_mask]

# Step 2: Prepare data for the predictive model
# We will use columns 'B' and 'C' to predict the missing values in 'A'

# Only rows with a known 'A' can be used for training
train_data = df.dropna(subset=['A'])

# Define the feature columns and target column
X_train = train_data[['B', 'C']]  # Features
y_train = train_data['A']         # Target (column with missing values)

# Step 3: Train a Random Forest Regressor
# A fixed random_state makes the imputed values reproducible between runs;
# without it the forest's randomness changes the predictions every time.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Steps 4 and 5: Predict the missing values and write them back.
# Guarded so the script does not fail on predict() with zero rows when
# column 'A' has nothing to impute.
if missing_mask.any():
    X_missing = missing_values[['B', 'C']]
    predicted_values = model.predict(X_missing)
    df.loc[missing_mask, 'A'] = predicted_values

print(df)


      A   B  C
0  1.00  10  5
1  2.00  20  4
2  3.23  30  3
3  4.00  40  2
4  5.00  50  1
5  5.80  60  0
6  7.00  70 -1
