Multiple Imputation by Chained Equations(MICE)

In [2]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

#load the dataset
df=sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
#check the missing values

df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
sex              0
pclass           0
survived         0
fare             0
parch            0
sibsp            0
class            0
adult_male       0
who              0
alive            0
alone            0
dtype: int64

In [4]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
'''#encode the data using label encoding
from sklearn.preprocessing import LabelEncoder
# columns to encode
columns_to_encode=['sex','embarked','class','who','adult_male','deck','embark_town','alive','alone']
encoder=LabelEncoder()
df['sex']=encoder.fit_transform(df['sex'])
df['embarked']=encoder.fit_transform(df['embarked'])
df['class']=encoder.fit_transform(df['class'])
df['who']=encoder.fit_transform(df['who'])
df['adult_male']=encoder.fit_transform(df['adult_male'])
df['deck']=encoder.fit_transform(df['deck'])
df['embark_town']=encoder.fit_transform(df['embark_town'])
df['alive']=encoder.fit_transform(df['alive'])
df['alone']=encoder.fit_transform(df['alone'])'''

# OR we can also do the same using for loop 
from sklearn.preprocessing import LabelEncoder
# columns to encode
columns_to_encode=['sex','embarked','class','who','adult_male','deck','embark_town','alive','alone']
#Dictionary to store LabelEncoders for each column
label_encoders={}
#loop to apply LabelEncoder for the column
for col in columns_to_encode:
    #create a new LabelEncoder for the column
    labele=LabelEncoder()
    #fit and transform the data
    df[col]=labele.fit_transform(df[col])
    #store the encoder in the dictionary
    label_encoders[col]=labele
df.head()

 

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,2,1,1,7,2,0,0
1,1,1,0,38.0,1,0,71.2833,0,0,2,0,2,0,1,0
2,1,3,0,26.0,0,0,7.925,2,2,2,0,7,2,1,1
3,1,1,0,35.0,1,0,53.1,2,0,2,0,2,2,1,0
4,0,3,1,35.0,0,0,8.05,2,2,1,1,7,2,0,1


In [6]:
# impute the missing values with IterativeImputer
# call the IterativeImputer(max_iter=10)
imputer=IterativeImputer(max_iter=10)
#impute missing values using IterativeImputer in a for loop for age,embark_town,embarked,deck
colums_to_impute=['age','embark_town','embarked','deck']
#loop the impute to impute each column
for col in colums_to_impute:
    df[col]=imputer.fit_transform(df[[col]])
#check the missing values
df.isnull().sum().sort_values(ascending=False)


survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [7]:
df.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2.0,2,1,1,7.0,2.0,0,0
1,1,1,0,38.0,1,0,71.2833,0.0,0,2,0,2.0,0.0,1,0
2,1,3,0,26.0,0,0,7.925,2.0,2,2,0,7.0,2.0,1,1
3,1,1,0,35.0,1,0,53.1,2.0,0,2,0,2.0,2.0,1,0
4,0,3,1,35.0,0,0,8.05,2.0,2,1,1,7.0,2.0,0,1


In [8]:
#inverse transform for encoded columns
for col in columns_to_encode:
    #Retrivethe correspponding LabelEncoder for the column
    le = label_encoders[col]
    #Inverse transform the data and convert to interger type
    df[col]=le.inverse_transform(df[col].astype(int))
df.head()




Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


##### Regression imputation
Regression imputation uses a regression model to predict the missing values based on other variables in the dataset.It works well for both categorical and numerical data.
Let's see how to implement imputation in Python using the Titanic dataset.

In [9]:
#load dataset
data=sns.load_dataset('titanic')

#check the number of missing values in each column
data.isnull().sum().sort_values(ascending=False)


deck           688
age            177
embarked         2
embark_town      2
sex              0
pclass           0
survived         0
fare             0
parch            0
sibsp            0
class            0
adult_male       0
who              0
alive            0
alone            0
dtype: int64

In [10]:
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


Random Forests for Imputing Missing Values
    
    

In [11]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error,mean_absolute_percentage_error,root_mean_squared_error
from sklearn.impute import SimpleImputer

#load the dataset
data=sns.load_dataset('titanic')

#check missing values in each column
df.isnull().sum().sort_values(ascending=False)



deck           688
embarked         2
embark_town      2
age              0
survived         0
pclass           0
sex              0
fare             0
parch            0
sibsp            0
class            0
adult_male       0
who              0
alive            0
alone            0
dtype: int64

In [12]:
# remove deck column
data.drop('deck', axis=1, inplace=True)

# check missing values in each column
data.isnull().sum().sort_values(ascending=False)

age            177
embarked         2
embark_town      2
sex              0
pclass           0
survived         0
parch            0
sibsp            0
class            0
fare             0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [13]:
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


In [14]:
#encode the data using label encoding
from sklearn.preprocessing import LabelEncoder
#column to encode
columns_to_encode=['sex','embarked','who','class','embark_town','alive']

#dictionary to store LabelEncoders for each column
label_encoders={}
#Loop to apply labellEncodder to each column
for col in columns_to_encode:
    #Create a new LabelEncoder for the column
    le=LabelEncoder()
    #Fit and transform the data ,then inverse transform it
    data[col]=le.fit_transform(data[col])
    #store the encoder in the dictionary
    label_encoders[col]=le
#check the first few rows of the DataFrame
data.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,2,1,True,2,0,False
1,1,1,0,38.0,1,0,71.2833,0,0,2,False,0,1,False
2,1,3,0,26.0,0,0,7.925,2,2,2,False,2,1,True
3,1,1,0,35.0,1,0,53.1,2,0,2,False,2,1,False
4,0,3,1,35.0,0,0,8.05,2,2,1,True,2,0,True


we have to first impute the missing values in the gae column before we can use it to predict the missing values in the embarked and emark_town columns.


In [15]:
#split the dataset into two parts :one with the missing values,one without
data_with_missing=data[data['age'].isna()]#isna is a Pandas function used to detect missing (NaN) values in a DataFrame or Series. 
#dropna removes all rows with missing values
data_without_missing=data.dropna()

In [16]:
print("The shape of the original dataset is:",data.shape)
print("The shape of the dataset with missing values removed is:",data_without_missing.shape)
print("The shape of the dataset with missing values is:",data_with_missing.shape)

The shape of the original dataset is: (891, 14)
The shape of the dataset with missing values removed is: (714, 14)
The shape of the dataset with missing values is: (177, 14)


In [17]:
data_with_missing.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
5,0,3,1,,0,0,8.4583,1,2,1,True,1,0,True
17,1,2,1,,0,0,13.0,2,1,1,True,2,1,True
19,1,3,0,,0,0,7.225,0,2,2,False,0,1,True
26,0,3,1,,0,0,7.225,0,2,1,True,0,0,True
28,1,3,0,,0,0,7.8792,1,2,2,False,1,1,True


In [18]:
data_without_missing.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,2,1,True,2,0,False
1,1,1,0,38.0,1,0,71.2833,0,0,2,False,0,1,False
2,1,3,0,26.0,0,0,7.925,2,2,2,False,2,1,True
3,1,1,0,35.0,1,0,53.1,2,0,2,False,2,1,False
4,0,3,1,35.0,0,0,8.05,2,2,1,True,2,0,True


In [19]:
#check the names of the columns
print(data.columns)


Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')


In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Separate features and target
X = data_without_missing.drop(['age'], axis=1)
y = data_without_missing['age']

#Identify categorical columns
#We check which columns are non-numeric (like names, categories, etc.)
#This is important because models like RandomForestRegressor only work with numeric data.

categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Create a ColumnTransformer to encode categorical columns
#OneHotEncoder: Converts categorical columns into binary columns (one column per unique value).
#ColumnTransformer: Applies this encoding only to categorical columns.
#remainder='passthrough': Keeps all non-categorical (numerical) columns unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'  # Keep numeric columns unchanged
)

# Fit and transform X
#fit_transform: learns how to transform the data and then actually transforms it.
#Now X_encoded is completely numeric, and ready for modeling.
X_encoded = preprocessor.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.20, random_state=42)

# Random Forest Imputation
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_model.predict(X_test)

print("RMSE for Random Forest Imputation: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 Score for Random Forest Imputation: ", r2_score(y_test, y_pred))
print("MAE for Random Forest Imputation: ", mean_absolute_error(y_test, y_pred))
print("MAPE for Random Forest Imputation: ", mean_absolute_percentage_error(y_test, y_pred))


RMSE for Random Forest Imputation:  11.081260589808045
R2 Score for Random Forest Imputation:  0.33769388288226154
MAE for Random Forest Imputation:  8.666661815622195
MAPE for Random Forest Imputation:  0.40839466096086574


In [21]:
print(X.dtypes)
print(X.select_dtypes(include=['object']).head())


survived         int64
pclass           int64
sex              int64
sibsp            int64
parch            int64
fare           float64
embarked         int64
class            int64
who              int64
adult_male        bool
embark_town      int64
alive            int64
alone             bool
dtype: object
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


In [22]:
#check the number of missing values in each column
data_with_missing.isnull().sum().sort_values(ascending=False)


age            177
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
embark_town      0
alive            0
alone            0
dtype: int64

In [23]:
#predict missing values
y_pred=rf_model.predict(data_with_missing.drop(['age'],axis=1))

In [24]:
y_pred

array([32.97658333, 35.64221825, 18.347     , 35.57148611, 20.65142857,
       26.7619855 , 36.648     , 18.63142857, 21.80633333, 33.55618169,
       31.06587652, 35.90741667, 18.63142857, 24.824     , 31.03      ,
       39.405     , 25.849     , 26.7619855 , 31.06587652, 19.41142857,
       31.06587652, 31.06587652, 26.7619855 , 26.27095821, 29.23514286,
       31.06587652, 48.25650595, 27.94      , 31.87071429, 31.99628481,
       30.015     , 20.85816667, 33.755     , 60.19168831, 26.00185714,
       26.24316667, 28.91733333, 49.31      , 28.55277778, 48.25650595,
       18.63142857, 20.85816667, 33.78929167, 26.7619855 , 26.63      ,
       32.01066667, 28.22883333, 28.55277778, 31.99628481, 29.72904762,
       48.25650595, 27.67733333, 56.26333333, 18.63142857, 34.65645944,
       60.44168831, 39.405     , 35.7725    , 18.63142857, 24.78266667,
       34.305     , 31.06587652, 31.602     , 20.85816667, 25.296     ,
       36.97133333, 26.7619855 , 24.85777778, 55.52      , 35.57

In [25]:
#remove warning
import warnings
warnings.filterwarnings('ignore')
#replace the missing values with the predicted values
data_with_missing['age']=y_pred
#check the missing values
data_with_missing.isnull().sum().sort_values(ascending=False)

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [27]:
#concatenate the two dataframes
data_complete =pd.concat([data_with_missing,data_without_missing],axis=0)
#print the shape of the complete dataframe
print("The shape of the complete dataframe is:",data_complete.shape)
#check the first 5 rows of the complete dataframe
data_complete.head()

The shape of the complete dataframe is: (891, 14)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
5,0,3,1,32.976583,0,0,8.4583,1,2,1,True,1,0,True
17,1,2,1,35.642218,0,0,13.0,2,1,1,True,2,1,True
19,1,3,0,18.347,0,0,7.225,0,2,2,False,0,1,True
26,0,3,1,35.571486,0,0,7.225,0,2,1,True,0,0,True
28,1,3,0,20.651429,0,0,7.8792,1,2,2,False,1,1,True


In [28]:
for col in columns_to_encode:
    #retrive the corresponding LabelEncoder for the column
    le=label_encoders[col]
    #inverse transform the data
    data_complete[col]=le.inverse_transform(data[col])
#check the first 5 rows of the complete dataframe
data_complete.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
5,0,3,male,32.976583,0,0,8.4583,S,Third,man,True,Southampton,no,True
17,1,2,female,35.642218,0,0,13.0,C,First,woman,True,Cherbourg,yes,True
19,1,3,female,18.347,0,0,7.225,S,Third,woman,False,Southampton,yes,True
26,0,3,female,35.571486,0,0,7.225,S,First,woman,True,Southampton,yes,True
28,1,3,male,20.651429,0,0,7.8792,S,Third,man,False,Southampton,no,True


In [35]:
#print the shape of the complete dataframe
print("The shape of the complete daaframe is:",data_complete.shape)

The shape of the complete daaframe is: (891, 14)


In [36]:
data_complete.to_csv('titanic_complete.csv',index=False)

In [38]:
#check the number of the missing values in each column
data_complete.isnull().sum().sort_values(ascending=False)

embark_town    2
embarked       2
sex            0
age            0
survived       0
pclass         0
parch          0
sibsp          0
class          0
fare           0
who            0
adult_male     0
alive          0
alone          0
dtype: int64