<a href="https://colab.research.google.com/github/astrovishalthakur/MachineLearning/blob/main/FeatureEngineering/HandlingMissingData/Iterative_Imputer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MICE stands for Multivariate Imputation by Chained Equations

Assumptions:
1. Missing completely at random.(We can not get the missing data back)
2. Missing at random.(We can fill the missing data using data present)
3. Missing not at random.(data was removed delibrately)

# MICE is used when data is MAR. 
## Missing at random means we can fill missing data using data from other columns.

advantage: high accuracy.<br>
disadvantage: slow, high mem usage.

# MICE is always implemented on input columns.

In [1]:
import pandas as pd
import numpy as np

In [2]:
url = "https://raw.githubusercontent.com/astrovishalthakur/100-days-of-machine-learning/main/day40-iterative-imputer/50_Startups.csv"

In [3]:
df = pd.read_csv(url)

In [4]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
from sklearn.linear_model import LinearRegression

In [6]:
df = np.round(df[["R&D Spend", "Administration", "Marketing Spend", "Profit"]]/1000)

In [7]:
np.random.seed(9)

In [8]:
df = df.sample(5)
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
21,78.0,154.0,300.0,111.0
37,44.0,51.0,197.0,90.0
2,153.0,101.0,408.0,191.0
14,120.0,157.0,257.0,133.0
44,22.0,155.0,28.0,65.0


In [9]:
df = df.iloc[:, 0:-1]
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,44.0,51.0,197.0
2,153.0,101.0,408.0
14,120.0,157.0,257.0
44,22.0,155.0,28.0


In [10]:
df.iloc[1,0] = np.NaN
df.iloc[3,1] = np.NaN
df.iloc[-1,-1] = np.NaN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports un

In [11]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,,51.0,197.0
2,153.0,101.0,408.0
14,120.0,,257.0
44,22.0,155.0,


In [12]:
# Step 1 - Impute all missing values with mean of respective col

df0 = pd.DataFrame()

df0['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration'] = df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean())


In [13]:
# 0th Iteration
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,93.25,51.0,197.0
2,153.0,101.0,408.0
14,120.0,115.25,257.0
44,22.0,155.0,290.5


In [14]:
# Remove the col1 imputed value
df1 = df0.copy()

df1.iloc[1,0] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,,51.0,197.0
2,153.0,101.0,408.0
14,120.0,115.25,257.0
44,22.0,155.0,290.5


In [15]:
# Use first 3 rows to build a model and use the last for prediction

X = df1.iloc[[0,2,3,4],1:3]
X

Unnamed: 0,Administration,Marketing Spend
21,154.0,300.0
2,101.0,408.0
14,115.25,257.0
44,155.0,290.5


In [16]:
y = df1.iloc[[0,2,3,4],0]
y

21     78.0
2     153.0
14    120.0
44     22.0
Name: R&D Spend, dtype: float64

In [17]:
lr = LinearRegression()
lr.fit(X,y)
k = lr.predict(df1.iloc[1,1:].values.reshape(1,2))

  "X does not have valid feature names, but"


In [18]:
k = np.round(k[0], 2)
k

230.51

In [19]:
df1.iloc[1, 0] = k

In [20]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,230.51,51.0,197.0
2,153.0,101.0,408.0
14,120.0,115.25,257.0
44,22.0,155.0,290.5


In [21]:
# Remove the col2 imputed value

df1.iloc[3,1] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,230.51,51.0,197.0
2,153.0,101.0,408.0
14,120.0,,257.0
44,22.0,155.0,290.5


In [22]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[[0,1,2,4],[0,2]]
X

Unnamed: 0,R&D Spend,Marketing Spend
21,78.0,300.0
37,230.51,197.0
2,153.0,408.0
44,22.0,290.5


In [23]:
y = df1.iloc[[0,1,2,4],1]
y

21    154.0
37     51.0
2     101.0
44    155.0
Name: Administration, dtype: float64

In [24]:
lr = LinearRegression()
lr.fit(X,y)
k = lr.predict(df1.iloc[3,[0,2]].values.reshape(1,2))

  "X does not have valid feature names, but"


In [25]:
k = np.round(k[0], 2)
k

113.56

In [26]:
df1.iloc[3, 1] = k

In [27]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,230.51,51.0,197.0
2,153.0,101.0,408.0
14,120.0,113.56,257.0
44,22.0,155.0,290.5


In [28]:
# Remove the col3 imputed value
df1.iloc[4,-1] = np.NaN

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,230.51,51.0,197.0
2,153.0,101.0,408.0
14,120.0,113.56,257.0
44,22.0,155.0,


In [29]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[0:4,0:2]
X

Unnamed: 0,R&D Spend,Administration
21,78.0,154.0
37,230.51,51.0
2,153.0,101.0
14,120.0,113.56


In [30]:
y = df1.iloc[0:4,-1]
y

21    300.0
37    197.0
2     408.0
14    257.0
Name: Marketing Spend, dtype: float64

In [31]:
lr = LinearRegression()
lr.fit(X,y)
k = lr.predict(df1.iloc[4,0:2].values.reshape(1,2))

  "X does not have valid feature names, but"


In [32]:
k = np.round(k[0],2)

In [33]:
k

264.14

In [34]:
df1.iloc[4,-1] = k

In [35]:
# After 1st Iteration
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,230.51,51.0,197.0
2,153.0,101.0,408.0
14,120.0,113.56,257.0
44,22.0,155.0,264.14


In [36]:
# Subtract 0th iteration from 1st iteration

df1 - df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,137.26,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-1.69,0.0
44,0.0,0.0,-26.36


In [37]:
df2 = df1.copy()

df2.iloc[1,0] = np.NaN

df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,,51.0,197.0
2,153.0,101.0,408.0
14,120.0,113.56,257.0
44,22.0,155.0,264.14


In [38]:
X = df2.iloc[[0,2,3,4],1:3]
y = df2.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
k = lr.predict(df2.iloc[1,1:].values.reshape(1,2))

  "X does not have valid feature names, but"


In [39]:
k = np.round(k[0],2)
k

205.65

In [40]:
df2.iloc[1,0] = k

In [41]:
df2.iloc[3,1] = np.NaN
X = df2.iloc[[0,1,2,4],[0,2]]
y = df2.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
k = lr.predict(df2.iloc[3,[0,2]].values.reshape(1,2))

  "X does not have valid feature names, but"


In [42]:
k = np.round(k[0],2)
k

108.32

In [43]:
df2.iloc[3,1] = k

In [44]:
df2.iloc[4,-1] = np.NaN

X = df2.iloc[0:4,0:2]
y = df2.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
k = lr.predict(df2.iloc[4,0:2].values.reshape(1,2))

  "X does not have valid feature names, but"


In [45]:
k = np.round(k[0], 2)
k

37.15

In [46]:
df2.iloc[4,-1] = k

In [47]:
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,205.65,51.0,197.0
2,153.0,101.0,408.0
14,120.0,108.32,257.0
44,22.0,155.0,37.15


In [48]:
df2-df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,-24.86,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-5.24,0.0
44,0.0,0.0,-226.99


In [49]:
df3 = df2.copy()

df3.iloc[1,0] = np.NaN

df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,,51.0,197.0
2,153.0,101.0,408.0
14,120.0,108.32,257.0
44,22.0,155.0,37.15


In [50]:
X = df3.iloc[[0,2,3,4],1:3]
y = df3.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
k = lr.predict(df3.iloc[1,1:].values.reshape(1,2))

  "X does not have valid feature names, but"


In [51]:
k = np.round(k[0], 2)
k

166.79

In [52]:
df3.iloc[1,0] = k

In [53]:
df3.iloc[3,1] = np.NaN
X = df3.iloc[[0,1,2,4],[0,2]]
y = df3.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
k = lr.predict(df3.iloc[3,[0,2]].values.reshape(1,2))

  "X does not have valid feature names, but"


In [54]:
k = np.round(k[0], 2)
k

105.18

In [55]:
df3.iloc[3,1] = k

In [56]:
df3.iloc[4,-1] = np.NaN

X = df3.iloc[0:4,0:2]
y = df3.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
k = lr.predict(df3.iloc[4,0:2].values.reshape(1,2))

  "X does not have valid feature names, but"


In [57]:
k = np.round(k[0], 2)
k

4.31

In [58]:
df3.iloc[4,-1] = k

In [59]:
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,78.0,154.0,300.0
37,166.79,51.0,197.0
2,153.0,101.0,408.0
14,120.0,105.18,257.0
44,22.0,155.0,4.31


In [60]:
df3 - df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,-38.86,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-3.14,0.0
44,0.0,0.0,-32.84


# Theoretically, We keep doing this until difference becomes 0.