In [14]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression

In [15]:
df = np.round(pd.read_csv('50_Startups.csv')[['R&D Spend','Administration','Marketing Spend','Profit']]/10000)
np.random.seed(9)

df = df.sample(5)
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
21,8.0,15.0,30.0,11.0
37,4.0,5.0,20.0,9.0
2,15.0,10.0,41.0,19.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0


In [16]:
df = df.iloc[:,0:-1]    # Remove the Target Column(Profit)
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,4.0,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


In [17]:
df.iloc[1,0] = np.NaN   # Introduce Missing Values
df.iloc[3,1] = np.NaN
df.iloc[-1,-1] = np.NaN

In [18]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,


## Iteration 0 :

In [19]:
# Step 1 - Impute all missing values with 'mean' of respective col

df0 = pd.DataFrame()

df0['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration'] = df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean())

In [24]:
df['R&D Spend'].mean()

9.25

In [25]:
df['Administration'].mean()

11.25

In [26]:
df['Marketing Spend'].mean()

29.25

In [27]:
# 0th Iteration
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,9.25,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


## Iteration 1 :

### PART A :-

In [None]:
# Remove the col1 imputed value
df1 = df0.copy()

df1.iloc[1,0] = np.NaN     # Move from Left to Right Column
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [None]:
# Use (Administration,Marketing Spend) to build a model & use (R&D Spend)for prediction

X = df1.iloc[[0,2,3,4],1:3]    # New Input Features
X

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.25,26.0
44,15.0,29.25


In [31]:
y = df1.iloc[[0,2,3,4],0]    # New Target
y

21     8.0
2     15.0
14    12.0
44     2.0
Name: R&D Spend, dtype: float64

In [None]:
# Precit the Missing value of col1 using other cols

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[1,1:].values.reshape(1,2))

array([23.14158651])

In [None]:
# Replace the NaN with predicted value 
df1.iloc[1,0] = 23.14
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


### PART B :-

In [None]:
# Remove the col2 imputed value
df1.iloc[3,1] = np.NaN
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,29.25


In [None]:
# Use (R&D Spend,Marketing Spend) to build a model & use (Administration) for prediction
X = df1.iloc[[0,1,2,4],[0,2]]  # New Input Features
X

Unnamed: 0,R&D Spend,Marketing Spend
21,8.0,30.0
37,23.14,20.0
2,15.0,41.0
44,2.0,29.25


In [None]:
y = df1.iloc[[0,1,2,4],1]   # New Target
y

21    15.0
37     5.0
2     10.0
44    15.0
Name: Administration, dtype: float64

In [38]:
# Precit the Missing value of col2 using other cols
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[3,[0,2]].values.reshape(1,2))

array([11.06331285])

In [40]:
# Replace the NaN with predicted value 
df1.iloc[3,1] = 11.06
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,29.25


### PART C :-

In [42]:
# Remove the col3 imputed value
df1.iloc[4,-1] = np.NaN
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,


In [None]:
# Use (R&D Spend,Administration) to build a model & use (Marketing Spend) for prediction
X = df1.iloc[0:4,0:2]   # New Input Features
X

Unnamed: 0,R&D Spend,Administration
21,8.0,15.0
37,23.14,5.0
2,15.0,10.0
14,12.0,11.06


In [None]:
y = df1.iloc[0:4,-1]   # New Target Variable
y

21    30.0
37    20.0
2     41.0
14    26.0
Name: Marketing Spend, dtype: float64

In [None]:
# Precit the Missing value of col3 using other cols
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[4,0:2].values.reshape(1,2))

array([31.56351448])

In [47]:
# Replace the NaN with predicted value 
df1.iloc[4,-1] = 31.56
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [None]:
# After 1st Iteration 
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [49]:
# Subtract 0th iteration from 1st iteration
df1 - df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,13.89,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-0.19,0.0
44,0.0,0.0,2.31


## Iteration 2 :

### PART A :-

In [50]:
df2 = df1.copy()

df2.iloc[1,0] = np.NaN
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [None]:
X = df2.iloc[[0,2,3,4],1:3]   # Input Features
y = df2.iloc[[0,2,3,4],0]     # Traget Variable

X

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.06,26.0
44,15.0,31.56


In [56]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[1,1:].values.reshape(1,2))

array([23.78627207])

In [57]:
df2.iloc[1,0] = 23.78
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


### PART B :-

In [59]:
df2.iloc[3,1] = np.NaN
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,31.56


In [None]:
X = df2.iloc[[0,1,2,4],[0,2]]   # Input Features
y = df2.iloc[[0,1,2,4],1]       # Target Variable
X

Unnamed: 0,R&D Spend,Marketing Spend
21,8.0,30.0
37,23.78,20.0
2,15.0,41.0
44,2.0,31.56


In [None]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[3,[0,2]].values.reshape(1,2))

In [61]:
df2.iloc[3,1] = 11.22
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


### PART C :-

In [62]:
df2.iloc[4,-1] = np.NaN
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,


In [63]:
X = df1.iloc[0:4,0:2]   # New Input Features
y = df1.iloc[0:4,-1]   # New Target Variable
X

Unnamed: 0,R&D Spend,Administration
21,8.0,15.0
37,23.14,5.0
2,15.0,10.0
14,12.0,11.06


In [64]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[4,0:2].values.reshape(1,2))

array([31.56351448])

In [65]:
df2.iloc[4,-1] = 31.56
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


In [66]:
# After 2nd Iteration :
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


In [67]:
# Subtract 1st iteration from 2nd iteration
df2 - df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.64,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.16,0.0
44,0.0,0.0,0.0


## Iteration 3 :

### PART A :-

In [68]:
df3 = df2.copy()

df3.iloc[1,0] = np.NaN
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


In [None]:
X = df3.iloc[[0,2,3,4],1:3]
y = df3.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[1,1:].values.reshape(1,2))

array([24.57698058])

In [70]:
df3.iloc[1,0] = 24.57
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,24.57,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


### PART B :

In [71]:
df3.iloc[3,1] = np.NaN
X = df3.iloc[[0,1,2,4],[0,2]]
y = df3.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[3,[0,2]].values.reshape(1,2))

array([11.37282844])

In [72]:
df3.iloc[3,1] = 11.37
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,24.57,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.37,26.0
44,2.0,15.0,31.56


### PART C :-

In [73]:
df3.iloc[4,-1] = np.NaN

X = df3.iloc[0:4,0:2]
y = df3.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[4,0:2].values.reshape(1,2))

array([45.53976417])

In [74]:
df3.iloc[4,-1] = 45.53
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,24.57,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.37,26.0
44,2.0,15.0,45.53


In [75]:
# After 3rd Iteration :
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,24.57,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.37,26.0
44,2.0,15.0,45.53


In [76]:
# Subtract 2nd Iteration from 3rd Iteration
df3 - df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.79,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.15,0.0
44,0.0,0.0,13.97


### NOTE :- 

Repeat the process until the difference between iteration is `0` or almost 0.

## MICE / Iterative Imputer :-

`MICE` => Multivariate Imputation by Chained Equation

---

`ASSUMPTION` : The Data should be Missing at Random (MAR)

`PROS :` Performance is Good (Accurate) 

`CONS :` Slow Method , High Memory Usage