### Importing Packages and Load Data

In [34]:
import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import LinearRegression 
from sklearn.impute import IterativeImputer

In [2]:
data = np.round(pd.read_csv('50_Startups.csv')[['R&D Spend','Administration','Marketing Spend','Profit']]/10000)

In [3]:
np.random.seed(9)
data = data.sample(5)

In [4]:
data.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
37,4.0,5.0,20.0,9.0
2,15.0,10.0,41.0,19.0
21,8.0,15.0,30.0,11.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0


In [5]:
data = data.iloc[:,0:-1]

In [6]:
data.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0
2,15.0,10.0,41.0
37,4.0,5.0,20.0


In [7]:
data.iloc[1,0] = np.nan
data.iloc[3,1] = np.nan
data.iloc[-1,-1] = np.nan

In [8]:
data.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
2,15.0,10.0,41.0
37,,5.0,20.0
14,12.0,,26.0
44,2.0,15.0,


### Iterative Imputation using Pandas

In [9]:
data0 = pd.DataFrame()

data0['R&D Spend'] = data['R&D Spend'].fillna(data['R&D Spend'].mean())
data0['Administration'] = data['Administration'].fillna(data['Administration'].mean())
data0['Marketing Spend'] = data['Marketing Spend'].fillna(data['Marketing Spend'].mean())

In [10]:
data0.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,9.25,5.0,20.0
44,2.0,15.0,29.25
21,8.0,15.0,30.0
14,12.0,11.25,26.0
2,15.0,10.0,41.0


In [11]:
data1 = data0.copy()

data1.iloc[1,0] = np.nan

In [12]:
data1.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
14,12.0,11.25,26.0
21,8.0,15.0,30.0
44,2.0,15.0,29.25
37,,5.0,20.0
2,15.0,10.0,41.0


In [13]:
X = data1.iloc[[0,2,3,4],1:3]
X

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.25,26.0
44,15.0,29.25


In [14]:
y = data.iloc[[0,2,3,4],0]
y

21     8.0
2     15.0
14    12.0
44     2.0
Name: R&D Spend, dtype: float64

In [15]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(data1.iloc[1,1:].values.reshape(1,2))



array([23.14158651])

In [16]:
data1.iloc[1,0] = 23.14

In [17]:
data1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [18]:
data1.iloc[3,1] = np.nan

data1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,29.25


In [19]:
# Use last 3 rows to build a model and use the first for prediction
X = data1.iloc[[0,1,2,4],[0,2]]
X

Unnamed: 0,R&D Spend,Marketing Spend
21,8.0,30.0
37,23.14,20.0
2,15.0,41.0
44,2.0,29.25


In [20]:
y = data1.iloc[[0,1,2,4],1]
y

21    15.0
37     5.0
2     10.0
44    15.0
Name: Administration, dtype: float64

In [21]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(data1.iloc[3,[0,2]].values.reshape(1,2))



array([11.06331285])

In [22]:
data1.iloc[3,1] = 11.06

In [23]:
data1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,29.25


In [24]:
data.iloc[4,-1] = np.nan

In [25]:
data1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,29.25


In [26]:
X = data1.iloc[0:4,0:2]
X

Unnamed: 0,R&D Spend,Administration
21,8.0,15.0
37,23.14,5.0
2,15.0,10.0
14,12.0,11.06


In [27]:
y = data1.iloc[0:4,-1]
y

21    30.0
37    20.0
2     41.0
14    26.0
Name: Marketing Spend, dtype: float64

In [29]:
lr = LinearRegression()
lr.fit(X,y)
lr.predict(data1.iloc[4,0:2].values.reshape(1,2))



array([31.56351448])

In [30]:
data1.iloc[4,-1] = 31.56

In [32]:
data1 - data0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,13.89,0.0,0.0
2,0.0,0.0,0.0
14,0.0,-0.19,0.0
44,0.0,0.0,2.31


### Iterative Imputation using SkLearn

In [41]:
imputer = IterativeImputer(max_iter=10, random_state=2)
X_imputed = imputer.fit_transform(data)

In [42]:
data

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,


In [43]:
X_imputed

array([[ 8.        , 15.        , 30.        ],
       [10.7122223 ,  5.        , 20.        ],
       [15.        , 10.        , 41.        ],
       [12.        ,  6.3280627 , 26.        ],
       [ 2.        , 15.        , 12.99045334]])