In [1]:
import numpy as np 
import pandas as pd 

In [2]:
df = pd.read_csv('weatherAUS.csv')
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [3]:
df.isna().mean()*100

Date              0.000000
Location          0.000000
MinTemp           1.020899
MaxTemp           0.866905
Rainfall          2.241853
Evaporation      43.166506
Sunshine         48.009762
WindGustDir       7.098859
WindGustSpeed     7.055548
WindDir9am        7.263853
WindDir3pm        2.906641
WindSpeed9am      1.214767
WindSpeed3pm      2.105046
Humidity9am       1.824557
Humidity3pm       3.098446
Pressure9am      10.356799
Pressure3pm      10.331363
Cloud9am         38.421559
Cloud3pm         40.807095
Temp9am           1.214767
Temp3pm           2.481094
RainToday         2.241853
RainTomorrow      2.245978
dtype: float64

In [4]:
## Usage of Iterative Imputer --> Takes into relation of a feature with different features
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [5]:
alpha = {
    'A': [1, 3, 4, np.nan, 7],
    'B': [2, 6, 8, 3, np.nan],
    'C': [np.nan, 5, 7, 2, 9],
    'D': [4, np.nan, 1, 8, 2],
    'E': [7, 9, np.nan, 4, 6]
}

df = pd.DataFrame(alpha)
df

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,,4.0,7.0
1,3.0,6.0,5.0,,9.0
2,4.0,8.0,7.0,1.0,
3,,3.0,2.0,8.0,4.0
4,7.0,,9.0,2.0,6.0


In [6]:
# Usage of iterative imputer now
imp = IterativeImputer(max_iter = 10, random_state = 0)
# max_iter defines number of iterations done for imputing the values
itImpDf =  pd.DataFrame(imp.fit_transform(df), columns = df.columns)       
print("The new imputed dataframe now is: \n", itImpDf)
#  models each feature with missing values as a function of other features, and uses that estimate for imputation.

The new imputed dataframe now is: 
           A         B         C         D         E
0  1.000000  2.000000  2.694136  4.000000  7.000000
1  3.000000  6.000000  5.000000 -0.876619  9.000000
2  4.000000  8.000000  7.000000  1.000000  6.853369
3 -1.936319  3.000000  2.000000  8.000000  4.000000
4  7.000000  7.584799  9.000000  2.000000  6.000000


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
df = pd.read_csv('train2.csv',usecols=['Age','Fare','Survived'])
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [9]:
df.isnull().mean() * 100

Survived     0.00000
Age         19.86532
Fare         0.00000
dtype: float64

In [10]:
x = df.drop(columns=['Survived'])
print(x.head())
y = df['Survived']
y

    Age     Fare
0  22.0   7.2500
1  38.0  71.2833
2  26.0   7.9250
3  35.0  53.1000
4  35.0   8.0500


0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [11]:
x['Age Imputed'] = x['Age']
x

Unnamed: 0,Age,Fare,Age Imputed
0,22.0,7.2500,22.0
1,38.0,71.2833,38.0
2,26.0,7.9250,26.0
3,35.0,53.1000,35.0
4,35.0,8.0500,35.0
...,...,...,...
886,27.0,13.0000,27.0
887,19.0,30.0000,19.0
888,,23.4500,
889,26.0,30.0000,26.0


In [12]:
# Now using the concept of random imputation it is done by pandas only
x['Age Imputed'][x['Age Imputed'].isnull()] = x['Age'].dropna().sample((x['Age'].isnull().sum() )).values

In [13]:
x.drop(columns='Age', inplace= True)
x

Unnamed: 0,Fare,Age Imputed
0,7.2500,22.0
1,71.2833,38.0
2,7.9250,26.0
3,53.1000,35.0
4,8.0500,35.0
...,...,...
886,13.0000,27.0
887,30.0000,19.0
888,23.4500,24.0
889,30.0000,26.0


In [15]:
# This is usage of Missing Indicator which marks whether the value is missing or not
from sklearn.impute import MissingIndicator
X = np.array([[-1, -1, 1, 3],
              [4, -1, 0, -1],
              [8, -1, 1, 0]])
indicator = MissingIndicator(missing_values=-1)
mask_missing_values_only = indicator.fit_transform(X)
mask_missing_values_only

array([[ True,  True, False],
       [False,  True,  True],
       [False,  True, False]])

In [16]:
## Usage of KNN imputer
data2 = pd.read_csv('train2.csv')[['Age','Pclass','Fare','Survived']]

In [17]:
## This technique is more accurate 
# but is less efficient as more operations are required
data2.head()

Unnamed: 0,Age,Pclass,Fare,Survived
0,22.0,3,7.25,0
1,38.0,1,71.2833,1
2,26.0,3,7.925,1
3,35.0,1,53.1,1
4,35.0,3,8.05,0


In [18]:
X = df.drop(columns=['Survived'])
y = df['Survived']

In [25]:
## Now applying KNN imputer --> This is the best impuatation technique
from sklearn.impute import KNNImputer,SimpleImputer
knn = KNNImputer(n_neighbors=3,weights='distance')
x_new = knn.fit_transform(X)
# Here take n_neighbours to be the minimum value where accuracy is maximum, it is hit and try

In [26]:
newDf = pd.DataFrame(x_new, columns = X.columns)
newDf

Unnamed: 0,Age,Fare
0,22.000000,7.2500
1,38.000000,71.2833
2,26.000000,7.9250
3,35.000000,53.1000
4,35.000000,8.0500
...,...,...
886,27.000000,13.0000
887,19.000000,30.0000
888,18.666667,23.4500
889,26.000000,30.0000


In [None]:
### Now using Multivariate Imputation