####  import the packages:

In [1]:
import pandas as pd
import numpy as np

#### create dictionary:

In [2]:
dict1={'Names':['Aravind','Samar',np.nan,'Siri'],
       'Age':[np.nan,21,32,43],
       'City':['Hyd','Blr','Chennai',np.nan]}
dict1
# np.nan means not a number

{'Names': ['Aravind', 'Samar', nan, 'Siri'],
 'Age': [nan, 21, 32, 43],
 'City': ['Hyd', 'Blr', 'Chennai', nan]}

#### To convert dictionary to DataFrame:

In [6]:
pd.DataFrame(dict1)

Unnamed: 0,Names,Age,City
0,Aravind,,Hyd
1,Samar,21.0,Blr
2,,32.0,Chennai
3,Siri,43.0,


In [7]:
d1=pd.DataFrame(dict1)
d1.dtypes     # returns datatype of each column in the dataframe

Names     object
Age      float64
City      object
dtype: object

In [8]:
# np.nan is the culprit, changed np.nan to None
dict2={'Names':['Aravind','Samar',None,'Siri'],
       'Age':[np.nan,21,32,43],
       'City':['Hyd','Blr','Chennai',None]}
pd.DataFrame(dict2)

Unnamed: 0,Names,Age,City
0,Aravind,,Hyd
1,Samar,21.0,Blr
2,,32.0,Chennai
3,Siri,43.0,


In [10]:
d1.isnull()   # returns True in place of None/np.nan, at other places returns False

Unnamed: 0,Names,Age,City
0,False,True,False
1,False,False,False
2,True,False,False
3,False,False,True


In [12]:
d1.isnull().sum()   # every column has value 1

Names    1
Age      1
City     1
dtype: int64

In [14]:
# How much percentage of data missed?

d1.isnull().sum()*100/len(d1)

Names    25.0
Age      25.0
City     25.0
dtype: float64

- np.nan: not a number, this applicable for numerical columns only
- Generally data has some empty rows
- It is a data problem
- If you read that kind of data you will see Null
- You need to understand it is a data corrupted or really a null values are there

#### Method-1:
#### Fill with some random values
#### Method name: fillna

In [None]:
# Every null value is filling with 40 in the below problem

In [15]:
d1.fillna(40)   # null values fill with 40 value in the d1 dataframe

Unnamed: 0,Names,Age,City
0,Aravind,40.0,Hyd
1,Samar,21.0,Blr
2,40,32.0,Chennai
3,Siri,43.0,40


#### Method-2
- We can fill the values with respect to columns also

In [17]:
d1['Age'].fillna(40)
d1
# it will not updated
# because inplace=False

Unnamed: 0,Names,Age,City
0,Aravind,,Hyd
1,Samar,21.0,Blr
2,,32.0,Chennai
3,Siri,43.0,


In [18]:
d1['Age'].fillna(40,inplace=True)
d1
# Age column is updated with 40 inplace of NaN in the DataFrame d1

Unnamed: 0,Names,Age,City
0,Aravind,40.0,Hyd
1,Samar,21.0,Blr
2,,32.0,Chennai
3,Siri,43.0,


In [20]:
d1['Names'].fillna('Anu',inplace=True)
d1   
# Names column is updated with Anu inplace of NaN in the d1

Unnamed: 0,Names,Age,City
0,Aravind,40.0,Hyd
1,Samar,21.0,Blr
2,Anu,32.0,Chennai
3,Siri,43.0,


In [21]:
d1['City'].fillna('Austin',inplace=True)
d1
# City column is updataed with Austin inplace of NaN in the d1

Unnamed: 0,Names,Age,City
0,Aravind,40.0,Hyd
1,Samar,21.0,Blr
2,Anu,32.0,Chennai
3,Siri,43.0,Austin


In [22]:
# we filled all the NaN values with our assigned values in all the columns 
d1

Unnamed: 0,Names,Age,City
0,Aravind,40.0,Hyd
1,Samar,21.0,Blr
2,Anu,32.0,Chennai
3,Siri,43.0,Austin


In [24]:
# Read the data again
dict3={'Names':['Aravind','Samar',np.nan,'Siri'],
       'Age':[np.nan,21,32,43],
       'City':['Hyd','Blr','Chennai',np.nan]}
d3=pd.DataFrame(dict3)
d3

Unnamed: 0,Names,Age,City
0,Aravind,,Hyd
1,Samar,21.0,Blr
2,,32.0,Chennai
3,Siri,43.0,


#### Method-3
- bfill
- ffill
- pad
- backfill

In [26]:
d3.fillna(method='bfill')  # it will fill with below value or next value
# column1: Names index 3 is missing values, it is filled with index 4 value
# column2: Age index 1 is missing value, it is filled with index 2 value
# column3: City index 4 is missing values, it should be filled with next value, but we don't have next value

  d3.fillna(method='bfill')  # it will fill with below value or next value


Unnamed: 0,Names,Age,City
0,Aravind,21.0,Hyd
1,Samar,21.0,Blr
2,Siri,32.0,Chennai
3,Siri,43.0,


In [28]:
d3.fillna(method='bfill',axis=1)
# axis=1 means columns
# so NaN values filled with next column values

  d3.fillna(method='bfill',axis=1)


Unnamed: 0,Names,Age,City
0,Aravind,Hyd,Hyd
1,Samar,21.0,Blr
2,32.0,32.0,Chennai
3,Siri,43.0,


In [29]:
d3.fillna(method='ffill') # it will fill with above value or previous value

  d3.fillna(method='ffill') # it will fill with above value or previous value


Unnamed: 0,Names,Age,City
0,Aravind,,Hyd
1,Samar,21.0,Blr
2,Samar,32.0,Chennai
3,Siri,43.0,Chennai


In [30]:
d3.fillna(method='pad')

  d3.fillna(method='pad')


Unnamed: 0,Names,Age,City
0,Aravind,,Hyd
1,Samar,21.0,Blr
2,Samar,32.0,Chennai
3,Siri,43.0,Chennai


In [31]:
d3.fillna(method='backfill')

  d3.fillna(method='backfill')


Unnamed: 0,Names,Age,City
0,Aravind,21.0,Hyd
1,Samar,21.0,Blr
2,Siri,32.0,Chennai
3,Siri,43.0,


**Note:**
- backfill and bfill fill with below value or next value
- pad and ffill fill with above value or previous value
- but it will change based on axis
- axis did not mentioned means it is 0, so NaN values fill with next or previous row values based on methods
- axis=1 , NaN values fill with next or previous column values based on methods

#### Method-4
- mean
    - Numerical values can fill with mean value
    - but mean affect the outliers
    - If we don't have outliers it is best one
- median
    - Numerical values can fill with median value
    - we know that median value does not affect with outliers
    - so if outliers are there we can go with median
- mode
    - mode is useful for categorical data

In [32]:
# Read the data again
dict3={'Names':['Aravind','Samar',np.nan,'Siri'],
       'Age':[np.nan,21,32,43],
       'City':['Hyd','Blr','Chennai',np.nan]}
d3=pd.DataFrame(dict3)
d3

Unnamed: 0,Names,Age,City
0,Aravind,,Hyd
1,Samar,21.0,Blr
2,,32.0,Chennai
3,Siri,43.0,


In [None]:
# we already know we can fill with some values based on specific column using fill
# d1.fillna(<random number>)
# d1 represents all the columns

In [37]:
age_mean=d3['Age'].mean()
d3['Age'].fillna(age_mean)

0    32.0
1    21.0
2    32.0
3    43.0
Name: Age, dtype: float64

In [42]:
age_median=d3['Age'].median()
d3['Age'].fillna(age_median)

0    32.0
1    21.0
2    32.0
3    43.0
Name: Age, dtype: float64

In [39]:
age_mode=d3['Age'].mode()
d3['Age'].fillna(age_mode)

0    21.0
1    21.0
2    32.0
3    43.0
Name: Age, dtype: float64

In [40]:
d3['City'].mode()
# In this usecase we are getting mode equal, but in real time scenario we will get clear mode value

0        Blr
1    Chennai
2        Hyd
Name: City, dtype: object

- Fill with random value: example: d1.fillna(40)
- Fill with random value on specific value: example: d1['Names'].fillna('Anu',inplace=True)
- Fill with methods: bfill, ffill, pad, backfill: example: d1.fillna(method='bfill')
- Fill with: mean, median, mode: example: age_median=d1['Age'].median()
-                                         d1['Age'].fillna(age_median)

#### Method-5
**KNN Imputer**
- KNN: K nearest neighbors
- K is a hyper parameter means user can choose
- It is distance metric:**Euclidean distance**
- KNN Imputer will take the mean of the neighbors value
- The neighbors value can provided by using value=K
- It is under sklearn package
- Under sklearn we have impute method
- Under impute we have KnnImputer
- Applicable only for numerical data

<img src="https://hlab.stanford.edu/brian/making7.gif" jsaction="VQAsE" class="sFlh5c pT0Scc iPVvYb" style="max-width: 668px; height: 298px; margin: 0px; width: 353px;" alt="Measuring Dis/Similarities" jsname="kn3ccd" data-ilt="1718398207901" aria-hidden="false">

In [44]:
from sklearn.impute import KNNImputer     # import or read the package
KI=KNNImputer()                           # save the package
KI.fit_transform(d3[['Age']])               # Apply fit transform
# we have to reshape the data or use double square brackets

array([[32.],
       [21.],
       [32.],
       [43.]])