# 🧨1. Handling Missing Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# A Series of strings with two kinds of missing markers: np.nan and None.
string_data = pd.Series(
    ['A', 'Axe', np.nan, None],
)
string_data

0       A
1     Axe
2     NaN
3    None
dtype: object

In [None]:
# isna() returns a boolean mask: True where the value is missing.
# Both np.nan and None count as missing.
string_data.isna()

0    False
1    False
2     True
3     True
dtype: bool

In [None]:
# The same holds for numeric data: with dtype='float64' the None entry
# is stored as NaN.
Float_data = pd.Series(
    [10, 2, 30, None],
    dtype='float64',
)
Float_data

0    10.0
1     2.0
2    30.0
3     NaN
dtype: float64

In [None]:
# isna() works the same way on a float Series.
Float_data.isna()

0    False
1    False
2    False
3     True
dtype: bool

#     🧲1.1 Filtering Out Missing Data

In [17]:
# Numeric Series where np.nan and None both end up as NaN.
Data_1 = pd.Series(
    [1, np.nan, None, 10, 20],
)
Data_1

0     1.0
1     NaN
2     NaN
3    10.0
4    20.0
dtype: float64

In [18]:
# dropna() returns the Series without its NaN entries;
# the surviving values keep their original index labels.
Data_1.dropna()

0     1.0
3    10.0
4    20.0
dtype: float64

In [27]:
# Tabular data: a small DataFrame with gaps, including one row that is
# entirely missing.
Data = pd.DataFrame(
    [
        [1, 2, None],
        [33, 3, 4],
        [10, 1, None],
        [None, None, None],
    ],
    columns=['a', 'b', 'c'],
)
Data

Unnamed: 0,a,b,c
0,1.0,2.0,
1,33.0,3.0,4.0
2,10.0,1.0,
3,,,


In [28]:
# With no arguments, dropna() drops every row that contains at least one NaN.
Data.dropna()

Unnamed: 0,a,b,c
1,33.0,3.0,4.0


In [30]:
# how='all' drops only the rows in which every value is NaN.
Data.dropna(how='all')

Unnamed: 0,a,b,c
0,1.0,2.0,
1,33.0,3.0,4.0
2,10.0,1.0,


In [32]:
# Add a column 'd' that consists entirely of NaN.
Data['d'] = np.nan
Data

Unnamed: 0,a,b,c,d
0,1.0,2.0,,
1,33.0,3.0,4.0,
2,10.0,1.0,,
3,,,,


In [35]:
# axis='columns' applies the drop to columns instead of rows;
# combined with how='all', only columns that are entirely NaN are removed.
Data.dropna(axis='columns',how='all')

Unnamed: 0,a,b,c
0,1.0,2.0,
1,33.0,3.0,4.0
2,10.0,1.0,
3,,,


In [39]:
# 7x3 table of draws from the standard normal distribution
# (no seed is set, so the values differ on every run).
DF = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
DF

Unnamed: 0,0,1,2
0,-0.284996,-0.246761,-1.589196
1,-0.796725,1.045306,-0.988231
2,0.863336,2.304859,1.607592
3,0.516101,-0.58742,-0.465126
4,-0.898134,0.10482,0.375416
5,-1.298915,0.153315,-1.352704
6,-0.608071,-1.100281,-0.992892


In [43]:
# Overwrite some cells with NaN, selecting by position with iloc[rows, column]:
#   [:2, 1]  -> rows 0 and 1 of column 1
#   [4:5, 2] -> row 4 of column 2
DF.iloc[:2,1]=np.nan
DF.iloc[4:5,2]=np.nan
DF

Unnamed: 0,0,1,2
0,-0.284996,,-1.589196
1,-0.796725,,-0.988231
2,0.863336,2.304859,1.607592
3,0.516101,-0.58742,-0.465126
4,-0.898134,0.10482,
5,-1.298915,0.153315,-1.352704
6,-0.608071,-1.100281,-0.992892


In [44]:
# Drop every row that has at least one NaN.
DF.dropna()

Unnamed: 0,0,1,2
2,0.863336,2.304859,1.607592
3,0.516101,-0.58742,-0.465126
5,-1.298915,0.153315,-1.352704
6,-0.608071,-1.100281,-0.992892


In [47]:
# thresh=3 keeps only the rows that have at least 3 non-NaN values.
# DF has 3 columns, so here the result is the same as plain dropna().
DF.dropna(thresh=3)

Unnamed: 0,0,1,2
2,0.863336,2.304859,1.607592
3,0.516101,-0.58742,-0.465126
5,-1.298915,0.153315,-1.352704
6,-0.608071,-1.100281,-0.992892



# 🧲1.2 Filling In Missing Data

In [49]:
# Replace every NaN with a constant (0 here). In practice a statistic such as
# the column mean or median is a common choice of fill value.
# fillna() returns a new DataFrame; DF itself is not modified.
DF.fillna(0)

Unnamed: 0,0,1,2
0,-0.284996,0.0,-1.589196
1,-0.796725,0.0,-0.988231
2,0.863336,2.304859,1.607592
3,0.516101,-0.58742,-0.465126
4,-0.898134,0.10482,0.0
5,-1.298915,0.153315,-1.352704
6,-0.608071,-1.100281,-0.992892


In [51]:
# Set the same cells to NaN again (rows 0-1 of column 1, row 4 of column 2).
# NOTE(review): DF should already hold these NaNs, since fillna(0) does not
# modify DF in place -- presumably kept so this cell can be re-run on its own.
DF.iloc[:2,1]=np.nan
DF.iloc[4:5,2]=np.nan
DF

Unnamed: 0,0,1,2
0,-0.284996,,-1.589196
1,-0.796725,,-0.988231
2,0.863336,2.304859,1.607592
3,0.516101,-0.58742,-0.465126
4,-0.898134,0.10482,
5,-1.298915,0.153315,-1.352704
6,-0.608071,-1.100281,-0.992892


In [55]:
# Pass a dict to fillna() to use a different fill value per column:
# NaN in column 1 becomes 3, NaN in column 2 becomes 1.
# Compare with the previous DataFrame to see the change.
DF.fillna({1:3,2:1})

Unnamed: 0,0,1,2
0,-0.284996,3.0,-1.589196
1,-0.796725,3.0,-0.988231
2,0.863336,2.304859,1.607592
3,0.516101,-0.58742,-0.465126
4,-0.898134,0.10482,1.0
5,-1.298915,0.153315,-1.352704
6,-0.608071,-1.100281,-0.992892


In [58]:
# Forward fill: each NaN is replaced by the last valid value above it in the
# same column. Row 4 of column 2 takes the value from row 3; rows 0-1 of
# column 1 stay NaN because there is no earlier value to carry forward.
# ffill() is used because fillna(method='ffill') is deprecated since
# pandas 2.1 and emits a FutureWarning.
DF.ffill()

Unnamed: 0,0,1,2
0,-0.284996,,-1.589196
1,-0.796725,,-0.988231
2,0.863336,2.304859,1.607592
3,0.516101,-0.58742,-0.465126
4,-0.898134,0.10482,-0.465126
5,-1.298915,0.153315,-1.352704
6,-0.608071,-1.100281,-0.992892
