# 7.1 Handling Missing data


For numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data. It is a _sentinel value_: a special value that marks an entry as missing and can be detected easily.


In [5]:
import pandas as pd
import numpy as np
# A Series of strings with one missing entry (np.nan) at position 2.
string_data = pd.Series(['aardvark', 'articoke', np.nan, 'avocado'])
print(string_data)
# Boolean mask: True wherever the Series holds a missing value.
missing_mask = string_data.isnull()
print("\n", missing_mask)

0    aardvark
1    articoke
2         NaN
3     avocado
dtype: object

 0    False
1    False
2     True
3    False
dtype: bool


## 7.1.1 Filtering Out Missing Data


In [7]:
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
# Two equivalent ways to discard the missing entries. Only the last
# expression of the cell is displayed.
data.dropna()
present = data.notnull()
data[present]

0    1.0
2    3.5
4    7.0
dtype: float64

In [None]:
rows = [
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
]
data = pd.DataFrame(rows)
# With its default arguments, dropna() discards every row that contains at
# least one NaN; `data` itself is left untouched.
cleaned = data.dropna()
data


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [11]:
# Display the result of data.dropna(): only the rows without any NaN remain.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Passing `how='all'` drops only the rows in which every value is NA.


In [None]:
# Drop only the rows in which every value is NaN. The result is rebound to
# `data` rather than produced with inplace=True, so the frame is not mutated
# in place and the call stays chainable.
data = data.dropna(how='all')
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [20]:
# Four rows whose last column (label 2) is entirely NaN; column 1 has a single
# NaN in row 2.
data = pd.DataFrame(
    [
        [1., 6.5, NA],
        [1., 0.2, NA],
        [5., NA, NA],
        [3., 6.5, NA],
    ]
)
data


Unnamed: 0,0,1,2
0,1.0,6.5,
1,1.0,0.2,
2,5.0,,
3,3.0,6.5,


In [23]:
# Drop the columns in which every value is NaN. The result is rebound to
# `data` rather than produced with inplace=True, so the frame is not mutated
# in place.
data = data.dropna(how='all', axis=1)
data

Unnamed: 0,0,1
0,1.0,6.5
1,1.0,0.2
2,5.0,
3,3.0,6.5


In [4]:
import pandas as pd
import numpy as np
from numpy import nan as NA
# 7x3 frame of standard-normal draws (unseeded, so the values differ per run).
noise = np.random.randn(7, 3)
df = pd.DataFrame(noise)
# Blank out the first four entries of column 1 and the first two of column 2.
df.iloc[0:4, 1] = NA
df.iloc[0:2, 2] = NA
df

Unnamed: 0,0,1,2
0,2.285329,,
1,0.797298,,
2,2.852005,,0.488233
3,-1.092464,,1.015048
4,1.753886,-0.543476,-0.433276
5,0.473905,-1.184073,-2.643359
6,-0.53539,-1.252514,0.179953


In [5]:
# Default dropna(): show only the rows that contain no NaN at all. The result
# is not assigned, so df itself keeps its missing values.
df.dropna()

Unnamed: 0,0,1,2
4,1.753886,-0.543476,-0.433276
5,0.473905,-1.184073,-2.643359
6,-0.53539,-1.252514,0.179953


## 7.1.2 Filling in Missing Data


In [6]:
# Replace every NaN with the constant 0. The filled frame is only displayed;
# df is not reassigned.
df.fillna(0)

Unnamed: 0,0,1,2
0,2.285329,0.0,0.0
1,0.797298,0.0,0.0
2,2.852005,0.0,0.488233
3,-1.092464,0.0,1.015048
4,1.753886,-0.543476,-0.433276
5,0.473905,-1.184073,-2.643359
6,-0.53539,-1.252514,0.179953


### Forward-filling missing values


In [6]:

import pandas as pd
import numpy as np
from numpy import nan as NA
# Forward-fill demo: each NaN takes the most recent non-missing value above it.
data = {
    'A': [1, None, None, 4, None, 6],
}
df = pd.DataFrame(data)
print("Before fillna:")
print(df)

# fillna(method='ffill') is deprecated and emits a FutureWarning on recent
# pandas; DataFrame.ffill() is the supported equivalent. The result is rebound
# to df instead of using inplace=True.
df = df.ffill()
print("\nAfter fillna(method='ffill'):")
print(df)

Before fillna:
     A
0  1.0
1  NaN
2  NaN
3  4.0
4  NaN
5  6.0

After fillna(method='ffill'):
     A
0  1.0
1  1.0
2  1.0
3  4.0
4  4.0
5  6.0


  df.fillna(method='ffill', inplace=True)


In [None]:
# 6x3 frame of random values; column 1 is missing from row 2 onward and
# column 2 from row 4 onward.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
# limit=2 fills at most two consecutive NaNs in each gap, per column: column 1
# gets rows 2-3 filled while rows 4-5 stay NaN; column 2 is filled completely.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
filled = df.ffill(limit=2)
filled

  df.fillna(method='ffill', limit=2)


Unnamed: 0,0,1,2
0,-0.787039,-0.861225,1.137872
1,0.377159,-0.320538,-0.299197
2,-0.901053,-0.320538,-0.158977
3,0.003285,-0.320538,-0.618229
4,1.544515,,-0.618229
5,0.411932,,-0.618229


In [None]:
data = pd.Series([1., NA, 3.5, NA, 7])
# mean() skips NaN by default, so the gaps are filled with the average of the
# observed values.
observed_mean = data.mean()
data.fillna(observed_mean)

# 7.2 Data Transformation


### 7.2.1 Removing Duplicates


In [10]:
# Seven rows: k1 alternates 'one'/'two' and ends with an extra 'two'.
k1_values = ['one', 'two'] * 3 + ['two']
k2_values = [1, 1, 2, 3, 3, 4, 4]
data = pd.DataFrame({'k1': k1_values, 'k2': k2_values})
data['v1'] = range(7)
# Deduplicate on k1 alone: the first row seen for each k1 value is kept.
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [11]:
# The (k1, k2) pair ('two', 4) appears at index 5 and again at index 6. By default drop_duplicates keeps the first occurrence (index 5); with keep='last' it keeps index 6 and drops index 5 instead.

data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6
