In [1]:
# importing pandas as pd 
import pandas as pd 
  
# importing numpy as np 
import numpy as np 

## create a DataFrame

In [2]:
# column name -> list of values; np.nan marks a missing entry
data_dict = dict(
    First=[100, 90, np.nan, 95],
    Second=[30, 45, 56, np.nan],
    Third=[np.nan, 40, 80, 98],
)

# build the DataFrame: every key becomes a column, every list its values
data_df = pd.DataFrame(data=data_dict)

In [3]:
# preview the first rows of the DataFrame
data_df.head()

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


In [4]:
# size of the DataFrame as (number of rows, number of columns)
data_df.shape

(4, 3)

## Detecting missing values 

## A value in a pandas DataFrame that is `NaN` represents a missing value

In [5]:
# check every cell of the DataFrame for a missing value
# isna() returns True where the cell is NaN and False where a value is present
data_df.isna()

Unnamed: 0,First,Second,Third
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


In [6]:
# the inverse check: flag every cell of the DataFrame that holds a value
# notna() returns False where the cell is NaN and True where a value is present
data_df.notna()

Unnamed: 0,First,Second,Third
0,True,True,False
1,True,True,True
2,False,True,True
3,True,False,True


In [7]:
# the same check on a single column gives a boolean Series (one flag per row)
data_df['First'].notna()

0     True
1     True
2    False
3     True
Name: First, dtype: bool

In [8]:
# column-wise check: does each column contain at least one NaN?
# any() returns True for a column if any value in that column is True
# the result is a pandas Series: a one-dimensional labelled data structure
# which can hold data such as strings, integers and even other Python objects.
# It is the primary data structure to hold one-dimensional data in pandas.
data_df.isna().any()

First     True
Second    True
Third     True
dtype: bool

In [9]:
# turn the per-column "has any NaN" Series into a one-column DataFrame
df2 = data_df.isna().any().to_frame()

In [10]:
df2

Unnamed: 0,0
First,True
Second,True
Third,True


In [11]:
# any() returns True as soon as at least one value is True
# and False only when every value is False, as in this example
pd.Series([False,False]).any()

False

In [12]:
# one of the values is True, so any() returns True
pd.Series([True,False]).any()

True

# Filtering data with missing values

In [13]:
data_df

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


In [14]:
# boolean mask: True for the rows where 'First' is missing
data_df['First'].isna()

0    False
1    False
2     True
3    False
Name: First, dtype: bool

In [15]:
# use the boolean mask as a row selector: keep the rows where 'First' is NaN
data_df.loc[data_df['First'].isna()]

Unnamed: 0,First,Second,Third
2,,56.0,80.0


In [16]:
# keep the rows where 'First' is NOT missing by inverting the mask with ~
# (only 'First' is checked, so the other columns may still contain NaN)
data_df.loc[~data_df['First'].isna()]

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
3,95.0,,98.0


In [17]:
# same selection as with ~isna(), written with notna():
# rows where 'First' holds a value (other columns may still contain NaN)
data_df.loc[data_df['First'].notna()]

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
3,95.0,,98.0


## drop rows with `NaN`

In [18]:
# drop every row that has at least one `NaN`
# the result is a new DataFrame; data_df itself keeps all of its rows
data_df.dropna(axis=0, how='any')

Unnamed: 0,First,Second,Third
1,90.0,45.0,40.0


In [19]:
# drop() removes a column by name, regardless of its content;
# the result is a new DataFrame and data_df still has the 'First' column
data_df.drop(columns=['First'])

Unnamed: 0,Second,Third
0,30.0,
1,45.0,40.0
2,56.0,80.0
3,,98.0


In [20]:
# keep only the complete rows and store them under a new name,
# because dropna() returns a new DataFrame instead of changing data_df
clean_df = data_df.dropna(axis='index', how='any')
clean_df

Unnamed: 0,First,Second,Third
1,90.0,45.0,40.0


In [21]:
data_df

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


## drop columns with `NaN`

In [22]:
# this returns a new DataFrame that keeps only the columns without any missing value
# note: every column of our DataFrame contains a NaN, so no column survives
# and only the row index is left
data_df.dropna(axis=1, how='any')

0
1
2
3


## drop if all values of rows/columns are `NaN`

In [23]:
# how='all' drops a row only if every value in it is NaN
# no row here is completely empty, so nothing is removed
data_df.dropna(how = 'all') 

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


In [24]:
# add a new column that contains only `NaN`
# (this assignment modifies data_df itself)
data_df['Fourth'] = np.nan
data_df

Unnamed: 0,First,Second,Third,Fourth
0,100.0,30.0,,
1,90.0,45.0,40.0,
2,,56.0,80.0,
3,95.0,,98.0,


In [25]:
# axis=1 with how='all' drops the columns in which every value is NaN,
# which removes 'Fourth'; data_df is rebound to the result
data_df = data_df.dropna(axis=1, how = 'all') 
data_df

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


## compute percentage of  `NaN` in every column

In [26]:
# count the NaN values in every column (True counts as 1 when summed)
data_df.isna().sum()

First     1
Second    1
Third     1
dtype: int64

In [27]:
# fraction of missing values in every column (0.25 means 25% of the rows are NaN)
# the mean of the boolean isna() mask equals number_of_NaN / number_of_rows
percent_missing = data_df.isna().mean()
percent_missing

First     0.25
Second    0.25
Third     0.25
dtype: float64

In [28]:
# a column can also be dropped by name; this returns a new DataFrame
# and leaves data_df unchanged
data_df.drop(columns=['First'])

Unnamed: 0,Second,Third
0,30.0,
1,45.0,40.0
2,56.0,80.0
3,,98.0


## Fill missing values with constant

In [29]:
# replace every NaN in all columns with the constant -1
data_df.fillna(-1)

Unnamed: 0,First,Second,Third
0,100.0,30.0,-1.0
1,90.0,45.0,40.0
2,-1.0,56.0,80.0
3,95.0,-1.0,98.0


In [30]:
# fill a single column; because the fill value is a string,
# the resulting Series has dtype object instead of float
data_df['First'].fillna('First Missing')

0            100.0
1             90.0
2    First Missing
3             95.0
Name: First, dtype: object

# note that fillna doesn't change the object. it returns a new dataframe/series with the updated filled values

In [31]:
data_df

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


In [32]:
# keep the filled result under a new name; data_df itself is not modified
data_df_filled = data_df.fillna(-1)
data_df_filled

Unnamed: 0,First,Second,Third
0,100.0,30.0,-1.0
1,90.0,45.0,40.0
2,-1.0,56.0,80.0
3,95.0,-1.0,98.0


In [33]:
# the original DataFrame is still unchanged and still contains its NaN values
data_df

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


## You can use the `inplace=True` argument to update the same DataFrame.

In [34]:
# data_df.fillna(-1, inplace=True)
# running the line above would mutate data_df itself;
# it is left commented out so that the following cells still see the missing values

## fill by previous/next value

In [35]:
data_df

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


In [36]:
# forward filling: replace each NaN with the previous value of the same column
# notice that a NaN in the first row cannot be handled using this method as
# there is no previous value for it
# ffill() is used because fillna(method='ffill') is deprecated in recent pandas versions
data_df.ffill()

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
2,90.0,56.0,80.0
3,95.0,56.0,98.0


In [37]:
# backward filling: replace each NaN with the next value of the same column
# notice that a NaN in the last row cannot be handled using this method as
# there is no next value for it
# bfill() is used because fillna(method='bfill') is deprecated in recent pandas versions
data_df.bfill()

Unnamed: 0,First,Second,Third
0,100.0,30.0,40.0
1,90.0,45.0,40.0
2,95.0,56.0,80.0
3,95.0,,98.0


In [38]:
# combine both directions: backward fill first, then forward fill whatever is
# still missing (the NaN in the last row that bfill could not handle)
data_df.bfill().ffill()

Unnamed: 0,First,Second,Third
0,100.0,30.0,40.0
1,90.0,45.0,40.0
2,95.0,56.0,80.0
3,95.0,56.0,98.0


## handle missing values for columns differently

In [39]:
# work on a copy of the data so that data_df keeps its missing values
data_copy = data_df.copy()
data_copy

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


In [40]:
# handle each column with its own strategy and assign the result back to the column;
# calling fillna(..., inplace=True) on a selected column is chained assignment, which
# warns in recent pandas versions and does not update data_copy under Copy-on-Write
data_copy['First'] = data_copy['First'].fillna(-1)       # constant
data_copy['Second'] = data_copy['Second'].bfill()        # next value (last row stays NaN)
data_copy['Third'] = data_copy['Third'].fillna('Third')  # string label, column becomes object
data_copy

Unnamed: 0,First,Second,Third
0,100.0,30.0,Third
1,90.0,45.0,40.0
2,-1.0,56.0,80.0
3,95.0,,98.0


## Fill missing values using interpolation

In [41]:
data_df

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
2,,56.0,80.0
3,95.0,,98.0


In [42]:
# note that the third value of the first column becomes 92.5 which is the average of 90 and 95
# with limit_direction="forward" the trailing NaN of 'Second' takes the last valid value (56),
# while the leading NaN of 'Third' stays NaN
data_df.interpolate(method ='linear',limit_direction="forward") 

Unnamed: 0,First,Second,Third
0,100.0,30.0,
1,90.0,45.0,40.0
2,92.5,56.0,80.0
3,95.0,56.0,98.0


## Fill missing values by the mean

In [None]:
data_df

In [None]:
# mean() computes the mean of every column
# `NaN` values are not considered in the computation
data_df.mean()

In [None]:
type(data_df.mean())

In [None]:
# pass the Series of column means to fillna so each column is filled with its own mean
data_df.fillna(data_df.mean())

In [None]:
# round to 2 decimals for a more readable display
data_df.fillna(data_df.mean()).round(2)

## fill missing with most frequent

In [None]:
# small example with a missing value in a text column and in a numeric column
df = pd.DataFrame({
    'species': ['bird', np.nan, 'arthropod', 'bird'],
    'legs': [2, 4, 8, 2],
    'wings': [2, 2, 0, np.nan],
})

In [None]:
df

In [None]:
# get the mode (most frequent value) of every column
df.mode()

In [None]:
type(df.mode())

In [None]:
# a single row of the mode DataFrame is a Series
# row 0 is used because later rows only exist for tied modes and are NaN-padded elsewhere
type(df.mode().iloc[0])

## Note that the mode is a DataFrame

In [None]:
# mode() returns a DataFrame: its first row holds the most frequent value of each column,
# while further rows only exist when a column has tied modes and are NaN-padded elsewhere,
# so fill the missing values from the first row (iloc[0]), not the last one
df.fillna(df.mode().iloc[0])

# Fill missing values using advanced techniques
# k-NN algorithm

In [None]:
#knn k-nearest neighbor

In [None]:
# all-numeric example for the k-NN imputer, with one NaN in 'species' and one in 'wings'
df = pd.DataFrame({
    'species': [1, 2, np.nan, 2],
    'legs': [5, 8, 5, 8],
    'wings': [3, 0, 3, np.nan],
})

In [None]:
df

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# imputer configured to use a single nearest neighbour (n_neighbors=1)
model = KNNImputer(n_neighbors=1)

In [None]:
# fit the imputer on df and return the data with the missing values filled in
model.fit_transform(df)

In [None]:
type(model.fit_transform(df))

In [None]:
model.fit_transform(df).shape

In [None]:
# wrap the imputed values back into a DataFrame, reusing the column labels and
# the index of the input instead of hard-coding them
df = pd.DataFrame(data=model.fit_transform(df), columns=df.columns, index=df.index)

In [None]:
df