In [2]:
import numpy as np
import pandas as pd

In [13]:
# np.nan is NumPy's "not a number" marker for missing values. Before computing
# on an array we usually want to filter these entries out or fix them.
# Here they are removed with a boolean mask: np.logical_not(np.isnan(...)) is the
# spelled-out form of ~np.isnan(...). np.isfinite is an alternative that will also detect np.inf.
example = np.array([1, 2, 3, np.nan, np.nan, 4])
example[np.logical_not(np.isnan(example))]


array([1., 2., 3., 4.])

In [6]:
# We can also use pandas to detect null values, with isnull or isna
pd.isnull(np.nan)

True

In [15]:
# The same null check also works on Series and DataFrames: every cell is
# tested individually and a boolean frame of the same shape comes back.
pd.DataFrame({
    'Column A': [1, np.nan, 7],
    'Column B': [np.nan, 2, 3],
    'Column C': [np.nan, 2, np.nan]
}).isnull()

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


In [18]:
# Pandas will ignore null items when doing calculations, unlike numpy;
# count() here reports only the non-null entries of the three-element Series
pd.Series([1, 2, np.nan]).count()

2

In [19]:
# Start from a Series that contains null values and ask pandas which entries are null.
# The boolean mask returned here is the first step towards getting rid of them.
s = pd.Series([1, 2, 3, np.nan, np.nan, 4])
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [20]:
# Display only the non-null values of the series by indexing it with the notnull() mask
s.loc[s.notnull()]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [21]:
# dropna() displays the series with all null values dropped, but s itself will still have them.
# To actually change s, reassign the result: s = s.dropna()
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [24]:
# Example frame with missing values: Columns A, B and C each contain at least one NaN,
# while Column D is fully populated
null_df = pd.DataFrame({
    'Column A': [1, np.nan, 30, np.nan],
    'Column B': [2, 8, 31, np.nan],
    'Column C': [np.nan, 9, 32, 100],
    'Column D': [5, 8, 34, 110],
})

In [26]:
# Calling info() on a dataframe shows the non-null count of every column, which tells you how many nulls there are;
# it is also helpful when you want the other information it prints (dtypes, memory usage)
null_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  3 non-null      float64
 2   Column C  3 non-null      float64
 3   Column D  4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


In [None]:
# or you can just count up the null values (by summing the boolean isnull() mask) if only looking for nulls
null_df.isnull().sum()
