# Pandas Missing Values

## Imports and data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Start from a 10x3 frame of ones, then blank out selected cells per column.
row_labels = list('abcdefghij')
df = pd.DataFrame(np.ones((10, 3)), index=row_labels, columns=['one', 'two', 'three'])

# Column -> row labels whose value is set to NaN.
missing_cells = {
    'one': ['a', 'c', 'f', 'g'],
    'two': ['a', 'd', 'f', 'g', 'h'],
    'three': ['e', 'f', 'h'],
}
for column, rows in missing_cells.items():
    df.loc[rows, column] = np.nan

# The string column mixes real values, empty strings and None.
df['state'] = ['CA', '', None, 'OR', 'WA', None, '', 'WA', 'OR', None]

Here is the `DataFrame` we will work with:

In [3]:
# Render the whole frame as this cell's output.
df

Unnamed: 0,one,two,three,state
a,,,1.0,CA
b,1.0,1.0,1.0,
c,,1.0,1.0,
d,1.0,,1.0,OR
e,1.0,1.0,,WA
f,,,,
g,,,1.0,
h,1.0,,,WA
i,1.0,1.0,1.0,OR
j,1.0,1.0,1.0,


Replace empty strings in the `state` column with `None`, so that its missing values are handled in a consistent manner:

In [4]:
#df.fillna?

In [5]:
# Turn empty strings in `state` into None so they count as missing values.
# `inplace` used to be passed positionally, which pandas >= 2.0 rejects
# (everything after `to_replace`/`value` is keyword-only); assigning the
# result back avoids `inplace` altogether and keeps the cell re-runnable.
df['state'] = df['state'].replace({'': None})

In [6]:
# No empty string may remain, and the former empty cells must now be None.
assert '' not in df['state'].unique()
for label in ('b', 'g'):
    assert df.loc[label, 'state'] is None

Create a new `DataFrame`, named `df2`, that has all rows with any missing values dropped:

In [7]:
# Keep only the rows that have no missing value in any column.
df2 = df.dropna(how='any')

In [8]:
# Exactly one row survives, and it is row 'i'.
assert df2.shape[0] == 1
assert 'i' in df2.index

Create a new `DataFrame`, named `df3`, from `df` by dropping rows that have only missing values:

In [9]:
def listNullRow(df):
    """Return the index labels of the rows in which every value is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    list
        Index labels, in row order, of the rows whose cells are all null
        (NaN or None). Rows with at least one real value are not listed.
    """
    # The previous loop flagged a row as soon as it found *no* null cell,
    # i.e. it returned the fully populated rows -- the opposite of what the
    # name promises. `all(axis=1)` is True only when every cell is null.
    all_missing = df.isnull().all(axis=1)
    return list(all_missing[all_missing].index)

In [10]:
def dropNullRow(df):
    """Return a new frame without the rows whose values are all missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` minus every row in which all cells are null. Rows that hold
        at least one real value are kept. If no row qualifies, the result
        has the same rows as ``df``.
    """
    # The earlier loop called `df.drop(index)` on the original frame each
    # time, so only the last label was ever removed, and it raised
    # UnboundLocalError when there was nothing to drop. `dropna(how='all')`
    # removes all qualifying rows in one step and handles the empty case.
    return df.dropna(how='all')

In [11]:
# Intended to yield df without its all-missing rows. Note that df3 is
# rebound again by the cells that follow.
df3 = dropNullRow(df)

In [12]:
# NOTE(review): this binds df3 to the very same object as df (no copy is made),
# replacing the value assigned in the previous cell.
df3 = df

In [13]:
#simple alternative w/ list comprehensions
dft = df.isnull()
L = [row for row in dft.index if dft.loc[row].any() == False]
df3 = df.drop(L)
df3

Unnamed: 0,one,two,three,state
a,,,1.0,CA
b,1.0,1.0,1.0,
c,,1.0,1.0,
d,1.0,,1.0,OR
e,1.0,1.0,,WA
f,,,,
g,,,1.0,
h,1.0,,,WA
j,1.0,1.0,1.0,


In [14]:
# One of the ten rows must have been removed.
assert df3.shape[0] == 9

Create a new `DataFrame`, named `df4`, from `df` that has all columns with fewer than 7 actual (non-missing) values dropped:

In [15]:
def cmp_missing(df, n):
    """List the columns of ``df`` that hold too few non-missing values.

    A column is reported when its number of non-null entries is strictly
    below ``n + 1`` (for an integer ``n`` that means: at most ``n``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are examined; it is not modified.
    n : number
        Threshold as described above.

    Returns
    -------
    list
        Labels of the qualifying columns, in column order.
    """
    # Non-null entries per column, computed in one vectorised pass.
    present = df.notnull().sum()
    return [label for label, count in present.items() if count < n + 1]

In [16]:
# cmp_missing(df, 6) reports the columns with fewer than 6 + 1 non-null values;
# those columns are removed from a copy of df.
L4 = cmp_missing(df, 6)
df4 = df.drop(columns=L4)

In [17]:
# Only the column 'three' may remain.
assert df4.columns.tolist() == ['three']

Create a new `DataFrame`, named `df5`, from `df` that has only the numerical columns, with missing values replaced by the number -9.

In [18]:
# Show df again for reference before df5 is derived from it below.
df

Unnamed: 0,one,two,three,state
a,,,1.0,CA
b,1.0,1.0,1.0,
c,,1.0,1.0,
d,1.0,,1.0,OR
e,1.0,1.0,,WA
f,,,,
g,,,1.0,
h,1.0,,,WA
i,1.0,1.0,1.0,OR
j,1.0,1.0,1.0,


In [19]:
# Labels of the columns stored with the generic 'object' dtype (non-numerical).
L = [name for name, dtype in df.dtypes.items() if dtype == 'object']

In [20]:
# Show the object-dtype column labels collected in the previous cell.
L

['state']

In [21]:
# Remove the columns listed in L, leaving the remaining columns of df.
df5 = df.drop(columns=L)

In [22]:
# Substitute the sentinel -9 for every missing value.
df5 = df5.fillna(value=-9)

In [23]:
# Column set and per-column totals after filling with -9.
assert df5.columns.tolist() == ['one', 'two', 'three']
sums = df5.sum()
for name, total in (('one', -30.0), ('two', -40.0), ('three', -20.0)):
    assert sums[name] == total

Write a function `count_null` that takes a `Series` and returns an integer-valued count of the number of null values in the `Series`:

In [24]:
def count_null(column):
    """Return, as a plain int, how many entries of a Series are missing."""
    # isnull() marks each missing entry with True; summing counts them.
    return int(column.isnull().sum())

In [25]:
# Quick check: number of missing values in column 'one'.
count_null(df.one)

4

In [26]:
# Boolean mask of column 'one': True where the value is missing.
df.one.isnull()

a     True
b    False
c     True
d    False
e    False
f     True
g     True
h    False
i    False
j    False
Name: one, dtype: bool

In [27]:
# Expected number of missing values per column of df.
expected_nulls = {'one': 4, 'two': 5, 'three': 3, 'state': 5}
for name, expected in expected_nulls.items():
    assert count_null(df[name]) == expected