<h1> Pandas - Missing Data </h1>

<h2> How to handle missing data in Pandas DataFrames</h2>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Sample data keyed by column name; 'A' and 'B' contain NaN (missing) entries, 'C' is complete
d = {
    'A': [1, 2, np.nan],
    'B': [5, np.nan, np.nan],
    'C': [1, 2, 3],
}

In [3]:
d   # Display the dictionary to confirm its keys and the NaN placeholders

{'A': [1, 2, nan], 'B': [5, nan, nan], 'C': [1, 2, 3]}

In [6]:
# Build a DataFrame from the dictionary d: each key becomes a column
df = pd.DataFrame(data=d)

In [7]:
df  # Display the DataFrame; missing entries show up as NaN

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


<h3> Drop NA Method</h3>

In [8]:
# Drop every column that contains at least one NaN value.
# axis='columns' is the named equivalent of axis=1.
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [9]:
# Drop every row that contains at least one NaN value.
# axis=0 (rows) and how='any' are the defaults, written out here for clarity.
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [10]:
df.dropna(thresh=2)     # Keep only the rows that have at least 2 non-NaN values
                        # (thresh = minimum number of non-NaN values a row needs to be kept).
                        # Row 0 is kept (3 non-NaN values),
                        # row 1 is kept (2 non-NaN values: A and C),
                        # row 2 is dropped (only 1 non-NaN value: C).

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


<h3>Fill NA Method</h3>

<h4> Used to replace missing values</h4>

In [11]:
# Replace every NaN in the DataFrame with a constant.
# Usage: df.fillna(some_value) -- here the constant is the string 'FILL Value'.
# Note: putting a string into numeric columns leaves those columns with mixed types.
df.fillna('FILL Value')

Unnamed: 0,A,B,C
0,1,5,1
1,2,FILL Value,2
2,FILL Value,FILL Value,3


In [12]:
# Select column 'A' as a Series (all rows); note the NaN in the last position
df.loc[:, 'A']

0    1.0
1    2.0
2    NaN
Name: A, dtype: float64

In [13]:
# Fill the NaN in column 'A' with that column's mean.
# mean() skips NaN by default, so the mean of [1.0, 2.0, NaN] is 1.5,
# and the missing entry in row 2 is replaced by 1.5.
df['A'].fillna(df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64