# Dealing with missing data

In [1]:
# Importing the library
import pandas as pd
import numpy as np

#Some functions to deal with missing data
def check_null(data):
    """
    Count the missing values in every column that has at least one.

    data: data frame
    returns: Series mapping each affected column name to its number of
             null entries; columns without nulls are left out
    """
    missing = data.isna()
    # Keep only the columns where at least one entry is flagged as null.
    return missing.loc[:, missing.any()].sum()

def display_null(data,column):
    """
    Select the rows of `data` whose `column` entry is missing.

    data: data frame
    column: column name in str
    returns: data frame holding only the rows where `column` is null
    """
    return data[data[column].isna()]

def display_not_null(data,column):
    """
    Select the rows of `data` whose `column` entry is present.

    data: data frame
    column: column name in str
    returns: data frame holding only the rows where `column` is not null
    """
    return data[data[column].notna()]

def column_is_null(data,column):
    """
    Print how many null values `column` contains.

    data: data frame
    column: column name in str
    raises: KeyError if `column` is not a column of `data`
    """
    # Look the column up directly instead of scanning every header. A missing
    # column now fails with an explicit KeyError rather than an
    # UnboundLocalError caused by the message never being built.
    if column not in data.columns:
        raise KeyError(f"Column '{column}' not found in data frame")
    sum_of_null = data[column].isna().sum()
    print(f"Null values for '{column}': {sum_of_null}")

You can obtain the data here: https://drive.google.com/open?id=1PWvHNhNDpckBHsUCLD1WhJyeBdVk7mlw <br>
The tutorial this notebook follows can be found here: https://www.geeksforgeeks.org/working-with-missing-data-in-pandas/

In [2]:
# Read the employees CSV into a data frame and preview the first rows.
# NOTE: this is a machine-specific absolute path; point it at your own
# downloaded copy of employees.csv.
DATA_PATH = '/home/afrioni/data_science/employees.csv'
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services


In [3]:
# Count the null values in each column that contains at least one
check_null(data)

First Name            67
Gender               145
Senior Management     67
Team                  43
dtype: int64

In [4]:
# Show the first rows whose 'Team' value is missing (NaN)
display_null(data,'Team').head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
10,Louise,Female,8/12/1980,9:01 AM,63241,15.132,True,
23,,Male,6/14/2012,4:19 PM,125792,5.042,,
32,,Male,8/21/1998,2:27 PM,122340,6.417,,
91,James,,1/26/2005,11:00 PM,128771,8.309,False,


In [5]:
# Show the first rows whose 'Team' value is present (not NaN)
display_not_null(data,'Team').head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services
5,Dennis,Male,4/18/1987,1:35 AM,115163,10.125,False,Legal


## Filling null values

### Filling null values with single value

In [6]:
# Replace missing team names with the placeholder "No Team".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call
# acts on a temporary object under pandas Copy-on-Write and would leave
# `data` unchanged (pandas 2.x already warns about it).
data["Team"] = data["Team"].fillna("No Team")

# Confirm that 'Team' no longer contains null values
column_is_null(data,'Team')

Null values for 'Team': 0
