# Handling missing values in R


* is.na: returns a logical vector (or matrix) marking which values are NA
* complete.cases: returns a logical vector marking which rows have no missing values
* na.omit: returns the data with every row that contains a missing value removed

### Advanced methods such as multiple imputation work in three stages:

* Bootstrap
* Analyse
* Combine

* Package name: Amelia

In [26]:
# Remote location of the mammal sleep dataset and the local file it is cached in
url <- "https://raw.githubusercontent.com/genomicsclass/dagdata/master/inst/extdata/msleep_ggplot2.csv"
filename <- "msleep_ggplot2.csv"

# Download only when no local copy exists yet. download.file() ships with
# base R (package utils); the earlier download() call requires the
# 'downloader' package, which this notebook never loads.
if (!file.exists(filename)) {
  download.file(url, filename)
}

# Read the cached CSV into a data frame and preview the first rows
msleep <- read.csv(filename)
head(msleep)

name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
Cheetah,Acinonyx,carni,Carnivora,lc,12.1,,,11.9,,50.0
Owl monkey,Aotus,omni,Primates,,17.0,1.8,,7.0,0.0155,0.48
Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4,2.4,,9.6,,1.35
Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9,2.3,0.1333333,9.1,0.00029,0.019
Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.6666667,20.0,0.423,600.0
Three-toed sloth,Bradypus,herbi,Pilosa,,14.4,2.2,0.7666667,9.6,,3.85


## Check the whole dataset for missing values

In [3]:

# Logical matrix flagging every missing cell; show its first six rows
head(is.na(msleep), n = 6L)

name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
False,False,False,False,False,False,True,True,False,True,False
False,False,False,False,True,False,False,True,False,False,False
False,False,False,False,False,False,False,True,False,True,False
False,False,False,False,False,False,False,False,False,False,False
False,False,False,False,False,False,False,False,False,False,False
False,False,False,False,True,False,False,False,False,True,False


## Replace missing values (NA) with the mean

In [30]:
# Mean imputation: every missing brain weight becomes the average of the
# observed (non-NA) brain weights.
msleep[["brainwt"]] <- replace(
  msleep[["brainwt"]],
  is.na(msleep[["brainwt"]]),
  mean(msleep[["brainwt"]], na.rm = TRUE)
)
head(msleep)


name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
Cheetah,Acinonyx,carni,Carnivora,lc,12.1,,,11.9,0.2815814,50.0
Owl monkey,Aotus,omni,Primates,,17.0,1.8,,7.0,0.0155,0.48
Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4,2.4,,9.6,0.2815814,1.35
Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9,2.3,0.1333333,9.1,0.00029,0.019
Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.6666667,20.0,0.423,600.0
Three-toed sloth,Bradypus,herbi,Pilosa,,14.4,2.2,0.7666667,9.6,0.2815814,3.85


## Check whether a column contains any missing value

In [35]:
# TRUE when the `order` column holds at least one missing value
anyNA(msleep[["order"]])

## Get the dataset without records with NA

In [38]:
# Keep the rows in which every column is observed, then preview them
head(msleep[complete.cases(msleep), , drop = FALSE])

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
4,Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9,2.3,0.1333333,9.1,0.00029,0.019
5,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.6666667,20.0,0.423,600.0
7,Northern fur seal,Callorhinus,carni,Carnivora,vu,8.7,1.4,0.3833333,15.3,0.2815814,20.49
9,Dog,Canis,carni,Carnivora,domesticated,10.1,2.9,0.3333333,13.9,0.07,14.0
12,Guinea pig,Cavis,herbi,Rodentia,domesticated,9.4,0.8,0.2166667,14.6,0.0055,0.728
14,Chinchilla,Chinchilla,herbi,Rodentia,domesticated,12.5,1.5,0.1166667,11.5,0.0064,0.42


## Find records with at least one missing value

In [39]:

# Keep the rows that have at least one missing column, then preview them
head(msleep[!complete.cases(msleep), , drop = FALSE])

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
1,Cheetah,Acinonyx,carni,Carnivora,lc,12.1,,,11.9,0.2815814,50.0
2,Owl monkey,Aotus,omni,Primates,,17.0,1.8,,7.0,0.0155,0.48
3,Mountain beaver,Aplodontia,herbi,Rodentia,nt,14.4,2.4,,9.6,0.2815814,1.35
6,Three-toed sloth,Bradypus,herbi,Pilosa,,14.4,2.2,0.7666667,9.6,0.2815814,3.85
8,Vesper mouse,Calomys,,Rodentia,,7.0,,,17.0,0.2815814,0.045
10,Roe deer,Capreolus,herbi,Artiodactyla,lc,3.0,,,21.0,0.0982,14.8


## Create a new dataset with just the complete records

In [40]:
# Build a new data frame holding only the complete records:
# na.omit() drops every row that contains at least one NA.
new_data <- na.omit(msleep)

# Preview the first rows of the cleaned data (a bare assignment prints nothing)
head(new_data)

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
4,Greater short-tailed shrew,Blarina,omni,Soricomorpha,lc,14.9,2.3,0.1333333,9.1,0.00029,0.019
5,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.6666667,20.0,0.423,600.0
7,Northern fur seal,Callorhinus,carni,Carnivora,vu,8.7,1.4,0.3833333,15.3,0.2815814,20.49
9,Dog,Canis,carni,Carnivora,domesticated,10.1,2.9,0.3333333,13.9,0.07,14.0
12,Guinea pig,Cavis,herbi,Rodentia,domesticated,9.4,0.8,0.2166667,14.6,0.0055,0.728
14,Chinchilla,Chinchilla,herbi,Rodentia,domesticated,12.5,1.5,0.1166667,11.5,0.0064,0.42


## Advanced methods such as multiple imputation work in three stages:
* Bootstrap
* Analyse
* Combine

### Amelia is one such package that performs multiple imputation

In [42]:
# One-time setup: uncomment the next line to install Amelia from the CRAN cloud mirror
#install.packages("Amelia", repos = "https://cloud.r-project.org/")
# Attach Amelia so its functions are available in this session
library(Amelia)

Installing package into 'C:/Users/Amit/Documents/R/win-library/3.3'
(as 'lib' is unspecified)


package 'Amelia' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Amit\AppData\Local\Temp\RtmpMTOt94\downloaded_packages
