# DEALING WITH MISSING VALUES IN DATASET

#### Different ways of handling missing data in a dataset: (1) fillna (2) dropna (3) interpolate (4) replace function

In [6]:
import pandas as pd
# Load the raw weather observations; empty CSV fields show up as NaN.
df0 = pd.read_csv(filepath_or_buffer='./Data2/weather_data4fill.csv')
df0

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


<b> Note the following: </b><br>
(1) fillna : To fill missing values using different ways <br>
(2) dropna : To drop rows with missing values from the Dataset <br>
(3) interpolate : To make a guess of the missing values using interpolation <br>
(4) replace function : To replace the missing values with something else

In [12]:
# Selecting a single column of the DataFrame yields a Series.
type(df0['day'])

pandas.core.series.Series

In [13]:
# Each entry of the unparsed day column (e.g. 1/5/2017) is a plain Python str.
type(df0.loc[2, 'day'])

str

In [14]:
# The temperature column is a Series as well.
type(df0['temperature'])

pandas.core.series.Series

In [15]:
# A single temperature reading is a NumPy float.
type(df0.loc[4, 'temperature'])

numpy.float64

In [16]:
# The event column is a Series too.
type(df0['event'])

pandas.core.series.Series

In [17]:
# A non-missing event label is a plain Python str.
type(df0.loc[1, 'event'])

str

In [18]:
# Summarise every column: its non-null count and its dtype.
# Text columns are reported with dtype "object" (they hold Python strings).
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 4 columns):
day            9 non-null object
temperature    5 non-null float64
windspeed      5 non-null float64
event          7 non-null object
dtypes: float64(2), object(2)
memory usage: 416.0+ bytes


In [25]:
# Re-read the file, asking pandas to parse the day column into Timestamps.
df0 = pd.read_csv(
    './Data2/weather_data4fill.csv',
    parse_dates=['day'],
)
df0

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [26]:
# The day values are now pandas Timestamps instead of strings.
type(df0.loc[0, 'day'])

pandas._libs.tslibs.timestamps.Timestamp

In [27]:
# Make the day column the index of the dataset.
# Reassign instead of using inplace=True so the data lineage stays explicit.
df0 = df0.set_index('day')

In [28]:
# Show the frame with day as its index.
df0

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NaN entries are not counted, so count is 5 for both columns.
df0.describe()

Unnamed: 0,temperature,windspeed
count,5.0,5.0
mean,33.2,8.4
std,4.38178,2.302173
min,28.0,6.0
25%,32.0,7.0
50%,32.0,8.0
75%,34.0,9.0
max,40.0,12.0


In [30]:
# Replace every NaN with 0 in all columns.
# Missing events become 0 as well, which mixes numbers into a text column.
df1 = df0.fillna(value=0)
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,0
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,0
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [31]:
# Pass a dictionary to use a different fill value for each column.
df1 = df0.fillna(value={'temperature': 0, 'windspeed': 0, 'event': 'no event'})
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,no event
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,no event
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [44]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is the direct equivalent of fillna(method='ffill'), whose
# `method` argument is deprecated in recent pandas releases.
df1 = df0.ffill()
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [45]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() is the direct equivalent of fillna(method='bfill'), whose
# `method` argument is deprecated in recent pandas releases.
df1 = df0.bfill()
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,28.0,9.0,Sunny
2017-01-05,28.0,7.0,Snow
2017-01-06,32.0,7.0,Rain
2017-01-07,32.0,8.0,Rain
2017-01-08,34.0,8.0,Sunny
2017-01-09,34.0,8.0,Cloudy
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [47]:
# With axis='columns' the fill runs sideways within each row instead of down a column.
# Backward fill along columns: a NaN takes the value of the next non-missing
# column to its RIGHT (values move leftwards); a NaN in the last column stays NaN.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
df1 = df0.bfill(axis='columns')
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-04,9,9,Sunny
2017-01-05,28,Snow,Snow
2017-01-06,7,7,
2017-01-07,32,Rain,Rain
2017-01-08,Sunny,Sunny,Sunny
2017-01-09,,,
2017-01-10,34,8,Cloudy
2017-01-11,40,12,Sunny


In [48]:
# With axis='columns' the fill runs sideways within each row instead of down a column.
# Forward fill along columns: a NaN takes the value of the nearest non-missing
# column to its LEFT (values move rightwards); a NaN in the first column stays NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df1 = df0.ffill(axis='columns')
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,28.0,Snow
2017-01-06,,7.0,7
2017-01-07,32.0,32.0,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [49]:
# Forward fill with a limit: at most 1 consecutive NaN is filled after each
# valid value; any further NaN in the same gap is left untouched.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df1 = df0.ffill(limit=1)
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [50]:
# Forward fill with a limit: at most 2 consecutive NaN values are filled after
# each valid value; any further NaN in the same gap is left untouched.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df1 = df0.ffill(limit=2)
df1

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### Interpolation: the process of estimating an unknown value that falls between known values

In [51]:
# Estimate the missing numeric values by interpolation; 'linear' is the default method.
# Linear interpolation treats the rows as equally spaced and ignores the dates in the index.
df2 = df0.interpolate(method='linear')
df2

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [52]:
# Interpolate using the time gaps between the dates in the index:
# 2017-01-04 is three days after 01-01 but only one day before 01-05, so its
# temperature becomes 29.0 rather than the 30.0 produced by linear interpolation.
df2 = df0.interpolate(method='time')
df2

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### Dropna : Remove all rows with one or more missing values in them

In [54]:
# Default behaviour: drop every row that contains at least one NaN.
df3 = df0.dropna(axis='index')
df3

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [55]:
# how='all' drops a row only when every one of its values is NaN.
df3 = df0.dropna(axis='index', how='all')
df3

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [62]:
# how='any' (the default) drops a row as soon as one of its values is NaN.
df3 = df0.dropna(axis='index', how='any')
df3

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [59]:
# thresh is the minimum number of non-NaN values a row needs in order to be kept.
# thresh=1 keeps every row that has at least one valid value.
df3 = df0.dropna(thresh=1)
df3

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [60]:
# thresh is the minimum number of non-NaN values a row needs in order to be kept.
# thresh=2 keeps only the rows that have at least two valid values.
df3 = df0.dropna(thresh=2)
df3

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-07,32.0,,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [61]:
# thresh is the minimum number of non-NaN values a row needs in order to be kept.
# thresh=3 keeps only the rows in which all three columns are filled.
df3 = df0.dropna(thresh=3)
df3

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


## Replace Function

In [64]:
import pandas as pd
import numpy as np
# In this file missing readings appear to be encoded as placeholder values
# (-99999 in the numeric columns, 0 in the event column) rather than empty cells.
df00 = pd.read_csv(filepath_or_buffer='./Data2/weather_data.csv')
df00

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,-99999,7,Sunny
2,1/3/2017,28,-99999,Snow
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-99999,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


In [65]:
# Replace a single placeholder value with NaN everywhere in the frame.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df01 = df00.replace(-99999, np.nan)
df01

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/2/2017,,7.0,Sunny
2,1/3/2017,28.0,,Snow
3,1/4/2017,,7.0,0
4,1/5/2017,32.0,,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,0


In [75]:
# Replace per-column placeholder values with NaN: the dictionary maps each
# column name to the value that should be treated as missing in that column.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df02 = df00.replace(
    {
        'temperature': -99999,
        'windspeed': -99999,
        'event': '0',
    },
    np.nan,
)
df02

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/2/2017,,7.0,Sunny
2,1/3/2017,28.0,,Snow
3,1/4/2017,,7.0,
4,1/5/2017,32.0,,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,


In [76]:
# Alternative dictionary form: each key is a value to look for anywhere in the
# frame and the matching dictionary value is what replaces it.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df03 = df00.replace({
    -99999: np.nan,
    '0': 'Sunny day',
})
df03

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/2/2017,,7.0,Sunny
2,1/3/2017,28.0,,Snow
3,1/4/2017,,7.0,Sunny day
4,1/5/2017,32.0,,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,Sunny day
