## <font color="maroon"><h4 align="center">Handling Missing Data - fillna, interpolate, dropna</font>

In [2]:
import pandas as pd
df = pd.read_csv("weather_data.csv",parse_dates=['day'])
type(df.temperature[0])

numpy.float64

In [4]:
df.set_index('day',inplace=True)
df

KeyError: "None of ['day'] are in the columns"

## <font color="blue">fillna</font>

<font color="purple">**Fill all NaN with one specific value**</font>

In [5]:
new_df = df.fillna(0)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,0
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,0
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


<font color="purple">**Fill na using column names and dict**</font>

In [6]:
new_df = df.fillna({
        'temperature': df.temperature.mean(),
        'windspeed': df.windspeed.mean(),
        'event': 'No Event'
    })
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,33.22,9.0,Sunny
2017-01-05,28.0,8.42,Snow
2017-01-06,33.22,7.0,No Event
2017-01-07,32.0,8.42,Rain
2017-01-08,33.22,8.42,Sunny
2017-01-09,33.22,8.42,No Event
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


<font color="purple">**Use method to determine how to fill na values**</font>

In [7]:
new_df = df.fillna(method="ffill")
new_df

  new_df = df.fillna(method="ffill")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [8]:
new_df = df.fillna(method="bfill")
new_df

  new_df = df.fillna(method="bfill")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,28.0,9.0,Sunny
2017-01-05,28.0,7.0,Snow
2017-01-06,32.0,7.0,Rain
2017-01-07,32.0,8.1,Rain
2017-01-08,34.1,8.1,Sunny
2017-01-09,34.1,8.1,Cloudy
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


<font color="purple">**Use of axis**</font>

In [9]:
new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"
new_df

  new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"
  new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,9.0,9.0,Sunny
2017-01-05,28.0,Snow,Snow
2017-01-06,7.0,7.0,
2017-01-07,32.0,Rain,Rain
2017-01-08,Sunny,Sunny,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


<font color="purple">**limit parameter**</font>

In [10]:
new_df = df.fillna(method="ffill",limit=1)
new_df

  new_df = df.fillna(method="ffill",limit=1)


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


### <font color="blue">interpolate</font>

In [11]:
new_df = df.interpolate()
new_df

  new_df = df.interpolate()


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.275,Rain
2017-01-08,32.7,7.55,Sunny
2017-01-09,33.4,7.825,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


### <font color="blue">dropna</font>

In [12]:
new_df = df.dropna()
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [13]:
new_df = df.dropna(how='all')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [14]:
new_df = df.dropna(thresh=2)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-07,32.0,,Rain
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny
