# Pandas Tutorial Day 5

In [37]:
import pandas as pd

## How to handle missing data in dataframes
1. `fillna` to fill missing values using different ways
2. `interpolate` to estimate missing values by interpolating between the known values
3. `dropna` to drop rows with missing values

## `fillna` method

In [68]:
# Load the weather observations: parse the `day` column as datetimes and
# use it as the row index, so each row is addressed by its date.
df = (
    pd.read_csv("weather_data.csv", parse_dates=["day"])
    .set_index("day")
)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [69]:
# filling na values with a specific value for each column
# df_new = df.fillna({
#     'temperature' : 0,
#     'windspeed' : 0,
#     'event' : 'no event'
# })
# df_new

In [70]:
# filling na values with the values around that cell in the same column
# df_new = df.ffill() # forward fill: copy the last valid value above the empty cell
# df_new = df.bfill() # backward fill: copy the next valid value below the empty cell

# we can change the direction of filling with the `axis` parameter:
# axis = 1 fills along each row (taking the value from the column to the right)
# instead of down each column
# df_new = df.bfill(axis = 1)

# if we want to carry a value only a certain number of times, we can use the `limit` parameter:
# limit = 1 fills at most one consecutive na value per gap, so longer gaps stay partly empty
df_new = df.bfill(limit = 1)

df_new

  df_new = df.bfill(axis = 1)


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,28.0,9.0,Sunny
2017-01-05,28.0,7.0,Snow
2017-01-06,32.0,7.0,Rain
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,34.0,8.0,Cloudy
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


## Interpolation
Instead of copying a neighbouring value, we can interpolate between the known values so that the filled-in values are more realistic.

In [73]:
# interpolation only makes sense for numeric columns; calling `interpolate` on the
# whole frame (including the text `event` column) is deprecated and emits a warning,
# so we interpolate the numeric columns and leave the others untouched
numeric_cols = df.select_dtypes(include = "number").columns

# the default `method = "linear"` ignores the index and treats the rows as equally spaced
# df_new = df.copy()
# df_new[numeric_cols] = df[numeric_cols].interpolate()
# df_new

# our dates are not evenly spaced (2017-01-02 and 2017-01-03 are missing), so we use
# `method = "time"`, which weights each estimate by the actual time elapsed between rows
df_new = df.copy()
df_new[numeric_cols] = df[numeric_cols].interpolate(method = "time")
df_new

  df_new = df.interpolate(method = "time")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


## `dropna()`

In [79]:
# `how = "all"` drops only the rows in which every value is na
# (the default, `how = "any"`, drops every row that contains at least one na value)
# df_new = df.dropna(how = "all")

# `thresh = n` keeps only the rows that have at least n non-na values
# df_new = df.dropna(thresh=1)

# how to insert missing dates: build the complete daily range and reindex on it;
# dates that were absent from the data appear as rows filled with na values
dt = pd.date_range("2017-01-01", "2017-01-11")  # ISO dates avoid day/month ambiguity
idx = dt  # `date_range` already returns a DatetimeIndex, no conversion needed
df_new = df.reindex(idx)
df_new

Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,,,
2017-01-03,,,
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
