# Handling NaNs

In [70]:
import pandas as pd
import numpy as np

## Setup data

### Load coffee

In [71]:
# Read the coffee sales table from CSV (relative path, resolved against the working directory).
coffee = pd.read_csv(filepath_or_buffer='warmup-data/coffee.csv')
# Preview the first five rows.
coffee.head(n=5)

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


### Check that np.where and Series.map give the same result

In [72]:
# Sanity check: pricing through a vectorised conditional (np.where) must give
# the same values as pricing through a dictionary lookup (Series.map).
assert pd.Series(
    np.where(coffee['Coffee Type'].eq('Espresso'), 3.99, 5.99)
).equals(
    coffee['Coffee Type'].map({'Espresso': 3.99, 'Latte': 5.99})
)


### create new-price, revenue columns

In [73]:
# Unit price looked up from the drink type.
coffee['new-price'] = coffee['Coffee Type'].map({'Espresso': 3.99, 'Latte': 5.99})
# Revenue per row: unit price times units sold.
coffee['revenue'] = coffee['new-price'].mul(coffee['Units Sold'])
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,25,3.99,99.75
1,Monday,Latte,15,5.99,89.85
2,Tuesday,Espresso,30,3.99,119.7
3,Tuesday,Latte,20,5.99,119.8
4,Wednesday,Espresso,35,3.99,139.65


## create NaN values

In [74]:
# Simulate missing data: blank out 'Units Sold' for the rows labelled 0 and 1.
# Only this one column is written; 'revenue' is not recomputed by this cell.
coffee.loc[[0, 1], 'Units Sold'] = np.nan
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,,3.99,99.75
1,Monday,Latte,,5.99,89.85
2,Tuesday,Espresso,30.0,3.99,119.7
3,Tuesday,Latte,20.0,5.99,119.8
4,Wednesday,Espresso,35.0,3.99,139.65


## check NaN values

In [75]:
# Per-column dtype and non-null count; a non-null count below the number of
# entries means that column contains NaNs.
coffee.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Day          14 non-null     object 
 1   Coffee Type  14 non-null     object 
 2   Units Sold   12 non-null     float64
 3   new-price    14 non-null     float64
 4   revenue      14 non-null     float64
dtypes: float64(3), object(2)
memory usage: 692.0+ bytes


In [76]:
# Element-wise missing-value mask (True where the cell is NaN), first five rows.
coffee.isna().head()

Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,False,False,True,False,False
1,False,False,True,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False


In [77]:
# Summing the boolean mask column-wise counts the NaNs in each column.
coffee.isna().sum()

Day            0
Coffee Type    0
Units Sold     2
new-price      0
revenue        0
dtype: int64

## fill NaN values

In [78]:
# Fill the missing 'Units Sold' values with that column's mean
# (Series.mean skips NaN by default).
# The fill value is keyed by column name so only 'Units Sold' is touched; a bare
# scalar passed to DataFrame.fillna would be written into NaNs of every column.
# fillna returns a new frame, so `coffee` itself is left unchanged here.
coffee.fillna(
    {'Units Sold': coffee['Units Sold'].mean()}
).head()

Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,35.0,3.99,99.75
1,Monday,Latte,35.0,5.99,89.85
2,Tuesday,Espresso,30.0,3.99,119.7
3,Tuesday,Latte,20.0,5.99,119.8
4,Wednesday,Espresso,35.0,3.99,139.65


### interpolate needs neighbours

#### This fills nothing: the NaNs are in the first rows, so there is no earlier neighbour to interpolate from

In [79]:
# Show 'Units Sold' replaced by its interpolated version (display only;
# `coffee` is not modified).
# The interpolated Series is assigned to the column directly: handing a Series
# to DataFrame.fillna matches the Series' index labels against *column names*,
# so nothing would be filled no matter what interpolate() produced.
# With its defaults (linear method, forward direction) interpolate() leaves NaNs
# that come before the first valid value untouched, so rows 0 and 1 stay NaN.
coffee.assign(
    **{'Units Sold': coffee['Units Sold'].interpolate()}
).head()

Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,,3.99,99.75
1,Monday,Latte,,5.99,89.85
2,Tuesday,Espresso,30.0,3.99,119.7
3,Tuesday,Latte,20.0,5.99,119.8
4,Wednesday,Espresso,35.0,3.99,139.65


In [80]:
# Mean units sold over the Espresso rows (NaNs are skipped by Series.mean).
# Computed once, using a single .loc lookup instead of chained indexing.
espresso_mean = coffee.loc[coffee['Coffee Type'] == 'Espresso', 'Units Sold'].mean()
# Write that value into rows 0 and 1, modifying `coffee` in place.
# NOTE(review): row 1 is a Latte row but also receives the Espresso mean — confirm this is intended.
coffee.loc[[0, 1], 'Units Sold'] = espresso_mean
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,40.0,3.99,99.75
1,Monday,Latte,40.0,5.99,89.85
2,Tuesday,Espresso,30.0,3.99,119.7
3,Tuesday,Latte,20.0,5.99,119.8
4,Wednesday,Espresso,35.0,3.99,139.65


In [81]:
# Blank out rows 2 and 3 this time: unlike the first two rows, these have
# populated rows both before and after them.
coffee.loc[[2, 3], 'Units Sold'] = np.nan
coffee.head()

Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,40.0,3.99,99.75
1,Monday,Latte,40.0,5.99,89.85
2,Tuesday,Espresso,,3.99,119.7
3,Tuesday,Latte,,5.99,119.8
4,Wednesday,Espresso,35.0,3.99,139.65


In [82]:
# Fill the NaNs in 'Units Sold' with the mean of the remaining values
# (Series.mean skips NaN by default).
# The fill value is keyed by column name so only 'Units Sold' is touched; a bare
# scalar passed to DataFrame.fillna would be written into NaNs of every column.
# The result is only displayed; `coffee` keeps its NaNs.
# NOTE(review): the enclosing section is about interpolation, yet this cell
# fills with the mean — confirm whether .interpolate() was intended here.
coffee.fillna(
    {'Units Sold': coffee['Units Sold'].mean()}
).head()

Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,40.0,3.99,99.75
1,Monday,Latte,40.0,5.99,89.85
2,Tuesday,Espresso,37.5,3.99,119.7
3,Tuesday,Latte,37.5,5.99,119.8
4,Wednesday,Espresso,35.0,3.99,139.65


## dropna

In [83]:
# Make sure rows 2 and 3 hold NaN in 'Units Sold' so that dropna has rows to remove.
coffee.loc[[2, 3], 'Units Sold'] = np.nan
# Shape before dropping, for comparison with the shape printed after dropna.
print(coffee.shape)
coffee.head()

(14, 5)


Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,40.0,3.99,99.75
1,Monday,Latte,40.0,5.99,89.85
2,Tuesday,Espresso,,3.99,119.7
3,Tuesday,Latte,,5.99,119.8
4,Wednesday,Espresso,35.0,3.99,139.65


In [84]:
# Drop every row whose 'Units Sold' is NaN and rebind `coffee` to the result.
# `subset` takes a single label or a list of labels.
# Defaults relied on: axis=0 (drop rows rather than columns) and inplace=False
# (a new DataFrame is returned instead of editing the existing one).
coffee = coffee.dropna(subset=['Units Sold'])
print(coffee.shape)
coffee.head()

(12, 5)


Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,40.0,3.99,99.75
1,Monday,Latte,40.0,5.99,89.85
4,Wednesday,Espresso,35.0,3.99,139.65
5,Wednesday,Latte,25.0,5.99,149.75
6,Thursday,Espresso,40.0,3.99,159.6


In [87]:
# Boolean-mask filtering with plain [] indexing: rows where more than 20 units
# were sold (first five shown).
coffee[coffee['Units Sold'] > 20].head()

Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,40.0,3.99,99.75
1,Monday,Latte,40.0,5.99,89.85
4,Wednesday,Espresso,35.0,3.99,139.65
5,Wednesday,Latte,25.0,5.99,149.75
6,Thursday,Espresso,40.0,3.99,159.6


In [88]:
# The same boolean-mask filter written with .loc; no .head(), so every matching
# row is displayed.
coffee.loc[coffee['Units Sold'] > 20]

Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,40.0,3.99,99.75
1,Monday,Latte,40.0,5.99,89.85
4,Wednesday,Espresso,35.0,3.99,139.65
5,Wednesday,Latte,25.0,5.99,149.75
6,Thursday,Espresso,40.0,3.99,159.6
7,Thursday,Latte,30.0,5.99,179.7
8,Friday,Espresso,45.0,3.99,179.55
9,Friday,Latte,35.0,5.99,209.65
10,Saturday,Espresso,45.0,3.99,179.55
11,Saturday,Latte,35.0,5.99,209.65


## get na rows

In [None]:
# dropna removed rows without renumbering, so the index now has gaps.
# Renumber the rows 0..n-1 and discard the old labels (drop=True) so that
# label-based access to rows 2 and 3 works again.
# Rebinding the result (rather than inplace=True) keeps the call chainable and
# makes the new state of `coffee` explicit.
coffee = coffee.reset_index(drop=True)

In [95]:
# Introduce NaNs again at labels 2 and 3 to have missing rows to select.
# This relies on those labels existing in the index.
coffee.loc[[2, 3], 'Units Sold'] = np.nan
# Assigning NaN does not remove rows, so the shape is unchanged.
print(coffee.shape)
coffee.head()

(12, 5)


Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,40.0,3.99,99.75
1,Monday,Latte,40.0,5.99,89.85
2,Wednesday,Espresso,,3.99,139.65
3,Wednesday,Latte,,5.99,149.75
4,Thursday,Espresso,40.0,3.99,159.6


In [97]:
# Keep only the rows where 'Units Sold' is missing (mask is True for NaN).
coffee[coffee['Units Sold'].isna()]

Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
2,Wednesday,Espresso,,3.99,139.65
3,Wednesday,Latte,,5.99,149.75


## get non na rows

In [98]:
# Complement of the isna() selection: keep only rows where 'Units Sold' has a value.
coffee[coffee['Units Sold'].notna()]

Unnamed: 0,Day,Coffee Type,Units Sold,new-price,revenue
0,Monday,Espresso,40.0,3.99,99.75
1,Monday,Latte,40.0,5.99,89.85
4,Thursday,Espresso,40.0,3.99,159.6
5,Thursday,Latte,30.0,5.99,179.7
6,Friday,Espresso,45.0,3.99,179.55
7,Friday,Latte,35.0,5.99,209.65
8,Saturday,Espresso,45.0,3.99,179.55
9,Saturday,Latte,35.0,5.99,209.65
10,Sunday,Espresso,45.0,3.99,179.55
11,Sunday,Latte,35.0,5.99,209.65
