# How to deal with missing data 

In [1]:
# Display the value of every top-level expression in a cell,
# not only the last one (IPython's default is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas FutureWarning /
# DeprecationWarning messages emitted by cells further down.
import warnings
warnings.filterwarnings('ignore')

Load the data

In [3]:
import pandas as pd
# Load the wine reviews; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv("winemag-data-130k.csv", index_col=0)

In [4]:
# None removes the column limit so wide frames are displayed in full.
pd.set_option('display.max_columns', None)

In [5]:
# Frames with more than 20 rows are displayed in truncated form.
pd.set_option('display.max_rows', 20)

In [6]:
# Preview the loaded frame (truncated by the display options set above).
df

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,,Anna Lee C. Iijima,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef)
129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Oregon,Oregon Other,Paul Gregutt,@paulgwine,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss


In [7]:
# Number of rows in the dataset.
len(df)

129971

In [8]:
# Shape of the dataset as (rows, columns).
df.shape

(129971, 13)

In [9]:
# First five rows; NaN cells are the missing values studied below.
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


## Missing data

Entries missing values are given the value **NaN**, short for "Not a Number". For technical reasons these NaN values are always of the float64 dtype.

*pandas* provides some methods specific to missing data. To select NaN entries you can use **isna()** (or its alias **isnull()**), or the complementary **notna()** (alias **notnull()**).

<b>isna()</b>: Return a boolean same-sized object indicating if the values are NA

In [10]:
# Boolean frame of the same shape: True where a value is missing, False otherwise.
df.isna()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,False,False,False,False,True,False,False,True,False,False,False,False,False
1,False,False,False,False,False,False,True,True,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129966,False,False,False,False,False,False,True,True,False,True,False,False,False
129967,False,False,True,False,False,False,False,False,False,False,False,False,False
129968,False,False,False,False,False,False,False,True,False,False,False,False,False
129969,False,False,True,False,False,False,False,True,False,False,False,False,False


Use **.any()** to return whether any element is True over requested axis

In [11]:
# Per column: does it contain at least one missing value?
df.isna().any()

country                   True
description              False
designation               True
points                   False
price                     True
province                  True
region_1                  True
region_2                  True
taster_name               True
taster_twitter_handle     True
title                    False
variety                   True
winery                   False
dtype: bool

In [12]:
# axis=1 applies any() across each row: True if the row has at least one NaN.
df.isna().any(axis=1)

0          True
1          True
2          True
3          True
4         False
          ...  
129966     True
129967     True
129968     True
129969     True
129970     True
Length: 129971, dtype: bool

Use **.sum()** to count the NaN values along the requested axis

In [13]:
# Number of missing values in each column (True counts as 1 when summed).
df.isna().sum()

country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

Use **.sum().sum()** to get the total number of NaN values in the DataFrame

In [14]:
# Total number of missing values: sum the per-column counts.
df.isna().sum().sum()

204752

## How to include Nan in .groupby ?

Count the rows for each *taster_twitter_handle*, including the NaN group, and sort the counts in ascending order.

In [15]:
# Number of NaN entries in the taster_twitter_handle column.
df["taster_twitter_handle"].isna().sum()

31213

In [16]:
# Distinct values in taster_twitter_handle (NaN is listed as one of them).
df["taster_twitter_handle"].unique()

array(['@kerinokeefe', '@vossroger', '@paulgwine\xa0', nan, '@wineschach',
       '@vboone', '@mattkettmann', '@wawinereport', '@gordone_cellars',
       '@JoeCz', '@AnneInVino', '@laurbuzz', '@worldwineguys',
       '@suskostrzewa', '@bkfiona', '@winewchristina'], dtype=object)

In [17]:
# nunique() counts distinct values and ignores NaN by default.
df["taster_twitter_handle"].nunique()

15

In [18]:
# Passing dropna=False makes nunique() count NaN as one more distinct value.
df["taster_twitter_handle"].nunique(dropna=False)

16

In [19]:
# groupby() skips NaN keys by default, so rows without a handle get no group here.
(
    df.groupby("taster_twitter_handle")["taster_twitter_handle"]
    .count()
    .sort_values()
)

taster_twitter_handle
@winewchristina         6
@bkfiona               27
@worldwineguys       1005
@suskostrzewa        1085
@laurbuzz            1835
@AnneInVino          3685
@gordone_cellars     4177
@wawinereport        4966
@JoeCz               5147
@mattkettmann        6332
@paulgwine           9532
@vboone              9537
@kerinokeefe        10776
@wineschach         15134
@vossroger          25514
Name: taster_twitter_handle, dtype: int64

NaN groups in GroupBy are automatically excluded. If you need to keep NaN as a group, convert the values with .astype(str) first, so that NaN becomes the string "nan".

In [20]:
# groupby() drops NaN keys, so cast the handles to str first: NaN becomes the literal
# string "nan" and is counted as its own group.
# Only the grouped column is cast; df.astype(str) would stringify every other column
# of the frame for no benefit.
# (pandas >= 1.1 can also keep the NaN group with groupby(..., dropna=False).size().)
(
    df[["taster_twitter_handle"]]
    .astype(str)
    .groupby("taster_twitter_handle")["taster_twitter_handle"]
    .count()
    .sort_values()
)

taster_twitter_handle
@winewchristina         6
@bkfiona               27
@worldwineguys       1005
@suskostrzewa        1085
@laurbuzz            1835
@AnneInVino          3685
@gordone_cellars     4177
@wawinereport        4966
@JoeCz               5147
@mattkettmann        6332
@paulgwine           9532
@vboone              9537
@kerinokeefe        10776
@wineschach         15134
@vossroger          25514
nan                 31213
Name: taster_twitter_handle, dtype: int64

## How to deal with Nan?

## fillna()

Replacing missing values is a common operation.  *pandas* provides a really handy method for this problem: **fillna()**. fillna provides a few different strategies for mitigating such data. 

### Example 1, replace each NaN in region_1 with "Unknown":

replace NaN in region_1 with "Unknown"

In [21]:
# Missing-value counts per column before any filling.
df.isna().sum()

country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [22]:
# Replace NaN in region_1 with "Unknown" and store the result back on df.
# Assigning the filled column is used instead of df.region_1.fillna(..., inplace=True):
# that form is a chained in-place update, which pandas warns about and which leaves df
# unchanged once Copy-on-Write is enabled.
df["region_1"] = df["region_1"].fillna("Unknown")

In [23]:
# Inspect the column after filling.
df["region_1"]

0                        Etna
1                     Unknown
2           Willamette Valley
3         Lake Michigan Shore
4           Willamette Valley
                 ...         
129966                Unknown
129967                 Oregon
129968                 Alsace
129969                 Alsace
129970                 Alsace
Name: region_1, Length: 129971, dtype: object

### Example 2, replace the NaN in 'price' with price's average:

In [24]:
# Average price; mean() skips NaN entries by default.
price_avg = df["price"].mean()

In [25]:
# Show the value that will be used as the fill.
price_avg

35.363389129985535

In [26]:
# Replace NaN prices with the average price and store the result back on df.
# Assignment replaces df.price.fillna(..., inplace=True), a chained in-place update
# that pandas warns about and that leaves df unchanged under Copy-on-Write.
df["price"] = df["price"].fillna(price_avg)

In [27]:
# Confirm that no NaN is left in 'price'.
df["price"].isna().any()

False

In [28]:
# The mean is unchanged, because every filled value equals the previous mean.
df["price"].mean()

35.363389129985535

## dropna()

In [29]:
# Work on an independent copy so the dropna() demonstration does not alter df.
df2 = df.copy(deep=True)

In [30]:
# Shape of the table before anything is dropped.
df.shape

(129971, 13)

In [31]:
# Drop every row of df2 that contains at least one NaN (the dropna() defaults).
df2 = df2.dropna(axis=0, how="any")

In [32]:
# Shape of the copy after all rows containing NaN were dropped.
df2.shape

(22524, 13)

In [33]:
# Show that no column of df2 contains NaN any more.
df2.isna().any()

country                  False
description              False
designation              False
points                   False
price                    False
province                 False
region_1                 False
region_2                 False
taster_name              False
taster_twitter_handle    False
title                    False
variety                  False
winery                   False
dtype: bool

The above operation dropped about 83% of the rows, which is not a good idea.

### Example 3, drop the rows with country = NaN :

In [34]:
# df itself is untouched by the dropna() on the copy above.
df.shape

(129971, 13)

In [35]:
# Remaining missing values per column (price and region_1 were filled earlier).
df.isna().sum()

country                     63
description                  0
designation              37465
points                       0
price                        0
province                    63
region_1                     0
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [36]:
# Only a very small proportion of the country values is missing.
df["country"].isna().sum()

63

Let's find the first row whose country is NaN

In [37]:
# Select the rows whose country value is missing and show the first one.
df.loc[df["country"].isna()].head(1)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
913,,"Amber in color, this wine has aromas of peach ...",Asureti Valley,87,30.0,,Unknown,,Mike DeSimone,@worldwineguys,Gotsa Family Wines 2014 Asureti Valley Chinuri,Chinuri,Gotsa Family Wines


In [38]:
# subset limits the NaN check to the listed columns; nothing is stored here.
# The positional slice shows that the row labelled 913 is gone from the result.
df.dropna(subset=["country"], how="any").iloc[912:915]

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
912,US,"This light, refreshing rosé mixes fresh strawb...",Three Otters Pinot Noir,87,18.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Fullerton 2015 Three Otters Pinot Noir Rosé (W...,Rosé,Fullerton
914,Argentina,Smashed berry aromas are backed by earth and t...,Grand Reserve,87,20.0,Other,San Juan,,Michael Schachner,@wineschach,Graffigna 2012 Grand Reserve Malbec (San Juan),Malbec,Graffigna
915,US,"This is smooth and accessible, putting a light...",Atração,87,25.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Coelho 2014 Atração Pinot Noir (Willamette Val...,Pinot Noir,Coelho


In [39]:
# Set inplace=True to make the result permanent: rows with a missing country
# are removed from df itself.
df.dropna(how='any',subset=['country'],inplace=True)

In [40]:
# The number of rows decreases from 129971 to 129908, which is acceptable.
df.shape

(129908, 13)

In [41]:
# Check whether any missing value is left in the country column.
df["country"].isna().any()

False

### Example 4, drop based on threshold(number of non-NaN)

Drop the **column(s)** that have more than 50% NaN. (*require at least len(df)/2 ~ 65,000 non-NaN*)

In [42]:
# Shape before dropping sparse columns.
df.shape

(129908, 13)

In [43]:
# Missing values per column; region_2 is the only column above the 50% mark.
df.isna().sum()

country                      0
description                  0
designation              37454
points                       0
price                        0
province                     0
region_1                     0
region_2                 79397
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [44]:
# dropna(thresh=...) is the minimum number of non-NaN values needed to be kept,
# so half of the row count expresses "at most 50% NaN".
# Note that / produces a float (64954.0), not an int.
thresh_50Percent = len(df)/2
thresh_50Percent

64954.0

In [45]:
# Since we want to drop columns, we need to set axis to 1: every column with
# fewer than thresh_50Percent non-NaN values is removed from df in place.
df.dropna(thresh=thresh_50Percent, axis=1, inplace=True)

In [46]:
# One column fewer than before.
df.shape

(129908, 12)

In [47]:
# region_2 no longer appears in the per-column counts.
df.isna().sum()

country                      0
description                  0
designation              37454
points                       0
price                        0
province                     0
region_1                     0
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

The above operation dropped the region_2 column.

In [48]:
# First rows after the cleaning steps so far (filled price / region_1, no region_2).
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,35.363389,Sicily & Sardinia,Etna,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,Unknown,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


## backfill/ffill 

Or we could fill each NaN with the first non-NaN value that appears sometime after/before the given record in the database. This is known as the backfill/ffill strategy:

Fill the NaN in 'taster_name' with the first non-null value that appears after the given record.

In [49]:
# Number of reviews without a taster name.
df["taster_name"].isna().sum()

26244

In [50]:
# First rows whose taster_name is missing.
df.loc[df["taster_name"].isna()].head()

Unnamed: 0,country,description,designation,points,price,province,region_1,taster_name,taster_twitter_handle,title,variety,winery
31,Italy,Merlot and Nero d'Avola form the base for this...,Calanìca Nero d'Avola-Merlot,86,35.363389,Sicily & Sardinia,Sicilia,,,Duca di Salaparuta 2010 Calanìca Nero d'Avola-...,Red Blend,Duca di Salaparuta
32,Italy,"Part of the extended Calanìca series, this Gri...",Calanìca Grillo-Viognier,86,35.363389,Sicily & Sardinia,Sicilia,,,Duca di Salaparuta 2011 Calanìca Grillo-Viogni...,White Blend,Duca di Salaparuta
33,US,"Rustic and dry, this has flavors of berries, c...",Puma Springs Vineyard,86,50.0,California,Dry Creek Valley,,,Envolve 2010 Puma Springs Vineyard Red (Dry Cr...,Red Blend,Envolve
34,US,"This shows a tart, green gooseberry flavor tha...",,86,20.0,California,Sonoma Valley,,,Envolve 2011 Sauvignon Blanc (Sonoma Valley),Sauvignon Blanc,Envolve
37,Italy,This concentrated Cabernet offers aromas of cu...,Missoni,86,21.0,Sicily & Sardinia,Sicilia,,,Feudi del Pisciotto 2010 Missoni Cabernet Sauv...,Cabernet Sauvignon,Feudi del Pisciotto


In [51]:
# A small window containing a run of NaN, used to compare the fill strategies below.
df["taster_name"].iloc[30:36]

30      Roger Voss
31             NaN
32             NaN
33             NaN
34             NaN
35    Paul Gregutt
Name: taster_name, dtype: object

### method='backfill'

In [52]:
# Backfill: each NaN takes the next non-null value below it.
# bfill() is the supported spelling of fillna(method='backfill'); the method=
# argument of fillna is deprecated in recent pandas releases.
df["taster_name"].bfill().iloc[30:36]

30      Roger Voss
31    Paul Gregutt
32    Paul Gregutt
33    Paul Gregutt
34    Paul Gregutt
35    Paul Gregutt
Name: taster_name, dtype: object

### method='ffill'

In [53]:
# Forward fill: each NaN takes the last non-null value above it.
# ffill() is the supported spelling of fillna(method='ffill'); the method=
# argument of fillna is deprecated in recent pandas releases.
df["taster_name"].ffill().iloc[30:36]

30      Roger Voss
31      Roger Voss
32      Roger Voss
33      Roger Voss
34      Roger Voss
35    Paul Gregutt
Name: taster_name, dtype: object

In [54]:
# Store the forward-filled names back on df.
# This avoids both the deprecated fillna(method='ffill') and the chained
# inplace=True call on df.taster_name, which leaves df unchanged under Copy-on-Write.
df["taster_name"] = df["taster_name"].ffill()

In [55]:
# Confirm that no NaN is left in taster_name.
df["taster_name"].isna().any()

False

In [56]:
# Columns that still contain missing values: designation, taster_twitter_handle, variety.
df.isna().sum()

country                      0
description                  0
designation              37454
points                       0
price                        0
province                     0
region_1                     0
taster_name                  0
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

## Problems:

We want to clean up the rest of this data set based on following guidelines:

1, change all the NaN in 'taster_twitter_handle' to "@anonymous". 

2, change all the NaN in 'designation' to 'Unknown'.

3, drop the row with 'variety' = NaN

4, since this dataset was published, reviewer Kerin O'Keefe has changed her Twitter handle from @kerinokeefe to @kerino. 

In [57]:
# Starting point for the exercises: NaN counts to compare against after each fix.
df.isna().sum()

country                      0
description                  0
designation              37454
points                       0
price                        0
province                     0
region_1                     0
taster_name                  0
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [58]:
# Change all the NaN in 'designation' to 'Unknown'.
# The filled column is assigned back instead of calling fillna(..., inplace=True) on
# df.designation: that chained in-place form is warned about by pandas and leaves df
# unchanged under Copy-on-Write.
df["designation"] = df["designation"].fillna("Unknown")

Verify that the 'Unknown' count equals the previous NaN count in 'designation'

In [59]:
# Count the rows whose designation is now 'Unknown'.
df["designation"].eq("Unknown").sum()

37454

In [60]:
# Change all the NaN in 'taster_twitter_handle' to "@anonymous".
# The filled column is assigned back instead of calling fillna(..., inplace=True) on
# df.taster_twitter_handle: that chained in-place form is warned about by pandas and
# leaves df unchanged under Copy-on-Write.
df["taster_twitter_handle"] = df["taster_twitter_handle"].fillna("@anonymous")

Verify that the '@anonymous' count equals the previous NaN count in 'taster_twitter_handle'

In [61]:
# Count the rows whose handle is now '@anonymous'.
df["taster_twitter_handle"].eq("@anonymous").sum()

31213

In [62]:
# 3, drop the row with 'variety' = NaN
# (subset restricts the NaN check to that column; inplace=True modifies df itself)
df.dropna(how='any',subset=['variety'],inplace=True)

verify no NaN in 'variety'

In [63]:
# Confirm that no NaN is left in 'variety'.
df["variety"].isna().any()

False

In [64]:
# Inspect the handles before renaming one of them.
df["taster_twitter_handle"]

0         @kerinokeefe
1           @vossroger
2          @paulgwine 
3           @anonymous
4          @paulgwine 
              ...     
129966      @anonymous
129967     @paulgwine 
129968      @vossroger
129969      @vossroger
129970      @vossroger
Name: taster_twitter_handle, Length: 129907, dtype: object

In [65]:
# 4, since this dataset was published,
# reviewer Kerin O'Keefe has changed her Twitter handle from @kerinokeefe to @kerino.
# The replaced column is assigned back instead of calling replace(..., inplace=True)
# on df.taster_twitter_handle: that chained in-place form is warned about by pandas
# and leaves df unchanged under Copy-on-Write.
df["taster_twitter_handle"] = df["taster_twitter_handle"].replace(
    to_replace="@kerinokeefe", value="@kerino"
)

In [66]:
# Every column should now report zero missing values.
df.isna().sum()

country                  0
description              0
designation              0
points                   0
price                    0
province                 0
region_1                 0
taster_name              0
taster_twitter_handle    0
title                    0
variety                  0
winery                   0
dtype: int64

Final DataFrame shape

In [67]:
# Rows and columns that remain after all cleaning steps.
df.shape

(129907, 12)

## Note: How to read Microsoft Excel format file

Find the top 3 correlations based on all the data in this excel file.

In [68]:
# Open the workbook once so its sheets can be listed and parsed one by one.
# NOTE(review): xl is never closed in this notebook; consider xl.close() or a with-block.
xl =pd.ExcelFile('Cancer_Cardio.xlsx')

In [69]:
# Both expressions are displayed because ast_node_interactivity is set to "all".
type(xl)
xl

pandas.io.excel._base.ExcelFile

<pandas.io.excel._base.ExcelFile at 0x174d90ef0>

In [70]:
# Names of the worksheets contained in the workbook.
xl.sheet_names

['Cancer', 'Cardio', 'Smoking']

In [71]:
# Read the "Cancer" sheet into a DataFrame and display it.
df1 = xl.parse("Cancer")
df1

Unnamed: 0,city,Geocode,cancer
0,Detroit,505,44
1,Ft. Wayne,731,37
2,Pittsburgh,600,50
3,Detroit,507,46
4,Pittsburgh,621,54
5,Ft. Wayne,728,41


In [72]:
# Read the "Cardio" sheet into a DataFrame and display it.
# Note: this rebinds df2, which earlier held the dropna() copy of the wine data.
df2 = xl.parse("Cardio")
df2

Unnamed: 0,Geocode,cardiovascular
0,505,21
1,731,19
2,600,28
3,507,25
4,621,25
5,728,30


In [73]:
# Read the "Smoking" sheet into a DataFrame and display it.
df3 = xl.parse("Smoking")
df3

Unnamed: 0,Geocode,Smoking rate
0,505,25
1,731,31
2,600,33
3,507,27
4,621,34
5,728,36


In [74]:
# Join the cancer and cardio sheets on their shared Geocode key (inner join).
# Note: df is rebound here and no longer refers to the wine reviews.
df = pd.merge(df1, df2, on="Geocode")

In [75]:
# Cancer and cardiovascular figures side by side, one row per Geocode.
df

Unnamed: 0,city,Geocode,cancer,cardiovascular
0,Detroit,505,44,21
1,Ft. Wayne,731,37,19
2,Pittsburgh,600,50,28
3,Detroit,507,46,25
4,Pittsburgh,621,54,25
5,Ft. Wayne,728,41,30


In [76]:
# Add the smoking-rate sheet to the combined frame, again matching on Geocode.
df = pd.merge(df, df3, on="Geocode")

In [77]:
# All three sheets combined into one frame.
df

Unnamed: 0,city,Geocode,cancer,cardiovascular,Smoking rate
0,Detroit,505,44,21,25
1,Ft. Wayne,731,37,19,31
2,Pittsburgh,600,50,28,33
3,Detroit,507,46,25,27
4,Pittsburgh,621,54,25,34
5,Ft. Wayne,728,41,30,36


Now we can do some analysis.

In [78]:
# Total of the cancer column per city, largest first.
(
    df.groupby("city")["cancer"]
    .sum()
    .sort_values(ascending=False)
)

city
Pittsburgh    104
Detroit        90
Ft. Wayne      78
Name: cancer, dtype: int64

In [79]:
# Pairwise correlation of the numeric columns (numeric_only=True leaves out 'city').
# NOTE(review): Geocode looks like an identifier rather than a measurement, so
# correlations involving it may not be meaningful.
df.corr(numeric_only=True)

Unnamed: 0,Geocode,cancer,cardiovascular,Smoking rate
Geocode,1.0,-0.456196,0.127813,0.773534
cancer,-0.456196,1.0,0.377006,0.146327
cardiovascular,0.127813,0.377006,1.0,0.627599
Smoking rate,0.773534,0.146327,0.627599,1.0


In [80]:
# Keep the correlation matrix for the ranking steps below.
cor=df.corr(numeric_only=True)

In [81]:
# Flatten the matrix into (row, column) -> correlation pairs and remove the diagonal
# by position (row label == column label).
# Filtering on the value (cor < 1) would also discard two different columns that are
# perfectly correlated, and it depends on stack() dropping the masked NaN cells.
cor.stack().loc[
    lambda pairs: pairs.index.get_level_values(0) != pairs.index.get_level_values(1)
]

Geocode         cancer           -0.456196
                cardiovascular    0.127813
                Smoking rate      0.773534
cancer          Geocode          -0.456196
                cardiovascular    0.377006
                Smoking rate      0.146327
cardiovascular  Geocode           0.127813
                cancer            0.377006
                Smoking rate      0.627599
Smoking rate    Geocode           0.773534
                cancer            0.146327
                cardiovascular    0.627599
dtype: float64

In [82]:
# Top 3 correlations, each unordered pair counted once.
# Only the cells above the diagonal are selected, so every pair appears exactly once
# and nlargest(3) can be used directly. Taking nlargest(6)[::2] on the full matrix
# relies on the two mirrored copies of a pair always being adjacent, which ties
# between different pairs can break.
# Note: the ranking is by signed value, so strong negative correlations are not ranked.
cor.stack().loc[
    [
        (row, col)
        for pos, row in enumerate(cor.index)
        for col in cor.columns[pos + 1:]
    ]
].nlargest(3)

Geocode         Smoking rate      0.773534
cardiovascular  Smoking rate      0.627599
cancer          cardiovascular    0.377006
dtype: float64