# Processing missing Data 

In [3]:
import numpy as np
import pandas as pd

In [29]:
# By default read_csv does not treat placeholder strings such as 'NODATA' as
# missing data. Listing them in na_values makes pandas parse those cells as NaN,
# the same way empty cells in the numeric columns already are.
football = pd.read_csv(
    'football.csv',
    na_values=['NODATA', 'NOCLUE'],
)
football.head(10)

Unnamed: 0,game_id,year,team,wins,losses
0,100.0,2010.0,Bears,11.0,5.0
1,,2010.0,Rockets,13.0,6.0
2,114.0,2011.0,Bears,8.0,8.0
3,145.0,2012.0,,9.0,3.0
4,128.0,2012.0,Bears,10.0,6.0
5,167.0,,Rockets,12.0,3.0
6,142.0,2011.0,Packers,15.0,1.0
7,187.0,2012.0,Rockets,13.0,
8,157.0,2012.0,Packers,11.0,5.0
9,187.0,2010.0,Packers,,5.0


In [30]:
# With its default arguments (spelled out here) dropna() discards every row that
# contains at least one NaN. The result is not assigned, so football is unchanged.
football.dropna(axis=0, how='any')

Unnamed: 0,game_id,year,team,wins,losses
0,100.0,2010.0,Bears,11.0,5.0
2,114.0,2011.0,Bears,8.0,8.0
4,128.0,2012.0,Bears,10.0,6.0
6,142.0,2011.0,Packers,15.0,1.0
8,157.0,2012.0,Packers,11.0,5.0
10,171.0,2010.0,Lions,6.0,10.0
11,185.0,2011.0,Lions,10.0,6.0
12,200.0,2012.0,Lions,4.0,12.0


In [32]:
football = pd.read_csv('football.csv')
# Find the rows whose string column holds a placeholder for missing data
# (rows, not columns, are what the following cells filter out).
# NOTE(review): read_csv parses 'NA' as NaN by default, so the 'NA' entry below
# probably never matches as a string — confirm.
invalid_data_strings = ['NODATA', 'NOCLUE', 'NA'] # placeholder strings (the mask is built from these)

# Boolean Series: True where team is one of the placeholder strings
football['team'].isin(invalid_data_strings)

0     False
1     False
2     False
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
Name: team, dtype: bool

In [33]:
# isin() is True for the rows to discard, so invert it to get a keep-mask.
# The ~ operator is the vectorised element-wise NOT for a boolean Series; it
# avoids calling a Python lambda once per row.
~football['team'].isin(invalid_data_strings)

0      True
1      True
2      True
3     False
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
Name: team, dtype: bool

In [34]:
# Keep only the rows whose team is not one of the placeholder strings.
# ~ negates the boolean Series element-wise (vectorised, no per-row lambda).
mask = ~football['team'].isin(invalid_data_strings)
football = football[mask]
football

Unnamed: 0,game_id,year,team,wins,losses
0,100.0,2010.0,Bears,11.0,5.0
1,,2010.0,Rockets,13.0,6.0
2,114.0,2011.0,Bears,8.0,8.0
4,128.0,2012.0,Bears,10.0,6.0
5,167.0,,Rockets,12.0,3.0
6,142.0,2011.0,Packers,15.0,1.0
7,187.0,2012.0,Rockets,13.0,
8,157.0,2012.0,Packers,11.0,5.0
9,187.0,2010.0,Packers,,5.0
10,171.0,2010.0,Lions,6.0,10.0


In [36]:
# Fill cells with a value
# na_values is needed so that the placeholder strings are parsed as NaN.
football = pd.read_csv('football.csv', na_values=['NODATA', 'NOCLUE'])
# Replace each missing value in a numeric column with the mean of the other
# values in that column. numeric_only=True restricts the mean to numeric
# columns; without it pandas >= 2.0 raises TypeError on the string 'team'
# column. Non-numeric columns are left untouched.
football = football.fillna(value=football.mean(numeric_only=True))
football

Unnamed: 0,game_id,year,team,wins,losses
0,100.0,2010.0,Bears,11.0,5.0
1,156.916667,2010.0,Rockets,13.0,6.0
2,114.0,2011.0,Bears,8.0,8.0
3,145.0,2012.0,,9.0,3.0
4,128.0,2012.0,Bears,10.0,6.0
5,167.0,2011.083333,Rockets,12.0,3.0
6,142.0,2011.0,Packers,15.0,1.0
7,187.0,2012.0,Rockets,13.0,5.833333
8,157.0,2012.0,Packers,11.0,5.0
9,187.0,2010.0,Packers,10.166667,5.0


In [39]:
# A single column can also be filled with a constant value. Using 'Rockets' for
# every missing team is not meaningful for real data; it only demonstrates the
# call. The same per-column fill works for numeric columns.
football = football.fillna(value={'team': 'Rockets'})
football

Unnamed: 0,game_id,year,team,wins,losses
0,100.0,2010.0,Bears,11.0,5.0
1,156.916667,2010.0,Rockets,13.0,6.0
2,114.0,2011.0,Bears,8.0,8.0
3,145.0,2012.0,Rockets,9.0,3.0
4,128.0,2012.0,Bears,10.0,6.0
5,167.0,2011.083333,Rockets,12.0,3.0
6,142.0,2011.0,Packers,15.0,1.0
7,187.0,2012.0,Rockets,13.0,5.833333
8,157.0,2012.0,Packers,11.0,5.0
9,187.0,2010.0,Packers,10.166667,5.0


In [69]:
'''Replacing missing values'''
# Reload without na_values so the placeholder strings (e.g. 'NODATA') stay in
# the frame as literal text and can be replaced explicitly below.
football = pd.read_csv('football.csv')
football.head(10)

Unnamed: 0,game_id,year,team,wins,losses
0,100.0,2010.0,Bears,11.0,5.0
1,,2010.0,Rockets,13.0,6.0
2,114.0,2011.0,Bears,8.0,8.0
3,145.0,2012.0,NODATA,9.0,3.0
4,128.0,2012.0,Bears,10.0,6.0
5,167.0,,Rockets,12.0,3.0
6,142.0,2011.0,Packers,15.0,1.0
7,187.0,2012.0,Rockets,13.0,
8,157.0,2012.0,Packers,11.0,5.0
9,187.0,2010.0,Packers,,5.0


In [70]:
# Replace using lists: to_replace and value must have the same length and are
# paired by position, e.g. 'NODATA' is replaced by 'Bears' and 'NOCLUE' by 'Rockets'.
football.replace(
    to_replace=['NODATA', 'NOCLUE'],
    value=['Bears', 'Rockets'],
)

Unnamed: 0,game_id,year,team,wins,losses
0,100.0,2010.0,Bears,11.0,5.0
1,,2010.0,Rockets,13.0,6.0
2,114.0,2011.0,Bears,8.0,8.0
3,145.0,2012.0,Bears,9.0,3.0
4,128.0,2012.0,Bears,10.0,6.0
5,167.0,,Rockets,12.0,3.0
6,142.0,2011.0,Packers,15.0,1.0
7,187.0,2012.0,Rockets,13.0,
8,157.0,2012.0,Packers,11.0,5.0
9,187.0,2010.0,Packers,,5.0


In [71]:
# The replace() result above was not assigned back, so football still holds the
# original values (including the 'NODATA' placeholder).
football.head(10)

Unnamed: 0,game_id,year,team,wins,losses
0,100.0,2010.0,Bears,11.0,5.0
1,,2010.0,Rockets,13.0,6.0
2,114.0,2011.0,Bears,8.0,8.0
3,145.0,2012.0,NODATA,9.0,3.0
4,128.0,2012.0,Bears,10.0,6.0
5,167.0,,Rockets,12.0,3.0
6,142.0,2011.0,Packers,15.0,1.0
7,187.0,2012.0,Rockets,13.0,
8,157.0,2012.0,Packers,11.0,5.0
9,187.0,2010.0,Packers,,5.0


In [75]:
# Replace using nested dictionaries of the form {column: {old_value: new_value}}.
# This is useful when different columns need different replacements, e.g. when
# there are several string columns.
#
# In the team column, replace 'NODATA' with 'Rockets' and 'NOCLUE' with 'Bears'.
# In the game_id column, replace NaN with 111 and inf with the largest int32 value.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
replace_mapping = {
    'team': {'NODATA': 'Rockets', 'NOCLUE': 'Bears'},
    'game_id': {np.nan: 111, np.inf: np.iinfo(np.int32).max},
}
football = football.replace(to_replace=replace_mapping)

In [73]:
# Fill the remaining NaN cells with their column mean. numeric_only=True keeps
# the string 'team' column out of the mean; without it pandas >= 2.0 raises
# TypeError.
football = football.fillna(value=football.mean(numeric_only=True))

In [74]:
# Preview the first ten rows after the mean fill.
# NOTE(review): the recorded output still shows 'NODATA' and a mean-filled
# game_id, which suggests this cell ran before the dictionary replace cell
# (In [75]) — confirm the execution order.
football.head(10)

Unnamed: 0,game_id,year,team,wins,losses
0,100.0,2010.0,Bears,11.0,5.0
1,156.916667,2010.0,Rockets,13.0,6.0
2,114.0,2011.0,Bears,8.0,8.0
3,145.0,2012.0,NODATA,9.0,3.0
4,128.0,2012.0,Bears,10.0,6.0
5,167.0,2011.083333,Rockets,12.0,3.0
6,142.0,2011.0,Packers,15.0,1.0
7,187.0,2012.0,Rockets,13.0,5.833333
8,157.0,2012.0,Packers,11.0,5.0
9,187.0,2010.0,Packers,10.166667,5.0


In [76]:
# Round the numeric columns up to whole numbers and store them as int32.
numeric_columns = ['game_id', 'year', 'wins', 'losses']
football[numeric_columns] = np.ceil(football[numeric_columns]).astype(np.int32)
football

Unnamed: 0,game_id,year,team,wins,losses
0,100,2010,Bears,11,5
1,157,2010,Rockets,13,6
2,114,2011,Bears,8,8
3,145,2012,Rockets,9,3
4,128,2012,Bears,10,6
5,167,2012,Rockets,12,3
6,142,2011,Packers,15,1
7,187,2012,Rockets,13,6
8,157,2012,Packers,11,5
9,187,2010,Packers,11,5


In [78]:
# Show the frame with game_id as the row index. The result is not assigned, so
# football itself is unchanged.
# NOTE(review): game_id contains repeated values (e.g. 157 and 187 in the output),
# so the resulting index is not unique.
football.set_index('game_id')

Unnamed: 0_level_0,year,team,wins,losses
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,2010,Bears,11,5
157,2010,Rockets,13,6
114,2011,Bears,8,8
145,2012,Rockets,9,3
128,2012,Bears,10,6
167,2012,Rockets,12,3
142,2011,Packers,15,1
187,2012,Rockets,13,6
157,2012,Packers,11,5
187,2010,Packers,11,5
