#### Handling Missing Data

In [1]:
import numpy as np

In [2]:
# Mix None with integers; the failing sum in the next cell shows the elements
# are then added as plain Python objects ('int' + 'NoneType').
d = np.array([1,2,3,None])

In [3]:
# Summing fails because d contains None. The error is the point of this cell,
# so catch it and print it; that way the notebook still runs top to bottom.
try:
    np.sum(d)
except TypeError as err:
    print("TypeError: {}".format(err))

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [4]:
# Same idea, but with NaN instead of None as the missing-value marker.
d = np.array([1,np.nan,2])

In [5]:
# NaN propagates through a plain sum, so the result is nan.
np.sum(d)

nan

In [6]:
# nansum skips the NaN entry: 1 + 2 = 3.0.
np.nansum(d)

3.0

In [7]:
import pandas as pd

In [8]:
# Series with a single missing value at position 2.
data = pd.Series([1,2,np.nan,4,5])

In [9]:
# Boolean mask: True where the value is missing.
data.isnull()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [10]:
# Complement of isnull(): True where a value is present.
data.notnull()

0     True
1     True
2    False
3     True
4     True
dtype: bool

In [13]:
# Use the notnull() mask as an indexer to keep only the non-missing entries.
data[data.notnull()]

0    1.0
1    2.0
3    4.0
4    5.0
dtype: float64

In [14]:
## Indexing a Series with a boolean array returns only the elements where the array is True

In [16]:
# Same result as data[data.notnull()]: the NaN entry is removed.
data.dropna()

0    1.0
1    2.0
3    4.0
4    5.0
dtype: float64

In [17]:
# 3x3 frame with one NaN in row 0 and one in row 1; row 2 is complete.
data = pd.DataFrame([ [1,2,np.nan],
               [np.nan,3,1],
               [5,6,7]])

In [19]:
# By default every row containing a NaN is dropped; only row 2 survives.
data.dropna()

Unnamed: 0,0,1,2
2,5.0,6,7.0


In [20]:
# The original frame is untouched: dropna() returned a new object.
data

Unnamed: 0,0,1,2
0,1.0,2,
1,,3,1.0
2,5.0,6,7.0


In [21]:
data.dropna(axis=1) # drops every column that contains a NaN

Unnamed: 0,1
0,2
1,3
2,6


In [22]:
# axis='columns' is the named spelling of axis=1 and gives the same result.
data.dropna(axis='columns')

Unnamed: 0,1
0,2
1,3
2,6


In [23]:
# Still the original frame, NaNs included.
data

Unnamed: 0,0,1,2
0,1.0,2,
1,,3,1.0
2,5.0,6,7.0


In [24]:
# Add a column labelled 3 that is NaN in every row (this mutates data in place).
data[3] = np.nan

In [25]:
# data now carries the extra all-NaN column 3.
data

Unnamed: 0,0,1,2,3
0,1.0,2,,
1,,3,1.0,
2,5.0,6,7.0,


In [28]:
data.dropna(thresh=3) # keep only rows with at least 3 non-null values

Unnamed: 0,0,1,2,3
2,5.0,6,7.0,


#### Filling null values

In [29]:
# Two missing entries, one written as np.nan and one as None.
data = pd.Series([1,2,3,np.nan, 4,None])

In [31]:
# Replace every missing entry with 0.
d = data.fillna(0)

In [32]:
# With the gaps filled the sum is well defined: 1 + 2 + 3 + 0 + 4 + 0.
np.sum(d)

10.0

In [33]:
# Rebuild the 3x3 frame with one NaN in row 0 and one in row 1.
data = pd.DataFrame([ [1,2,np.nan],
               [np.nan,3,1],
               [5,6,7]])

In [34]:
# Fill every NaN with the constant 0.
data.fillna(0)

Unnamed: 0,0,1,2
0,1.0,2,0.0
1,0.0,3,1.0
2,5.0,6,7.0


In [35]:
# Forward-fill down each column: a NaN takes the last valid value above it.
# The NaN in the first row of column 2 has nothing above it, so it stays NaN.
# ffill() is used instead of fillna(method='ffill'), which newer pandas deprecates.
data.ffill()

Unnamed: 0,0,1,2
0,1.0,2,
1,1.0,3,1.0
2,5.0,6,7.0


In [38]:
# Forward-fill along each row (left to right). A NaN in the first column has
# no value to its left and is therefore left as NaN.
# ffill() is used instead of fillna(method='ffill'), which newer pandas deprecates.
data.ffill(axis=1)

Unnamed: 0,0,1,2
0,1.0,2.0,2.0
1,,3.0,1.0
2,5.0,6.0,7.0


In [39]:
# The fills above returned new frames; data itself still has its NaNs.
data

Unnamed: 0,0,1,2
0,1.0,2,
1,,3,1.0
2,5.0,6,7.0


In [40]:
# Keep the row-wise forward-filled frame by rebinding data to the result.
# ffill() is used instead of fillna(method='ffill'), which newer pandas deprecates.
data = data.ffill(axis=1)

In [41]:
# data now holds the row-wise forward-filled values; the NaN in column 0 remains.
data

Unnamed: 0,0,1,2
0,1.0,2.0,2.0
1,,3.0,1.0
2,5.0,6.0,7.0


In [42]:
# Back-fill along each row (right to left): the remaining NaN in column 0
# takes the value from the column to its right.
# bfill() is used instead of fillna(method='bfill'), which newer pandas deprecates.
data.bfill(axis=1)

Unnamed: 0,0,1,2
0,1.0,2.0,2.0
1,3.0,3.0,1.0
2,5.0,6.0,7.0


In [44]:
# Series that ends in two consecutive NaNs.
d = pd.Series([1,2,np.nan,np.nan])

In [45]:
# Forward-fill: both trailing NaNs receive the last valid value (2.0).
# ffill() is used instead of fillna(method='ffill'), which newer pandas deprecates.
d.ffill()

0    1.0
1    2.0
2    2.0
3    2.0
dtype: float64

## Hierarchical Indexing
Index keys can be tuples.

In [1]:
import pandas as pd

# Tuple keys of the form (state, year), grouped by state.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]

# Population figures listed in the same order as `index`.
populations = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]

# Series whose labels are the (state, year) tuples.
popl = pd.Series(populations, index=index)

In [2]:
# Each index label is a (state, year) tuple.
popl

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [51]:
# Label-based slice between two tuple keys; both endpoints are included.
popl[('New York',2000):('Texas',2010)]

(New York, 2000)    18976457
(New York, 2010)    19378102
(Texas, 2000)       20851820
(Texas, 2010)       25145561
dtype: int64

In [52]:
# The integers 0 through 9 as a list.
list(range(10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [54]:
# Collect the tuple keys whose second element (the year) is 2010.
l = [key for key in popl.index if key[1] == 2010]

In [56]:
# Select the 2010 entries by passing the list of matching tuple keys.
popl[l]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

#### Pandas MultiIndex

In [4]:
# Build a two-level MultiIndex from the (state, year) tuples.
idx = pd.MultiIndex.from_tuples(index)
# Printing shows the distinct values of each level and, per row, the positions into them.
print(idx)

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])


In [6]:
# Re-label the series with the MultiIndex so each level can be addressed on its own.
final = popl.reindex(idx)

In [7]:
# All states, with the second level fixed to 2010.
final[:,2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [64]:
# Partial indexing on the first level: the per-year values for Texas.
final['Texas']

2000    20851820
2010    25145561
dtype: int64

In [8]:
# The display shows both index levels; repeated outer labels are left blank.
final

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [9]:
# Pivot the inner index level (year) into columns, giving a state-by-year table.
df = final.unstack()
df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [71]:
# Row lookup by state label on the unstacked frame.
df.loc['Texas']

2000    20851820
2010    25145561
Name: Texas, dtype: int64

In [76]:
# 'total' brings final's (state, year) index; the plain 'under18' list supplies
# one value per row in that same order.
pop_df = pd.DataFrame({'total': final,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})


In [77]:
# Two columns sharing the (state, year) row index.
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [78]:
# Fraction of the population under 18, computed element-wise per (state, year).
f_u18 = pop_df['under18'] / pop_df['total']

In [82]:
# Years become columns. Note: this rebinds the name d used in earlier cells.
d = f_u18.unstack()

In [81]:
# The under-18 fraction as a (state, year)-indexed Series.
f_u18

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [84]:
# stack() reverses the unstack: back to the (state, year)-indexed Series.
d.stack()

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [87]:
# Passing a list of two label arrays as the index yields a two-level row index.
# The values come from an unseeded random generator, so they differ on each run.
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 3]],
                  columns=['data1', 'data2'])

In [88]:
# Rows are labelled by the (letter, number) pairs given above.
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.13532,0.800065
a,2,0.929209,0.297294
b,1,0.433965,0.856967
b,3,0.49424,0.835004


In [90]:
# The same two-level index, built explicitly from one array per level.
idx = pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 3]])

In [91]:
# No column labels are given, so the columns default to 0 and 1.
df = pd.DataFrame(np.random.rand(4, 2), index=idx)

In [92]:
# Same row index as before, now with default integer column labels.
df

Unnamed: 0,Unnamed: 1,0,1
a,1,0.31004,0.517741
a,2,0.123645,0.439505
b,1,0.876273,0.951409
b,3,0.477135,0.565913


In [96]:
# Cartesian product of the two level lists: (a, 1), (a, 2), (b, 1), (b, 2).
idx = pd.MultiIndex.from_product([ ['a','b'], [1,2]])

In [101]:
# Random 4x2 frame on the product index, with columns named 'a' and 'b'.
df = pd.DataFrame(np.random.rand(4, 2), index=idx, columns=['a','b'])

In [102]:
# NOTE(review): the saved output already shows the 'state'/'category' level
# names that are only assigned in the next cell, so this output appears to come
# from an out-of-order run; re-execute to refresh it.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
state,category,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0.207309,0.405784
a,2,0.106656,0.202791
b,1,0.755776,0.292545
b,2,0.819175,0.994821


In [103]:
# Name the two row-index levels (this mutates df in place).
df.index.names = ['state','category']

In [104]:
# The level names now appear as headers above the row labels.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
state,category,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0.207309,0.405784
a,2,0.106656,0.202791
b,1,0.755776,0.292545
b,2,0.819175,0.994821


In [105]:
# Column selection keeps the full two-level row index.
df['a']

state  category
a      1           0.207309
       2           0.106656
b      1           0.755776
       2           0.819175
Name: a, dtype: float64

In [107]:
# Selecting outer label 'a' drops that level, leaving rows indexed by category.
df.loc['a']

Unnamed: 0_level_0,a,b
category,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.207309,0.405784
2,0.106656,0.202791


In [108]:
# df itself is unchanged by the lookups above.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
state,category,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0.207309,0.405784
a,2,0.106656,0.202791
b,1,0.755776,0.292545
b,2,0.819175,0.994821


In [111]:
# Both levels specified: returns the single row (a, 1) as a Series over the columns.
df.loc['a',1]

a    0.207309
b    0.405784
Name: (a, 1), dtype: float64