In [2]:
import pandas as pd
import numpy as np

## Handling missing data

In [11]:
# Small Series of strings with one missing entry and an explicit
# descending integer index (3, 2, 1, 0).
string_data = pd.Series(
    data=['aardvark', 'artichoke', np.nan, 'avocado'],
    index=[3, 2, 1, 0],
)
string_data

3     aardvark
2    artichoke
1          NaN
0      avocado
dtype: object

In [12]:
# Boolean mask: True where the entry is missing (NaN), False otherwise.
string_data.isnull()

3    False
2    False
1     True
0    False
dtype: bool

In [13]:
# Inverse mask: True where the entry is present, False where it is NaN.
string_data.notnull()

3     True
2     True
1    False
0     True
dtype: bool

In [14]:
# Return a copy without the missing entries; the original index labels are kept.
string_data.dropna()

3     aardvark
2    artichoke
0      avocado
dtype: object

In [15]:
# Replace every missing entry with a fixed placeholder value.
string_data.fillna('teste')

3     aardvark
2    artichoke
1        teste
0      avocado
dtype: object

In [16]:
# Forward fill: each missing entry takes the last valid value that precedes it
# (in positional order, not index-label order).
# `fillna(method='ffill')` is deprecated in recent pandas; `.ffill()` is the
# equivalent, supported spelling.
string_data.ffill()

3     aardvark
2    artichoke
1    artichoke
0      avocado
dtype: object

In [17]:
# Backward fill: each missing entry takes the next valid value that follows it
# (in positional order, not index-label order).
# `fillna(method='bfill')` is deprecated in recent pandas; `.bfill()` is the
# equivalent, supported spelling.
string_data.bfill()

3     aardvark
2    artichoke
1      avocado
0      avocado
dtype: object

## Filtering out missing data

In [19]:
# 4x3 float frame mixing complete, partially missing and fully missing rows;
# used by the dropna / fillna examples that follow.
data = pd.DataFrame(
    data=[
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [20]:
# With default arguments, drop every row that contains at least one NaN.
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [21]:
# how='all' drops only the rows in which every value is NaN.
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [23]:
# Add a column labelled 4 that is entirely NaN.
# NOTE: this mutates `data` in place, so every later cell that uses `data`
# sees this extra column; re-running earlier cells afterwards shows it too.
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [24]:
# axis=1 applies the rule to columns: drop the columns whose values are all NaN.
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [28]:
# thresh sets the minimum number of non-NaN values a row needs to be kept.
data.dropna(thresh=3) # must have at least 3 non-NaN values

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,


In [30]:
# Same threshold rule applied to columns: keep columns with at least 2 non-NaN values.
data.dropna(axis=1, thresh=2)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


## Filling in missing data

In [31]:
# Show the current state of `data` (including the all-NaN column 4 added
# earlier) before the fill examples.
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [32]:
# Replace every NaN in the frame with the same scalar.
data.fillna(0)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,6.5,3.0,0.0


In [33]:
# A dict maps column label -> fill value, so each column gets its own replacement.
data.fillna({0:0, 1:1, 2:2, 4:4})

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,4.0
1,1.0,1.0,2.0,4.0
2,0.0,1.0,2.0,4.0
3,0.0,6.5,3.0,4.0


In [34]:
# Forward fill down each column, but propagate a value across at most one
# consecutive NaN (limit=1); longer gaps stay partially unfilled.
# `fillna(method='ffill')` is deprecated in recent pandas; `.ffill()` is the
# equivalent, supported spelling.
data.ffill(limit=1)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,6.5,3.0,
2,1.0,,,
3,,6.5,3.0,


In [35]:
# Backward fill up each column, but propagate a value across at most one
# consecutive NaN (limit=1); longer gaps stay partially unfilled.
# `fillna(method='bfill')` is deprecated in recent pandas; `.bfill()` is the
# equivalent, supported spelling.
data.bfill(limit=1)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,6.5,3.0,
3,,6.5,3.0,


In [36]:
# Column-wise mean. NaN values are skipped, so an all-NaN column (4) yields NaN.
data.mean()

0    1.0
1    6.5
2    3.0
4    NaN
dtype: float64

In [37]:
# Fill each column's NaNs with that column's mean (the Series from data.mean()
# is matched to columns by label). Column 4 stays NaN because its mean is NaN.
data.fillna(data.mean())

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,6.5,3.0,
2,1.0,6.5,3.0,
3,1.0,6.5,3.0,


## Hierarchical indexing

In [39]:
# Series of 10 random normal values indexed by two levels: an outer letter
# (a-d) and an inner integer (1-3). Not every (letter, integer) pair exists.
# NOTE(review): no random seed is set in the visible cells, so the values
# differ on every run; this cell also rebinds `data`, which held a DataFrame
# in the previous section.
data = pd.Series(
    data=np.random.randn(10),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 2, 3, 1, 2, 2, 3],
    ],
)
data

a  1   -0.717753
   2   -1.893740
   3   -0.757709
b  1   -0.558696
   2    0.715780
   3    0.818767
c  1    0.118824
   2    0.100515
d  2    1.318803
   3    0.487471
dtype: float64

In [40]:
# Passing a list of two label arrays as the index produced a MultiIndex.
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [41]:
# Partial indexing: select by outer-level label only; the result is indexed
# by the remaining inner level.
data['b']

1   -0.558696
2    0.715780
3    0.818767
dtype: float64

In [42]:
# Label slice on the outer level; both endpoints ('b' and 'c') are included.
data['b':'c']

b  1   -0.558696
   2    0.715780
   3    0.818767
c  1    0.118824
   2    0.100515
dtype: float64

In [43]:
# Same outer-level label slice, written explicitly with .loc.
data.loc['b':'c']

b  1   -0.558696
   2    0.715780
   3    0.818767
c  1    0.118824
   2    0.100515
dtype: float64

In [44]:
# Select on the inner level: every outer label, inner label 2 only.
data.loc[:, 2]

a   -1.893740
b    0.715780
c    0.100515
d    1.318803
dtype: float64

In [45]:
# Pivot the inner index level into columns; (outer, inner) pairs that do not
# exist in the Series become NaN.
data.unstack()

Unnamed: 0,1,2,3
a,-0.717753,-1.89374,-0.757709
b,-0.558696,0.71578,0.818767
c,0.118824,0.100515,
d,,1.318803,0.487471


In [46]:
# stack() is the inverse of unstack(): the columns go back into the inner index
# level. As the output shows, the NaN cells introduced by unstack() are dropped.
data.unstack().stack()

a  1   -0.717753
   2   -1.893740
   3   -0.757709
b  1   -0.558696
   2    0.715780
   3    0.818767
c  1    0.118824
   2    0.100515
d  2    1.318803
   3    0.487471
dtype: float64

In [48]:
# Pivot the outer level (level 0) into columns instead of the inner one.
data.unstack(level=0)

Unnamed: 0,a,b,c,d
1,-0.717753,-0.558696,0.118824,
2,-1.89374,0.71578,0.100515,1.318803
3,-0.757709,0.818767,,0.487471


In [49]:
# 4x3 integer frame with a two-level index on both axes:
# rows are (letter, number) pairs, columns are (state, colour) pairs.
frame = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=[
        ['a', 'a', 'b', 'b'],
        [1, 2, 1, 2],
    ],
    columns=[
        ['Ohio', 'Ohio', 'Colorado'],
        ['Green', 'Red', 'Green'],
    ],
)
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [50]:
# Name the two row-index levels and the two column-index levels so they can be
# referred to by name later. Assigning to `.names` modifies `frame` in place.
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [51]:
# Partial column indexing: select every column under the outer label 'Ohio';
# the result keeps only the inner ('color') column level.
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [52]:
# Swap the order of the two row-index levels. Only the level order changes;
# the rows themselves are not re-sorted.
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [53]:
# Sort the rows by index level 1 ('key2'); ties are broken by the remaining level.
# `DataFrame.sortlevel` no longer exists in current pandas;
# `sort_index(level=...)` is the supported equivalent.
frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [55]:
# Swap the two row-index levels, then sort by the new outermost level so the
# swapped index is in lexicographic order.
# `DataFrame.sortlevel` no longer exists in current pandas;
# `sort_index(level=...)` is the supported equivalent.
frame.swaplevel(0, 1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [56]:
# Sum the rows that share the same 'key2' value (one output row per key2 label).
# The `level=` argument of `sum` has been removed from current pandas;
# grouping on the index level and summing is the supported equivalent.
frame.groupby(level='key2').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [57]:
# Sum the rows that share the same 'key1' value (one output row per key1 label).
# The `level=` argument of `sum` has been removed from current pandas;
# grouping on the index level and summing is the supported equivalent.
frame.groupby(level='key1').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [58]:
# Sum the columns that share the same 'color' value (one output column per colour).
# The `level=` argument of `sum` has been removed from current pandas, and
# column-wise groupby (axis=1) is deprecated, so transpose, group the rows on
# the 'color' level, sum, and transpose back.
frame.T.groupby(level='color').sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


## Using a DataFrame's columns

In [59]:
# Flat frame with two numeric columns (a, b) and two columns (c, d) that will
# serve as index keys in the following cells. Rebinds the name `frame`.
frame = pd.DataFrame(
    data={
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one'] * 3 + ['two'] * 4,
        'd': [0, 1, 2, 0, 1, 2, 3],
    }
)
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [64]:
# Build a new frame whose two-level row index comes from columns 'c' and 'd';
# by default those columns are removed from the data. `frame` is left unchanged.
frame2 = frame.set_index(['c', 'd'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [65]:
# drop=False keeps 'c' and 'd' as regular columns in addition to using them as
# the index levels.
frame.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [66]:
# Inverse of set_index: move the index levels back into columns and restore a
# default integer index.
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1
