# Pandas Notes
Notes on how to use Pandas more fluently.

In [17]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [3]:
# Series --> a one-dimensional, array-like object holding data and
# an associated array of labels (its index)
obj = Series([4,7,-5,3])

In [4]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
# get values of a series using 
obj.values

array([ 4,  7, -5,  3])

In [6]:
# get indices of a series using
obj.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
# often, want to create a series id'ing each data point
obj2 = Series([4,7,-5,3], index=['d','b','a','c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
obj2['a']

-5

In [13]:
obj2['d'] = 6
obj2[['c','a','d']]

c    3
a   -5
d    6
dtype: int64

In [14]:
# can use NumPy-style array operations on a Series (boolean filtering here,
# scalar multiplication and math functions in the next cells);
# each value keeps its index label
obj2[obj2 > 0]

d    6
b    7
c    3
dtype: int64

In [15]:
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [18]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [19]:
# can transform a dict to a series
sdata = {'Ohio':35000, 'Texas':71000, 
         'Oregon': 16000, 'Utah':5000}

In [21]:
obj3 = Series(sdata)
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [23]:
# if you pass only a dict, the resulting Series is indexed by the dict's keys
# (in sorted order in the obj3 output above).
# NOTE(review): newer pandas releases keep the dict's insertion order
# instead of sorting -- confirm for the version in use.
# passing an explicit index, as here, picks values by matching key:
# labels with no matching key ('California') become NaN and
# keys that are not listed ('Utah') are left out
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [24]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [25]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [26]:
# Series automatically aligns differently-indexed data
# in arithmetic operations; labels present in only one
# operand ('California', 'Utah') yield NaN in the result
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [29]:
# both the Series object itself and its index have
# a name attribute, which integrates with
# other key areas of pandas functionality.
# separately, a Series's index can be replaced in place
# by assignment, which is what this cell demonstrates
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

In [31]:
# DataFrame!
# tabular, spreadsheet-like data structure containing
# an ordered collection of columns,
# each of which can have a different value type
# Has both a row and column index
# think of it as a dict of Series that share one row index
# here: built from a dict of equal-length lists, one list per column
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [33]:
# passing columns= fixes the column order of the new DataFrame
# (built here from the same dict as `frame`)
DataFrame(data, columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [36]:
frame2 = DataFrame(data, 
                   columns=['year','state','pop','debt'],
                   index=['one','two','three','four','five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [37]:
# retrieve data from a DataFrame
# by default, we grab columns and get a series
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [39]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [42]:
# grab a row by its index label; the row comes back as a Series.
# .loc is the explicit label-based accessor (.ix is deprecated and has
# been removed from newer pandas releases)
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [44]:
# modify columns by assignment
# send in a scalar value, or an array of values
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [46]:
frame2['debt'] = np.arange(5.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


In [47]:
# when assigning a list or array to a column,
# its length must match the length of the DataFrame.
# if you assign a Series, it is instead conformed
# exactly to the DataFrame's index, inserting missing values (NaN)
# for any row labels the Series does not contain
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

In [48]:
frame2['debt'] = val

In [49]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [51]:
# assigning a column that doesn't exist will create a new col
# the del keyword will delete columns,
# as with a dict
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [52]:
del frame2['eastern']

In [53]:
frame2.columns

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

In [56]:
# nested dicts also work
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
        'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = DataFrame(pop)

In [57]:
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [58]:
# can transpose 
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [62]:
pdata = {'Ohio':frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


In [66]:
# can name the index and columns attributes
frame3.index.name = 'year'; frame3.columns.name = 'state'

In [67]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [68]:
# access a dataframes values as an ndarray with
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [69]:
# when the columns have different dtypes, .values returns an array whose
# dtype can hold all of them (dtype=object for this frame, see output)
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

In [72]:
# axis labels are held in pandas Index objects.
# an Index passed to the Series constructor is used as-is: the identity
# check below evaluated to True in this run.
# note: this rebinds obj2, replacing the string-labelled Series from earlier
index = pd.Index(np.arange(3))
obj2 = Series([1.5,-2.5,0], index=index)
obj2.index is index

True

In [73]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [74]:
'Ohio' in frame3.columns

True

In [75]:
2003 in frame3.index

False

In [76]:
# index methods and properties
# append
# difference (set difference; formerly named diff)
# intersection
# union
# isin
# delete
# drop
# insert
# is_monotonic_increasing (formerly is_monotonic)
# is_unique
# unique

In [77]:
# Essential Functionality
## Reindex
obj = Series([4.5,7.2,-5.3,3.6], 
            index=['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [78]:
obj2 = obj.reindex(['a','b','c','d','e'])

In [79]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [80]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [84]:
# fill data
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [85]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [86]:
# 3x3 integer frame with a gap in the row labels ('b' is missing),
# used below to demonstrate reindexing
frame = DataFrame(
    np.arange(9).reshape(3, 3),
    columns=['Ohio', 'Texas', 'California'],
    index=['a', 'c', 'd'],
)

In [87]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [88]:
 frame2 = frame.reindex(['a', 'b', 'c', 'd'])

In [89]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [90]:
states = ['Texas', 'Utah', 'California']

In [91]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [92]:
# forward-fill along the rows first, then select the columns in a second
# reindex call. Passing method='ffill' together with a columns= list that
# is not sorted ('Texas', 'Utah', 'California') in one call makes newer
# pandas raise, because filling requires monotonic labels on every
# axis being reindexed. The result is the same as the single-call form.
frame.reindex(index=['a', 'b', 'c', 'd'],
              method='ffill').reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
b,1,,2
c,4,,5
d,7,,8


In [93]:
# label-based row selection where one label ('b') does not exist:
# reindex inserts a NaN row for it. (.ix has been removed from newer
# pandas, and .loc raises KeyError when a listed label is missing.)
frame.reindex(['a', 'b', 'c', 'd'])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [94]:
# select rows and columns by label in one step, including labels that do
# not exist yet (row 'b', column 'Utah'); those come back filled with NaN.
# reindex replaces the removed .ix accessor here because .loc would raise
# KeyError on the missing labels
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [95]:
# 4x4 integer frame used to show dropping labels from either axis
data = DataFrame(
    np.arange(16).reshape(4, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)

In [96]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [97]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [99]:
# access the cols with axis=1
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [100]:
data.drop(['two', 'four'], axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [101]:
obj[['b', 'a', 'd']]

b    7.2
a   -5.3
d    4.5
dtype: float64

In [102]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [103]:
 # obj has string labels, so integers here are meant as positions;
 # .iloc makes that explicit (plain [] with integers on a non-integer
 # index relies on a positional fallback that newer pandas deprecates)
 obj.iloc[[1, 3]]

b    7.2
c    3.6
dtype: float64

In [104]:
obj[obj < 2]

a   -5.3
dtype: float64

In [106]:
obj['b':'c']

b    7.2
a   -5.3
c    3.6
dtype: float64

In [108]:
obj['b':'c'] = 5
obj

d    4.5
b    5.0
a    5.0
c    5.0
dtype: float64

In [109]:
# fresh 4x4 integer frame for the DataFrame indexing examples that follow
data = DataFrame(
    np.arange(16).reshape(4, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)

In [110]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [111]:
 data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [112]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [113]:
data[:2] 

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [114]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [115]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [116]:
 data[data < 5] = 0

In [117]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [118]:
# label-based selection of one row and two columns; returns a Series.
# .loc replaces the removed .ix accessor
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [119]:
# rows chosen by label, columns chosen by position (3, 0, 1).
# .loc is purely label-based, so the positions are first translated to
# column labels via data.columns; .ix (removed) used to guess the intent
data.loc[['Colorado', 'Utah'], data.columns[[3, 0, 1]]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [121]:
# third row by integer position (the 'Utah' row); .iloc is the explicit
# position-based accessor that replaces this use of the removed .ix
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [122]:
# label slice on the rows (the end label 'Utah' is included) for a single
# column; .loc replaces the removed .ix accessor
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [126]:
# uniq values, value counts, members
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [127]:
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [128]:
mask = obj.isin(['b', 'c'])

In [129]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [130]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [131]:
# hierarchical indexing
# allows 2 or more index levels on same axis
# allows working with higher dimensional data in lower dim form
# the values come from a seeded generator so that re-running the notebook
# reproduces the same numbers (the stored outputs below were produced by
# an unseeded run and will change once on the next execution)
data = Series(np.random.RandomState(0).randn(10),
    index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
            [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])

In [132]:
data

a  1   -0.089042
   2   -0.712776
   3   -0.169238
b  1   -1.408036
   2   -2.767383
   3    0.984386
c  1   -0.106077
   2   -0.842994
d  2   -0.022082
   3    0.371562
dtype: float64

In [133]:
data.index

MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [134]:
# makes it easy to do 'partial' index
# to concisely select subsets of the data
data['b']

1   -1.408036
2   -2.767383
3    0.984386
dtype: float64

In [135]:
data['b':'c']

b  1   -1.408036
   2   -2.767383
   3    0.984386
c  1   -0.106077
   2   -0.842994
dtype: float64

In [136]:
 # select several outer-level labels at once; .loc is the label-based
 # accessor that replaces the removed .ix
 data.loc[['b', 'd']]

b  1   -1.408036
   2   -2.767383
   3    0.984386
d  2   -0.022082
   3    0.371562
dtype: float64

In [138]:
# and can also select the inner level
data[:, 2]

a   -0.712776
b   -2.767383
c   -0.842994
d   -0.022082
dtype: float64

In [139]:
# make a DataFrame (suitable for a pivot table) with unstack()
# this hierarchical indexing is useful for shaping the data
data.unstack()

Unnamed: 0,1,2,3
a,-0.089042,-0.712776,-0.169238
b,-1.408036,-2.767383,0.984386
c,-0.106077,-0.842994,
d,,-0.022082,0.371562


In [140]:
# the inverse operation of unstack is stack
data.unstack().stack()

a  1   -0.089042
   2   -0.712776
   3   -0.169238
b  1   -1.408036
   2   -2.767383
   3    0.984386
c  1   -0.106077
   2   -0.842994
d  2   -0.022082
   3    0.371562
dtype: float64

In [141]:
data.unstack().stack() == data

a  1    True
   2    True
   3    True
b  1    True
   2    True
   3    True
c  1    True
   2    True
d  2    True
   3    True
dtype: bool

In [142]:
# a DataFrame may carry a hierarchical (multi-level) index on its rows,
# its columns, or both; here each axis gets two levels
frame = DataFrame(
    np.arange(12).reshape(4, 3),
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
)
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [143]:
# hierarchical levels can have names
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [144]:
# and now, we can select groups of columns
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [146]:
# can reorder levels, too
# without altering data, or value structure
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [148]:
# sorting by an index level does change the row order (unlike swaplevel).
# sort_index(level=...) is the replacement for the removed sortlevel()
frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [149]:
# swap the two row levels, then sort on the new outer level so the rows
# are grouped by key2; sort_index(level=0) replaces the removed sortlevel(0)
frame.swaplevel(0, 1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [150]:
# summary statistics by level!
# group the rows on the named index level and aggregate each group.
# groupby(level=...) is the supported spelling; the level= argument of
# sum() has been removed from newer pandas releases
frame.groupby(level='key2').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [151]:
# same idea along the columns: sum the columns that share a 'color' label.
# transposing, grouping the (now row) level and transposing back avoids
# both the removed sum(level=..., axis=1) form and column-wise groupby
frame.T.groupby(level='color').sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [152]:
# using a DataFrame's columns as its index:
# one or more columns of a DataFrame can be used as the row index
# (columns 'c' and 'd' below become the index in the next cell)
frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
        'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
        'd': [0, 1, 2, 0, 1, 2, 3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [154]:
# set_index creates a new DataFrame using one or more of its
# columns as the index; `frame` itself is left unchanged
frame2 = frame.set_index(['c', 'd'])
frame2
# by default the columns moved into the index are removed from the frame;
# pass drop=False to set_index to keep them as columns as well

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [155]:
# reset index does the opposite
# moves hierarchical indices into the columns
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [158]:
ser = Series(np.arange(3.))
ser
# ser[-1] fails here: with an integer index, [] looks the value up by
# label, and there is no label -1 (it is not treated as "last element")

0    0.0
1    1.0
2    2.0
dtype: float64

In [160]:
ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [161]:
# last element by position; ser2 has string labels, so say so explicitly
# with .iloc rather than relying on []'s positional fallback for integers,
# which newer pandas deprecates
ser2.iloc[-1]

2.0

In [163]:
# with an integer index, slicing through .loc is label-based, so the end
# label 1 is included and two rows come back (a positional ser.iloc[:1]
# would return only the first row). .loc replaces the removed .ix
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [170]:
# use these methods to get absolute indices, regardless of type
frame = DataFrame(np.arange(6).reshape(3, 2), index=[2, 0, 1])
frame

Unnamed: 0,0,1
2,0,1
0,2,3
1,4,5


In [171]:
# first row by position, whatever the index labels are (the label of this
# row is 2). .iloc replaces the deprecated irow(), whose deprecation
# warning is what leaked into the stored output of this cell
frame.iloc[0]

  if __name__ == '__main__':


0    0
1    1
Name: 2, dtype: int64

In [173]:
# Panel Data
# a three-dimensional analogue to DataFrame
# to create Panel, can use a dict of DataFrame objects,
# or a three-dimensional ndarray
# here: one DataFrame of Yahoo price data per ticker, keyed by ticker symbol
# NOTE(review): this cell only runs on old pandas and needs network access:
#   - pandas.io.data was moved out of pandas into the separate
#     pandas-datareader package, so this import fails on newer releases
#   - pd.Panel was deprecated and later removed; a DataFrame with a
#     MultiIndex (e.g. pd.concat over the same dict of frames) is the
#     usual replacement
#   - TODO confirm the data source behind get_data_yahoo still responds
# NOTE(review): `pdata` was bound to a plain dict of Series in an earlier
# cell; this rebinds the name to a different kind of object
import pandas.io.data as web
pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk, '1/1/2009', '6/1/2012'))
 for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))