# Chapter 5: pandas basics

Convention:

In [70]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
%matplotlib inline
import pandas.io.data as web

Series: a one-dimensional, array-like object with an index; it also behaves like a dict mapping index labels to values.

In [None]:
# Signature reminders rather than runnable code: the cell has no execution
# count, and `value` is not defined anywhere in the visible notebook.
# Construct a Series from values plus an explicit index.
series = Series([value], index=[])
# Missing-value checks: the module-level function and the method form are
# noted here as equivalent; `object` stands in for any Series/DataFrame.
pd.isnull(object) == object.isnull()
pd.notnull(object)

Automatic alignment for indexed data

In [5]:
# Populations keyed by state name.
sdata = dict(Ohio=35000, Texas=71000, Utah=5000, Oregon=16000)

# s1 takes its index from the dict keys. s2 is aligned to an explicit label
# list instead: 'Utah' is left out and 'Califonia' (not a key of sdata)
# has no value to pick up.
s1 = Series(sdata)
s2 = Series(
    sdata,
    index=['Ohio', 'Texas', 'Oregon', 'Califonia'],
)

# Arithmetic aligns the two operands on their index labels.
s1 + s2

Califonia       NaN
Ohio          70000
Oregon        32000
Texas        142000
Utah            NaN
dtype: float64

In [7]:
s2.name = 'population'
s2.index.name = 'state'
s2

state
Ohio         35000
Texas        71000
Oregon       16000
Califonia      NaN
Name: population, dtype: float64

Dataframe:

In [13]:
# Column-oriented input: each key holds the values of one column.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}

# `columns` pins the left-to-right order of the columns in the frame.
frame = DataFrame(
    data,
    columns=['year', 'state', 'pop'],
)
frame

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [15]:
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=[1,2,3,4,5])
frame2

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1.5,
2,2001,Ohio,1.7,
3,2002,Ohio,3.6,
4,2001,Nevada,2.4,
5,2002,Nevada,2.9,


In [16]:
frame2.columns

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

In [17]:
frame2.year #frame2['year']

1    2000
2    2001
3    2002
4    2001
5    2002
Name: year, dtype: int64

In [20]:
# Select the row whose index *label* is 3 (frame2 is indexed 1..5, so this is
# the third row). `.loc` is explicitly label-based, whereas `.ix` guessed
# between labels and positions and is no longer available in current pandas.
frame2.loc[3]

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 3, dtype: object

In [26]:
# Assign the column through item access. Attribute assignment (`frame2.debt = ...`)
# only updates a column that already exists; for a new name it would silently
# create a plain Python attribute instead of a column.
frame2['debt'] = np.arange(5.)
frame2

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1.5,0
2,2001,Ohio,1.7,1
3,2002,Ohio,3.6,2
4,2001,Nevada,2.4,3
5,2002,Nevada,2.9,4


In [28]:
frame2['eastern'] = frame2.state=='Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
1,2000,Ohio,1.5,0,True
2,2001,Ohio,1.7,1,True
3,2002,Ohio,3.6,2,True
4,2001,Nevada,2.4,3,False
5,2002,Nevada,2.9,4,False


In [32]:
del frame2['eastern'] # delete a column

In [35]:
# Dict of dicts: the outer keys become columns and the inner keys become
# row labels; a year missing from one inner dict shows up as NaN.
pop = {
    'Nevade': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = DataFrame(data=pop)
frame3

Unnamed: 0,Nevade,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [36]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevade,,2.4,2.9
Ohio,1.5,1.7,3.6


In [41]:
pdata = {'Ohio':frame3['Ohio'][:-1], 'Nevade':frame3['Nevade'][:2]}
DataFrame(pdata)

Unnamed: 0,Nevade,Ohio
2000,,1.5
2001,2.4,1.7


In [44]:
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Nevade,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [45]:
frame3.values # returns an array

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

Index objects : immutable, set-like?

In [47]:
index = pd.Index(np.arange(3))
obj2 = Series([1,2,3], index=index)
obj2.index is index

True

In [49]:
'Ohio' in frame3.columns

True

**Reindexing** : rearranges the data according to new index  
Use `method='ffill'/'bfill'` for forward or backward fill -- applies to rows only  
Use `columns=[]` to specify column reindexing

In [52]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [54]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [55]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

Use `method=ffill/bfill` for forward or backward fill 

In [57]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [58]:
obj3.reindex(range(6), method='bfill')

0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

In [61]:
frame = DataFrame(np.arange(9).reshape(3,3), index=['a','c','d'], columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [63]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [64]:
frame3 = frame.reindex(columns=['Ohio', 'Texas', 'California', 'Utah'])
frame3

Unnamed: 0,Ohio,Texas,California,Utah
a,0,1,2,
c,3,4,5,
d,6,7,8,


In [65]:
frame4 = frame.reindex(['a','b','c','d'], columns=['Ohio', 'Texas', 'California', 'Utah'], method='ffill')
frame4

Unnamed: 0,Ohio,Texas,California,Utah
a,0,1,2,
b,0,1,2,
c,3,4,5,
d,6,7,8,


Dropping entries from an axis

In [67]:
obj = Series(np.arange(5.), index=list('abcde'))

# drop() hands back a Series without the listed labels; the result is kept
# under a separate name.
new = obj.drop(['a', 'c'])
new

b    1
d    3
e    4
dtype: float64

In [68]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [71]:
frame2.drop(['b'])

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [70]:
frame2.drop(['Ohio', 'California'], axis=1)

Unnamed: 0,Texas
a,1.0
b,
c,4.0
d,7.0


Slicing: rich label-based indexing using `ix[]`, or selection with a boolean array

In [73]:
frame2[frame2['California']>=5]

Unnamed: 0,Ohio,Texas,California
c,3,4,5
d,6,7,8


In [74]:
# Pick rows 'c' and 'd' and columns 'Ohio' and 'California' by label.
# `.loc` is the explicit label-based indexer; `.ix` is no longer available
# in current pandas.
frame2.loc[['c', 'd'], ['Ohio', 'California']]

Unnamed: 0,Ohio,California
c,3,5
d,6,8


Arithmetic 

In [10]:
# Two frames that only partly overlap in both row and column labels.
df1 = DataFrame(
    np.arange(9).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)
df2 = DataFrame(
    np.arange(12).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)

# Addition aligns on both axes; cells that exist in only one frame come out NaN.
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [11]:
df1.add(df2, fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6,7.0,8,
Ohio,3,1.0,6,5.0
Oregon,9,,10,11.0
Texas,9,4.0,12,8.0
Utah,0,,1,2.0


DataFrame and Series: the Series index is matched against the DataFrame's columns, then the operation is broadcast down the rows

In [16]:
frame = DataFrame(np.arange(9).reshape(3,3), index=['a','c','d'], columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [17]:
# Take the first row by *position*. frame is indexed by the strings
# 'a', 'c', 'd', so the integer 0 can only mean a position; `.iloc` says so
# explicitly, whereas `.ix` is no longer available in current pandas.
series = frame.iloc[0]
series

Ohio          0
Texas         1
California    2
Name: a, dtype: int32

In [18]:
frame - series

Unnamed: 0,Ohio,Texas,California
a,0,0,0
c,3,3,3
d,6,6,6


In [21]:
series2 = Series(np.arange(3), index=['Ohio', 'Texas','Utah'])
frame + series2

Unnamed: 0,California,Ohio,Texas,Utah
a,,0,2,
c,,3,5,
d,,6,8,


In [36]:
# Pull the 'Ohio' column out as a Series, by label. `.loc` replaces `.ix`,
# which is no longer available in current pandas.
series3 = frame.loc[:, 'Ohio']
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [35]:
frame.add(series3, axis=0)

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,6,7,8
d,12,13,14


**Function application and mapping:**  
`df.apply(func, axis=)`  
`df.applymap(element_wise_func)`  
`series.map(func)`

In [38]:
# Seed NumPy's global generator so the random frame, and every result derived
# from it below, is reproducible under Restart & Run All.
np.random.seed(0)
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.280228,-0.650069,0.682587
Ohio,-1.399214,1.416653,-2.003142
Texas,-0.769408,-0.149664,-0.395108
Oregon,-0.378847,0.582722,-1.973602


In [39]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` is any object exposing ``max()`` and ``min()``; with
    ``DataFrame.apply`` it is one column (or one row, with ``axis=1``).
    """
    return x.max() - x.min()

In [40]:
frame.apply(f)

b    1.679442
d    2.066722
e    2.685729
dtype: float64

In [41]:
frame.apply(f, axis=1)

Utah      1.332656
Ohio      3.419795
Texas     0.619745
Oregon    2.556324
dtype: float64

In [43]:
def f(x):
    """Summarise ``x`` as a two-entry Series of its minimum and maximum.

    ``x`` is any object exposing ``min()`` and ``max()``. The result is
    labelled 'min' and 'max', in that order.
    """
    lowest, highest = x.min(), x.max()
    return Series([lowest, highest], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.399214,-0.650069,-2.003142
max,0.280228,1.416653,0.682587


In [46]:
# Render a number as a string with two decimal places.
# NOTE(review): this rebinds the name `format`, hiding Python's built-in
# format() in the notebook namespace; a later cell passes this same name
# to Series.map, so it cannot be renamed here alone.
format = lambda x: '%.2f' % x
frame.applymap(format) # element-wise apply

Unnamed: 0,b,d,e
Utah,0.28,-0.65,0.68
Ohio,-1.4,1.42,-2.0
Texas,-0.77,-0.15,-0.4
Oregon,-0.38,0.58,-1.97


In [49]:
frame['d'].map(format)

Utah      -0.65
Ohio       1.42
Texas     -0.15
Oregon     0.58
Name: d, dtype: object

Sorting and ranking:  
    `df.sort_index(by=[] , axis= , ascending= )`  
    `series.order()` - sort by value  
    `obj.rank(ascending= , method= , axis= )` - returns the rank/order of elements

In [56]:
obj = Series(range(4), index=list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int64

In [57]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [58]:
obj.order()

d    0
a    1
b    2
c    3
dtype: int64

Summary statistics

In [62]:
# Small frame with missing values: row 'c' is entirely NaN and row 'a'
# is missing only its 'two' value.
df = DataFrame(
    [
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [66]:
df.mean()

one    3.083333
two   -2.900000
dtype: float64

In [67]:
df.cumsum(skipna=True)

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [68]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [69]:
df.quantile()

one    1.4
two   -2.9
dtype: float64

Correlation / Covariance  
+ df.col1.corr(df.col2) - correlation between two columns  
+ df.corr() - correlation matrix  
+ df.corrwith(df.col) - series 
+ df.corrwith(df)  - match the column names  

In [83]:
# Collect one table per ticker symbol, keyed by the symbol.
# NOTE(review): `web` is pandas.io.data; get_data_yahoo presumably downloads
# the history over the network, and the two strings are presumably the start
# and end dates -- confirm before relying on either.
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')

In [85]:
# Build one frame per field with a column per ticker, taking the
# 'Adj Close' and 'Volume' columns from each downloaded table.
# dict.items() is used instead of iteritems(): iteritems() exists only on
# Python 2, while items() works on both Python 2 and Python 3.
price = DataFrame({tic: quotes['Adj Close'] for tic, quotes in all_data.items()})
volume = DataFrame({tic: quotes['Volume'] for tic, quotes in all_data.items()})

In [89]:
returns = price.pct_change() # daily return
returns.head()

Unnamed: 0_level_0,AAPL,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-03,,,
2000-01-04,-0.084312,-0.033944,-0.03378
2000-01-05,0.014637,0.035137,0.010544
2000-01-06,-0.086539,-0.017241,-0.033498
2000-01-07,0.047367,-0.004386,0.013068


In [92]:
returns.MSFT.corr(returns.IBM)

0.49597971937350288

In [93]:
returns.MSFT.cov(returns.IBM)

0.00021595766927684753

In [95]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT
AAPL,1.0,0.410012,0.424305
IBM,0.410012,1.0,0.49598
MSFT,0.424305,0.49598,1.0


In [96]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT
AAPL,0.001027,0.000252,0.000309
IBM,0.000252,0.000367,0.000216
MSFT,0.000309,0.000216,0.000516


In [98]:
returns.corrwith(volume)

AAPL   -0.057549
IBM    -0.007892
MSFT   -0.014245
dtype: float64

Unique values, value counts  
+ obj.unique()  
+ obj.value_counts()
+ obj.isin(values) - boolean mask of membership in `values`

In [102]:
obj = Series(list('cadaabbcc'))
unique = obj.unique()
unique

array(['c', 'a', 'd', 'b'], dtype=object)

In [103]:
obj.value_counts() # == pd.value_counts(obj, sort=True)

c    3
a    3
b    2
d    1
dtype: int64

In [105]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [106]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [108]:
data = DataFrame({'Qu1':[1,3,4,3,4],
                  'Qu2':[2,3,1,2,3],
                 'Qu3':[1,5,2,4,4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [113]:
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1,1,1
2,0,2,1
3,2,2,0
4,2,0,2
5,0,0,1


## Missing data
Filtering and filling missing data:

+ `data.dropna(how='all'/'any', axis=, thresh=)`
+ `df.fillna(dict to specify value for each column, method='ffill', limit=, inplace=False)`

In [121]:
data = Series([1, np.nan, 3.5, np.nan, 7])
data.dropna() # same as data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [123]:
data = DataFrame([[1, np.nan, 3.5, np.nan, 7],[1, np.nan, 3.5, np.nan, 7]])
data

Unnamed: 0,0,1,2,3,4
0,1,,3.5,,7
1,1,,3.5,,7


In [124]:
data.dropna() # drop rows that contain any NA

Unnamed: 0,0,1,2,3,4


In [126]:
df.fillna(0)

Unnamed: 0,one,two
a,1.4,0.0
b,7.1,-4.5
c,0.0,0.0
d,0.75,-1.3


In [129]:
df.fillna({'one':'aaa', 'two':'bbb'})

Unnamed: 0,one,two
a,1.4,bbb
b,7.1,-4.5
c,aaa,bbb
d,0.75,-1.3


In [130]:
df.fillna(df.mean()) # fill na with column mean 

Unnamed: 0,one,two
a,1.4,-2.9
b,7.1,-4.5
c,3.083333,-2.9
d,0.75,-1.3


### Hierarchical indexing
Index is array or list of list.  
+ `df.unstack()` and `df.stack()`  
+ `df.index.names=[]` and `df.columns.names=[]`
+ `df.swaplevel()`  
+ `df.sortlevel()`  
+ `df.sum(level='key', axis=)` - level-wise summary statistic  
+ `df.set_index([list of column keys], drop=False)`  
  `df.reset_index()` - does the opposite  
+ reliable position indexing: `series.iget_value([position])` and `df.irow/icol([row/col #])`  

In [132]:
data = Series(np.random.randn(10), index=[list('aaabbbccdd'), [1,2,3,1,2,3,1,2,2,3]])
data

a  1   -0.896130
   2    1.168839
   3   -0.985553
b  1   -0.596275
   2   -0.835634
   3   -0.763222
c  1    2.120703
   2   -0.083429
d  2    0.397225
   3    1.487136
dtype: float64

In [133]:
data.index

MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [134]:
data.b

1   -0.596275
2   -0.835634
3   -0.763222
dtype: float64

In [137]:
data[['b','c']]

b  1   -0.596275
   2   -0.835634
   3   -0.763222
c  1    2.120703
   2   -0.083429
dtype: float64

In [138]:
data[:, 2]

a    1.168839
b   -0.835634
c   -0.083429
d    0.397225
dtype: float64

### Reordering and sorting levels

In [143]:
# Frame with two index levels on the rows and two on the columns.
frame = DataFrame(
    np.arange(12).reshape(4, 3),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [146]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [148]:
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [153]:
frame.swaplevel(1,0).sortlevel(0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [155]:
frame = DataFrame(np.arange(6).reshape(3,2), index=[2,0,1])
frame

Unnamed: 0,0,1
2,0,1
0,2,3
1,4,5


In [156]:
# First row by position, regardless of the integer labels (2, 0, 1) in the
# index. `.iloc` is the positional indexer; `irow` is no longer available in
# current pandas.
frame.iloc[0]

0    0
1    1
Name: 2, dtype: int32

In [158]:
# Columns at positions 1 and 0, in that order. `.iloc` is the positional
# indexer; `icol` is no longer available in current pandas.
frame.iloc[:, [1, 0]]

Unnamed: 0,1,0
2,1,0
0,3,2
1,5,4


### Panel
+ `panel.swapaxes(ax1, ax2)`  
+ `panel.to_frame()`  
+ `df.to_panel()`

In [159]:
pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk, '1/1/2009', '6/1/2012')) for stk in ['AAPL', 'MSFT', 'DELL']))

In [161]:
pdata

<class 'pandas.core.panel.Panel'>
Dimensions: 3 (items) x 868 (major_axis) x 6 (minor_axis)
Items axis: AAPL to MSFT
Major_axis axis: 2009-01-02 00:00:00 to 2012-06-01 00:00:00
Minor_axis axis: Open to Adj Close

In [164]:
pdata = pdata.swapaxes('items', 'minor_axis')
pdata

<class 'pandas.core.panel.Panel'>
Dimensions: 6 (items) x 868 (major_axis) x 3 (minor_axis)
Items axis: Open to Adj Close
Major_axis axis: 2009-01-02 00:00:00 to 2012-06-01 00:00:00
Minor_axis axis: AAPL to MSFT

In [None]:
pdata['Adj Close']

In [171]:
pdata.ix[:, '6/10/2010', :]

Unnamed: 0,Open,High,Low,Close,Volume,Adj Close
AAPL,244.83997,250.98003,242.199989,250.509979,194089000,33.616038
DELL,13.03,13.1,12.87,13.07,22500800,12.64327
MSFT,25.129999,25.15,24.780001,25.0,78930900,21.82219


In [169]:
pdata.ix['Adj Close', '5/22/2012':, :]

Unnamed: 0_level_0,AAPL,DELL,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-05-22,74.740031,14.58765,27.329995
2012-05-23,76.56368,12.08221,26.73307
2012-05-24,75.860527,12.04351,26.696335
2012-05-25,75.453926,12.05319,26.687151
2012-05-28,,12.05319,
2012-05-29,76.793152,12.24666,27.146325
2012-05-30,77.719062,12.14992,26.944289
2012-05-31,77.525831,11.92743,26.806537
2012-06-01,75.279486,11.67592,26.126961


In [173]:
pdata.ix[:, '5/30/2012':, :].to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume,Adj Close
Date,minor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-05-30,AAPL,569.20005,579.98999,566.55999,579.169998,132357400,77.719062
2012-05-30,DELL,12.59,12.7,12.46,12.56,19787800,12.14992
2012-05-30,MSFT,29.35,29.48,29.120001,29.34,41585500,26.944289
2012-05-31,AAPL,580.740021,581.499985,571.459969,577.730019,122918600,77.525831
2012-05-31,DELL,12.53,12.54,12.33,12.33,19955600,11.92743
2012-05-31,MSFT,29.299999,29.42,28.940001,29.190001,39134000,26.806537
2012-06-01,AAPL,569.159996,572.649956,560.520012,560.990036,130246900,75.279486
2012-06-01,DELL,12.15,12.3,12.045,12.07,19397600,11.67592
2012-06-01,MSFT,28.76,28.959999,28.440001,28.450001,56634300,26.126961


In [174]:
pdata.ix[:, '5/30/2012':, :].to_frame().to_panel()

<class 'pandas.core.panel.Panel'>
Dimensions: 6 (items) x 3 (major_axis) x 3 (minor_axis)
Items axis: Open to Adj Close
Major_axis axis: 2012-05-30 00:00:00 to 2012-06-01 00:00:00
Minor_axis axis: AAPL to MSFT