# Chapter 5: pandas basics

Import conventions:

In [23]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

Series: a one-dimensional, array-like object with an associated index; it can also be viewed as a dict mapping index labels to values.

In [None]:
# Schematic cheat-sheet, not runnable as written (the cell was never executed):
# `value` is not defined in this notebook and `object` is used as a stand-in
# for a real Series/DataFrame.
series = Series([value], index=[])
pd.isnull(object) == object.isnull()  # note form: function and method spellings are presented as equivalent
pd.notnull(object)

Automatic alignment for indexed data

In [5]:
# Two Series built from the same dict; s2 is given an explicit index that
# omits 'Utah' and adds 'Califonia', a label with no entry in sdata.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Utah': 5000,
    'Oregon': 16000,
}
s1 = Series(sdata)
s2 = Series(sdata, index=['Ohio', 'Texas', 'Oregon', 'Califonia'])

# Addition aligns on index labels; a label missing from either side gives NaN.
s1 + s2

Califonia       NaN
Ohio          70000
Oregon        32000
Texas        142000
Utah            NaN
dtype: float64

In [7]:
# Label the index, then the Series itself; the two settings are independent
# and both appear in the printed representation.
s2.index.name = 'state'
s2.name = 'population'
s2

state
Ohio         35000
Texas        71000
Oregon       16000
Califonia      NaN
Name: population, dtype: float64

DataFrame:

In [13]:
# Column-oriented input: each key maps to one equal-length list of values.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
# `columns` fixes the left-to-right order of the columns in the result.
frame = DataFrame(data, columns=['year', 'state', 'pop'])
frame

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [15]:
# 'debt' is not a key of `data`, so that column comes out empty (NaN);
# `index` supplies the row labels 1..5 instead of the default 0..4.
frame2 = DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=[1, 2, 3, 4, 5],
)
frame2

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1.5,
2,2001,Ohio,1.7,
3,2002,Ohio,3.6,
4,2001,Nevada,2.4,
5,2002,Nevada,2.9,


In [16]:
# The column labels are exposed as an Index object.
frame2.columns

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

In [17]:
frame2['year']  # same column as attribute access: frame2.year

1    2000
2    2001
3    2002
4    2001
5    2002
Name: year, dtype: int64

In [20]:
# Select the row whose index label is 3. `.ix` was deprecated in pandas 0.20
# and removed in 1.0; `.loc` is the explicit label-based accessor and returns
# the same row here (frame2's index holds the labels 1..5).
frame2.loc[3]

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 3, dtype: object

In [26]:
# Use bracket assignment to set the column. Attribute-style assignment
# (frame2.debt = ...) only updates a column that already exists; for a new
# name it sets a plain Python attribute rather than creating a column.
frame2['debt'] = np.arange(5.)  # one value per row (frame2 has 5 rows)
frame2

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1.5,0
2,2001,Ohio,1.7,1
3,2002,Ohio,3.6,2
4,2001,Nevada,2.4,3
5,2002,Nevada,2.9,4


In [28]:
# Boolean column: True on rows whose state is 'Ohio'. Assigning to a key that
# does not exist yet creates the column.
frame2['eastern'] = (frame2['state'] == 'Ohio')
frame2

Unnamed: 0,year,state,pop,debt,eastern
1,2000,Ohio,1.5,0,True
2,2001,Ohio,1.7,1,True
3,2002,Ohio,3.6,2,True
4,2001,Nevada,2.4,3,False
5,2002,Nevada,2.9,4,False


In [32]:
del frame2['eastern'] # delete a column; this removes it from frame2 itself

In [35]:
# Nested dict: outer keys become the columns, inner keys become the row labels.
pop = {
    'Nevade': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevade,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [36]:
frame3.transpose()  # swap rows and columns (long form of .T)

Unnamed: 0,2000,2001,2002
Nevade,,2.4,2.9
Ohio,1.5,1.7,3.6


In [41]:
# A dict of Series also works as input. The slices are positional:
# all but the last 'Ohio' entry, and the first two 'Nevade' entries.
pdata = {
    'Ohio': frame3['Ohio'].iloc[:-1],
    'Nevade': frame3['Nevade'].iloc[:2],
}
DataFrame(pdata)

Unnamed: 0,Nevade,Ohio
2000,,1.5
2001,2.4,1.7


In [44]:
# Name both axes; the names are shown in the frame's printed header.
frame3.columns.name = 'state'
frame3.index.name = 'year'
frame3

state,Nevade,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [45]:
frame3.values # returns the frame's data as a 2-D NumPy array

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

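Index objects: immutable; they behave like a fixed-size set (e.g. they support `in`), although unlike a set they may contain duplicate labels.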

In [47]:
# Build the Index up front and hand that same object to the Series.
index = pd.Index(np.arange(3))
obj2 = Series(data=[1, 2, 3], index=index)

# Identity check (not just equality) between the Series' index and `index`.
obj2.index is index

True

In [49]:
'Ohio' in frame3.columns  # membership test against the column labels

True

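**Reindexing**: rearranges the data to conform to a new index; labels that were not present before get missing values (NaN) unless a fill is specified.  
Use `method='ffill'` or `method='bfill'` for forward or backward fill -- this only applies to rows.  
Use `columns=[...]` to reindex the columns.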

In [52]:
# Index labels are deliberately out of alphabetical order.
obj = Series(
    [4.5, 7.2, -5.3, 3.6],
    index=['d', 'b', 'a', 'c'],
)
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [54]:
# Reorder by the new labels; 'e' has no existing value, so it takes
# `fill_value` (0) instead of NaN.
obj2 = obj.reindex(
    ['a', 'b', 'c', 'd', 'e'],
    fill_value=0,
)
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [55]:
# Integer labels with gaps (1, 3 and 5 are absent) to fill in when reindexing.
obj3 = Series(
    ['blue', 'purple', 'yellow'],
    index=[0, 2, 4],
)
obj3

0      blue
2    purple
4    yellow
dtype: object

Use `method='ffill'` or `method='bfill'` for forward or backward fill:

In [57]:
# Forward fill: each new label takes the value of the closest earlier label.
obj3.reindex(index=range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [58]:
# Backward fill: each new label takes the value of the closest later label;
# label 5 has nothing after it, so it stays NaN.
obj3.reindex(index=range(6), method='bfill')

0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

In [61]:
# 3x3 frame holding 0..8; the row labels skip 'b' on purpose.
frame = DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [63]:
# Reindex the rows only; the new label 'b' becomes a row of NaN.
frame2 = frame.reindex(index=['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [64]:
# Reindex the columns only; the new label 'Utah' becomes a column of NaN.
frame3 = frame.reindex(
    columns=['Ohio', 'Texas', 'California', 'Utah'],
)
frame3

Unnamed: 0,Ohio,Texas,California,Utah
a,0,1,2,
c,3,4,5,
d,6,7,8,


In [65]:
# Reindex rows and columns in two steps. Passing `method='ffill'` together with
# `columns=` applies the fill to the column axis as well, and current pandas
# raises ValueError there because the column labels
# ('Ohio', 'Texas', 'California') are not monotonic.
# Step 1: forward-fill along the rows, so the new row 'b' copies row 'a'.
# Step 2: add the 'Utah' column without any fill, so it stays NaN.
frame4 = (
    frame
    .reindex(['a', 'b', 'c', 'd'], method='ffill')
    .reindex(columns=['Ohio', 'Texas', 'California', 'Utah'])
)
frame4

Unnamed: 0,Ohio,Texas,California,Utah
a,0,1,2,
b,0,1,2,
c,3,4,5,
d,6,7,8,


Dropping entries from an axis

In [67]:
# drop() returns a new Series without the given labels; `obj` is left as is.
obj = Series(np.arange(5.), index=list('abcde'))
new = obj.drop(['a', 'c'])
new

b    1
d    3
e    4
dtype: float64

In [68]:
frame2  # redisplay the row-reindexed frame (row 'b' is all NaN) before dropping from it

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [71]:
# Drop by row label (axis 0 is the default); a new frame is returned.
frame2.drop(['b'], axis=0)

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [70]:
# Drop by column label; a new frame is returned.
frame2.drop(['Ohio', 'California'], axis='columns')

Unnamed: 0,Texas
a,1.0
b,
c,4.0
d,7.0


Slicing: rich label-based indexing using `ix[]` (removed in pandas 1.0; use `loc[]` / `iloc[]` in current versions), or row filtering with a boolean array

In [73]:
# Keep the rows where California >= 5; the NaN row compares False and is dropped.
frame2.loc[frame2['California'] >= 5]

Unnamed: 0,Ohio,Texas,California
c,3,4,5
d,6,7,8


In [74]:
# Select rows 'c' and 'd' and columns 'Ohio' and 'California' by label.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` is the
# label-based accessor and makes the same selection.
frame2.loc[['c', 'd'], ['Ohio', 'California']]

Unnamed: 0,Ohio,California
c,3,5
d,6,8


Arithmetic 