#Chapter 5 : pandas basics

Convention:

In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
%matplotlib inline

Series: array-like, but with index, also a dict mapping index to values.

In [None]:
series = Series([value], index=[])
pd.isnull(object) == object.isnull()
pd.notnull(object)

Automatic alignment for indexed data

In [5]:
sdata = {'Ohio':35000, 'Texas':71000, 'Utah':5000, 'Oregon':16000}
s1 = Series(sdata)
s2 = Series(sdata, index=['Ohio','Texas', 'Oregon','Califonia'])
s1+s2

Califonia       NaN
Ohio          70000
Oregon        32000
Texas        142000
Utah            NaN
dtype: float64

In [7]:
s2.name = 'population'
s2.index.name = 'state'
s2

state
Ohio         35000
Texas        71000
Oregon       16000
Califonia      NaN
Name: population, dtype: float64

Dataframe:

In [13]:
data = {'state':['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'year':[2000, 2001, 2002, 2001, 2002],
       'pop':[1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data, columns=['year', 'state', 'pop']) # specify order of cols
frame

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [15]:
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=[1,2,3,4,5])
frame2

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1.5,
2,2001,Ohio,1.7,
3,2002,Ohio,3.6,
4,2001,Nevada,2.4,
5,2002,Nevada,2.9,


In [16]:
frame2.columns

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

In [17]:
frame2.year #frame2['year']

1    2000
2    2001
3    2002
4    2001
5    2002
Name: year, dtype: int64

In [20]:
frame2.ix[3]

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 3, dtype: object

In [26]:
frame2.debt = np.arange(5.)
frame2

Unnamed: 0,year,state,pop,debt
1,2000,Ohio,1.5,0
2,2001,Ohio,1.7,1
3,2002,Ohio,3.6,2
4,2001,Nevada,2.4,3
5,2002,Nevada,2.9,4


In [28]:
frame2['eastern'] = frame2.state=='Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
1,2000,Ohio,1.5,0,True
2,2001,Ohio,1.7,1,True
3,2002,Ohio,3.6,2,True
4,2001,Nevada,2.4,3,False
5,2002,Nevada,2.9,4,False


In [32]:
del frame2['eastern'] # delete a column

In [35]:
pop = {'Nevade':{2001:2.4, 2002:2.9}, 'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevade,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [36]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevade,,2.4,2.9
Ohio,1.5,1.7,3.6


In [41]:
pdata = {'Ohio':frame3['Ohio'][:-1], 'Nevade':frame3['Nevade'][:2]}
DataFrame(pdata)

Unnamed: 0,Nevade,Ohio
2000,,1.5
2001,2.4,1.7


In [44]:
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Nevade,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [45]:
frame3.values # returns an array

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

Index objects : immutable, set-like?

In [47]:
index = pd.Index(np.arange(3))
obj2 = Series([1,2,3], index=index)
obj2.index is index

True

In [49]:
'Ohio' in frame3.columns

True

**Reindexing** : rearranges the data according to new index  
Use `method=ffill/bfill` for forward or backward fill -- only apply to rows  
Use `columns=[]` to specify column reindexing

In [52]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [54]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [55]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

Use `method=ffill/bfill` for forward or backward fill 

In [57]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [58]:
obj3.reindex(range(6), method='bfill')

0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

In [61]:
frame = DataFrame(np.arange(9).reshape(3,3), index=['a','c','d'], columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [63]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [64]:
frame3 = frame.reindex(columns=['Ohio', 'Texas', 'California', 'Utah'])
frame3

Unnamed: 0,Ohio,Texas,California,Utah
a,0,1,2,
c,3,4,5,
d,6,7,8,


In [65]:
frame4 = frame.reindex(['a','b','c','d'], columns=['Ohio', 'Texas', 'California', 'Utah'], method='ffill')
frame4

Unnamed: 0,Ohio,Texas,California,Utah
a,0,1,2,
b,0,1,2,
c,3,4,5,
d,6,7,8,


Dropping entries from an axis

In [67]:
obj = Series(np.arange(5.), index=['a','b','c','d','e'])
new = obj.drop(['a','c'])
new

b    1
d    3
e    4
dtype: float64

In [68]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [71]:
frame2.drop(['b'])

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [70]:
frame2.drop(['Ohio', 'California'], axis=1)

Unnamed: 0,Texas
a,1.0
b,
c,4.0
d,7.0


Slicing : rich label indexing using `ix[]`, binary array

In [73]:
frame2[frame2['California']>=5]

Unnamed: 0,Ohio,Texas,California
c,3,4,5
d,6,7,8


In [74]:
frame2.ix[['c','d'],['Ohio','California']]

Unnamed: 0,Ohio,California
c,3,5
d,6,8


Arithmetic 

In [10]:
df1 = DataFrame(np.arange(9).reshape((3,3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12).reshape((4,3)), columns=list('bde'), index=['Utah','Ohio', 'Texas', 'Oregon'])
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [11]:
df1.add(df2, fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6,7.0,8,
Ohio,3,1.0,6,5.0
Oregon,9,,10,11.0
Texas,9,4.0,12,8.0
Utah,0,,1,2.0


Dataframe and series: match the index of series on the column, then braodcast down the row

In [16]:
frame = DataFrame(np.arange(9).reshape(3,3), index=['a','c','d'], columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [17]:
series = frame.ix[0]
series

Ohio          0
Texas         1
California    2
Name: a, dtype: int32

In [18]:
frame - series

Unnamed: 0,Ohio,Texas,California
a,0,0,0
c,3,3,3
d,6,6,6


In [21]:
series2 = Series(np.arange(3), index=['Ohio', 'Texas','Utah'])
frame + series2

Unnamed: 0,California,Ohio,Texas,Utah
a,,0,2,
c,,3,5,
d,,6,8,


In [36]:
series3 = frame.ix[:,'Ohio']
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [35]:
frame.add(series3, axis=0)

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,6,7,8
d,12,13,14


**Function application and mapping:**  
`df.apply(func, axis=)`  
`df.applymap(element_wise_func)`  
`series.map(func)`

In [38]:
frame = DataFrame(np.random.randn(4,3), columns=list('bde'),
                 index=['Utah','Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.280228,-0.650069,0.682587
Ohio,-1.399214,1.416653,-2.003142
Texas,-0.769408,-0.149664,-0.395108
Oregon,-0.378847,0.582722,-1.973602


In [39]:
f = lambda x : x.max() - x.min()

In [40]:
frame.apply(f)

b    1.679442
d    2.066722
e    2.685729
dtype: float64

In [41]:
frame.apply(f, axis=1)

Utah      1.332656
Ohio      3.419795
Texas     0.619745
Oregon    2.556324
dtype: float64

In [43]:
def f(x):
    return Series([x.min(), x.max()], index=['min','max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.399214,-0.650069,-2.003142
max,0.280228,1.416653,0.682587


In [46]:
format = lambda x: '%.2f' % x
frame.applymap(format) # element-wise apply

Unnamed: 0,b,d,e
Utah,0.28,-0.65,0.68
Ohio,-1.4,1.42,-2.0
Texas,-0.77,-0.15,-0.4
Oregon,-0.38,0.58,-1.97


In [49]:
frame['d'].map(format)

Utah      -0.65
Ohio       1.42
Texas     -0.15
Oregon     0.58
Name: d, dtype: object

Sorting and ranking:  
    `df.sort_index(by=[] , axis= , ascending= )`  
    `series.order()` - sort by value  
    `obj.rank(ascending= , method= , axis= )` - returns the rank/order of elements

In [56]:
obj = Series(range(4), index=list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int64

In [57]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [58]:
obj.order()

d    0
a    1
b    2
c    3
dtype: int64

Summary statistics

In [62]:
df = DataFrame([[1.4, np.nan],[7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]], index=list('abcd'), columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [66]:
df.mean()

one    3.083333
two   -2.900000
dtype: float64

In [67]:
df.cumsum(skipna=True)

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [68]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3
