# Getting Started

## Series

In [24]:
import pandas as pd
import numpy as np
stg = '---'*10
obj = pd.Series([4, 7, -5, 3])
print(obj)
print(stg)
print(obj.values)
print(stg)
print(obj.index)

0    4
1    7
2   -5
3    3
dtype: int64
------------------------------
[ 4  7 -5  3]
------------------------------
Int64Index([0, 1, 2, 3], dtype='int64')


In [4]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2['a']                                 # -5
obj2['d'] = 6
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [6]:
print(obj2[obj2 > 0])
print(stg)
obj2 * 2

d    6
b    7
c    3
dtype: int64
------------------------------


d    12
b    14
a   -10
c     6
dtype: int64

In [8]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

California      NaN
Ohio          35000
Oregon        16000
Texas         71000
dtype: float64

In [9]:
obj3 + obj4

California       NaN
Ohio           70000
Oregon         32000
Texas         142000
Utah             NaN
dtype: float64

In [10]:
obj4.name = 'population'
obj4.index.name = 'state'

## DataFrames

In [12]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [13]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [15]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],index=['one', 'two', 'three', 'four', 'five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [16]:
frame2['state']                # or use frame2.state      to get column and also to set the whole column

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [17]:
frame2.ix['three']            # to get or set row.

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [18]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [20]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
print(frame3)
print(stg)
print(frame3.T)
print(stg)
pd.DataFrame(pop, index=[2001, 2002, 2003])

      Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6
------------------------------
        2000  2001  2002
Nevada   NaN   2.4   2.9
Ohio     1.5   1.7   3.6
------------------------------


Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [21]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


## Index Objects
- Index are immutable

In [25]:
index = pd.Index(np.arange(3))
obj2 = pd.Series([1.5, -2.5, 0], index=index)
obj2.index is index

True

## Reindexing

In [26]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj2)
print(stg)
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
------------------------------


a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [27]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [28]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],columns=['Ohio', 'Texas', 'California'])
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [29]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [30]:
frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill', columns=states)
frame.ix[['a', 'b', 'c', 'd'], states]

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


## Dropping entries from an axis

In [31]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
obj.drop(['d', 'c'])

a    0
b    1
e    4
dtype: float64

In [32]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data.drop(['Colorado', 'Ohio'])
data.drop(['two', 'four'], axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


## Indexing, selection, and filtering

In [33]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj['b']                          #  1.0     is same as obj[1]
obj[2:4]                          
obj[['b', 'a', 'd']]
obj[[1, 3]]
obj[obj < 2]
obj['b':'c']                      # unlike pure python this is end inclusive

b    1
c    2
dtype: float64

In [34]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data['two']
data[['three', 'one']]
data[:2]
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [37]:
print(data < 5)
print(stg)
data[data < 5] = 0
data

            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False
------------------------------


Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [38]:
data.ix[['Colorado', 'Utah'], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [39]:
data.ix[data.three > 5, :3]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


## Arithmetic and data alignment

In [40]:
from pandas import *

In [41]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [42]:
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1 + df2                      # df1.add(df2, fill_value=0) is you dont want NaN

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [43]:
arr = np.arange(12.).reshape((3, 4))
arr[0]                           # array([ 0., 1., 2., 3.])
arr - arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [44]:
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
print(frame)
print(stg)
print(series)
frame - series

        b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11
------------------------------
b    0
d    1
e    2
Name: Utah, dtype: float64


Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [45]:
series3 = frame['d']
print(series3)
frame.sub(series3, axis=0)

Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: float64


Unnamed: 0,b,d,e
Utah,-1,0,1
Ohio,-1,0,1
Texas,-1,0,1
Oregon,-1,0,1


## Function application and mapping

In [46]:
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
f = lambda x: x.max() - x.min()
print(frame.apply(f))
frame.apply(f, axis=1)

b    2.752940
d    4.093989
e    2.088112
dtype: float64


Utah      1.782106
Ohio      1.898997
Texas     2.458034
Oregon    3.005060
dtype: float64

## Sorting and ranking

In [47]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()                             # obj.order() to sort by values

a    1
b    2
c    3
d    0
dtype: int64

In [48]:
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
print(frame.sort_index())
frame.sort_index(axis=1)                     # frame.sort_index(axis=1, ascending=False)  for descending

       d  a  b  c
one    4  5  6  7
three  0  1  2  3


Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [50]:
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame.sort_values(by='b')
frame.sort_values(by=['a', 'b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [51]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())
obj.rank(method='first')                             # obj.rank(ascending=False, method='max')  for descending order

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64


0    6
1    1
2    7
3    4
4    3
5    2
6    5
dtype: float64

In [52]:
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})
print(frame)
frame.rank(axis=1)

   a    b    c
0  0  4.3 -2.0
1  1  7.0  5.0
2  0 -3.0  8.0
3  1  2.0 -2.5


Unnamed: 0,a,b,c
0,2,3,1
1,1,3,2
2,2,1,3
3,2,3,1


## Axis indexes with duplicate values

In [53]:
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj['a']

a    0
a    1
dtype: int64

In [54]:
df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df.ix['b']

Unnamed: 0,0,1,2
b,0.545981,1.000202,0.709875
b,-2.77793,-0.895496,0.2785


## Summarizing and Computing Descriptive Statistics

In [55]:
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
            index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
print(df)
print(stg)
print(df.sum())
df.sum(axis=1)                          # df.mean(axis=1, skipna=False) if you dont want to skip NaN

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
------------------------------
one    9.25
two   -5.80
dtype: float64


a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

In [None]:
# Corelation and Covariance

## Unique Values, Value Counts, and Membership

In [56]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()                         # array([c, a, d, b], dtype=object)
print(obj.value_counts())
print(stg)
mask = obj.isin(['b', 'c'])
print(mask)
print(stg)
obj[mask]

a    3
c    3
b    2
d    1
dtype: int64
------------------------------
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool
------------------------------


0    c
5    b
6    b
7    c
8    c
dtype: object

## Handling Missing Data

In [57]:
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()                           # is same as data[data.notnull()]
data = DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
print(cleaned)
data

   0    1  2
0  1  6.5  3


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [None]:
data.dropna(how='all')                        # drop only rows

In [58]:
df = DataFrame(np.random.randn(7, 3))
df.ix[:4, 1] = NA; df.ix[:2, 2] = NA
df.fillna(0)                           # fills all NA as 0
df.fillna({1: 0.5, 3: -1})             # also with Series you can use data.fillna(data.mean())

Unnamed: 0,0,1,2
0,0.612555,0.5,
1,1.090201,0.5,
2,-0.627395,0.5,
3,-0.261491,0.5,0.110857
4,-1.554112,0.5,-1.478136
5,-0.230231,0.995032,-0.239717
6,0.114588,-1.465437,-0.57285


## Hierarchical Indexing

In [59]:
data = Series(np.random.randn(10), index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'], 
                                          [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data

a  1    0.368885
   2    1.870600
   3   -0.071919
b  1    0.849405
   2    0.351460
   3    0.798820
c  1    1.274369
   2   -1.565973
d  2   -0.312078
   3   -0.485897
dtype: float64

In [60]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [62]:
print(data['b'])
print(stg)
print(data['b':'c'])
print(stg)
data[:, 2]

1    0.849405
2    0.351460
3    0.798820
dtype: float64
------------------------------
b  1    0.849405
   2    0.351460
   3    0.798820
c  1    1.274369
   2   -1.565973
dtype: float64
------------------------------


a    1.870600
b    0.351460
c   -1.565973
d   -0.312078
dtype: float64

In [63]:
data.unstack()

Unnamed: 0,1,2,3
a,0.368885,1.8706,-0.071919
b,0.849405,0.35146,0.79882
c,1.274369,-1.565973,
d,,-0.312078,-0.485897


In [66]:
frame = DataFrame(np.arange(12).reshape((4, 3)), index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [67]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


## Reordering and Sorting Levels

In [68]:
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [69]:
frame.sortlevel(1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [70]:
frame.swaplevel(0,1).sortlevel(0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [71]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [72]:
frame.sum(level='color', axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


## Using a DataFrame’s Columns

In [74]:
frame = DataFrame({'a': range(7), 'b': range(7, 0, -1), 'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]})
print(frame)
frame2 = frame.set_index(['c', 'd'])             # if you want c and d still in data values.
frame2

   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1
