In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

## Introduction to Pandas Data Structures

### Series

In [2]:
obj = Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
obj2 = Series([6, 7, -5, 3], index = ['d','b','a','c'])

In [6]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [7]:
obj2['c']

3

In [8]:
obj2[['d','b','a']] #use double [] to indicate extracting values based on index

d    6
b    7
a   -5
dtype: int64

In [9]:
obj2[obj2>3] #Use on stock prices (or any value from an asset) compared to a specific value


d    6
b    7
dtype: int64

In [10]:
obj2**2 #Simultaneously calculate all values in the series

d    36
b    49
a    25
c     9
dtype: int64

In [11]:
#play with it
Larger_than_three = obj2[obj2>3]
ltt = Larger_than_three
lttnew = ltt**2
print (lttnew)

d    36
b    49
dtype: int64


In [12]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [13]:
'b' in obj2 #check whether some day's stock price is xxx

True

In [14]:
#Create a dictionary
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3


Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [15]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [16]:
#customize the index incorporated with an exsited dictionary
obj4 = Series(sdata, index = states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [17]:
#to see if our indexes have their exisiting value
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [18]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [19]:
#Combine different series
objall = obj3 + obj4
objall

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [20]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [21]:
#the index of the series can also be altered
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

In [22]:
#Construct a dataframe
data = {'state': ['Ohio','Ohio','Ohio','Nevada','Nevada'],
       'year': [2000,2001,2002,2001,2002],
       'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [23]:
#If we specify a sequence of columns, the DataFrame's columns will be exactly what we pass:
DataFrame(data, columns = ['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [24]:
#when passing a column that isn't contained in data, it will appear with NA values in the result
frame2 = DataFrame(data, columns = ['year', 'state', 'pop', 'debt'],
                  index = ['one','two','three','four','five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [25]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [26]:
#A column in a DataFrame can be retrieved as a Series either by a dict-like notation or by attribute
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [27]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [28]:
frame2['year']#no difference from frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [29]:
#Rows can also be retrieved by position or name by a couple of methods, such as the ix indexing field
frame2.ix['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [30]:
# Fill out the empty columns
frame2 ['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [31]:
frame2['debt'] = np.arange(5)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


In [32]:
# Practice: use the np.arange function to try to assign index to numbers of from 0 to 4
frame2.index = np.arange(5)
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0
1,2001,Ohio,1.7,1
2,2002,Ohio,3.6,2
3,2001,Nevada,2.4,3
4,2002,Nevada,2.9,4


In [33]:
frame2.index = ['one', 'two', 'three', 'four', 'five'] # set the index back to one, two...
# Assign three rows with three rows of numbers on the debt column
val = Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [34]:
# Assign rows to be a column name and then categorize rows into boolean values 
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [35]:
#Use the function of del
del frame2['eastern']

In [36]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [37]:
frame2.index

Index(['one', 'two', 'three', 'four', 'five'], dtype='object')

In [38]:
# Another common form of data is a nested dict format, here we don't need '' in the nested{}:
pop = {'Nevada':{2001: 2.4, 2002: 2.9},
      'Ohio':{2000: 1.5, 2001:1.7, 2002:3.6}}

#### But it doesn't work as it does in the textbook! P118

In [39]:
#Let's try transpose
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [40]:
frame2.T

Unnamed: 0,one,two,three,four,five
year,2000,2001,2002,2001,2002
state,Ohio,Ohio,Ohio,Nevada,Nevada
pop,1.5,1.7,3.6,2.4,2.9
debt,,-1.2,,-1.5,-1.7


In [41]:
pop = {'Nevada':{2001:2.4, 2002:2.9},
      'Ohio':{2000: 1.5, 2001: 1.7, 2002:3.6}}

In [42]:
# Nested Dictionary 
pop = {'Nevada':{2001:2.4, 2002:2.9},
      'Ohio':{2000: 1.5, 2001: 1.7, 2002:3.6}}
p = DataFrame(pop)
p

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [43]:
frame3 = DataFrame(pop)

In [44]:
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


#### Practice Time: 1. Conventional Dict and Nested Dict; 2. DataFrame and Dict



In [45]:
# Practice time: Make a contrast: Conventional Dictionary 
#Construct a dataframe
data = {'state': ['Ohio','Ohio','Ohio','Nevada','Nevada'],
       'year': [2000,2001,2002,2001,2002],
       'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [46]:
# Pratice time: Construct a dataframe, compared with 
# Construct Index Method 1
data = {
        'Year': ['2000', '2001', '2002'],
        'Nevada': ['1.7','2.4','2.9'],
        'Ohio': ['1.5', '1.7', '3.6']
       }
dt = DataFrame(data, index = [1,2,3])
# there is no difference if we add '' to the index values here, i.e., it can also be written as 
# dt = DataFrame(data, index = ['1','2','3'])

In [47]:
dt

Unnamed: 0,Nevada,Ohio,Year
1,1.7,1.5,2000
2,2.4,1.7,2001
3,2.9,3.6,2002


In [48]:
# Construct Index Method 2

In [49]:
# Only dataframe can be assigned with a specific index, a dict can't!
data.index = ['a', 'b', 'c']

AttributeError: 'dict' object has no attribute 'index'

In [50]:
# Construct Index Method 2
dt.index = ['a', 'b', 'c']

In [51]:
dt

Unnamed: 0,Nevada,Ohio,Year
a,1.7,1.5,2000
b,2.4,1.7,2001
c,2.9,3.6,2002


In [52]:
# Dicts of series are treated much in the same way:
# Pay Attention to the value existed upon changing the numbers in frame3['Ohio/Nevada'[] 
pdata = {'Ohio': frame3['Ohio'][:-1],
        'Nevada': frame3['Nevada'][:2]}
pdt = DataFrame(pdata)
pdt

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


In [53]:
frame3.index.name = 'year';
frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [54]:
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [55]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

Table 5-1. Possible data inputs to DataFrame constructor

Type Notes

1. 2D ndarray: A matrix of data, passing optional row and column labels

2. dict of arrays, lists, or tuples: Each sequence becomes a column in the DataFrame. All sequences must be the same length.

3. NumPy structured/record array: Treated as the “dict of arrays” case

4. dict of Series: Each value becomes a column. Indexes from each Series are unioned together to form theresult’s row index if no explicit index is passed.

5. dict of dicts: Each inner dict becomes a column. Keys are unioned to form the row index as in the “dict of Series” case.

6. list of dicts or Series: Each item becomes a row in the DataFrame. Union of dict keys or Series indexes become the DataFrame’s column labels

7. List of lists or tuples: Treated as the “2D ndarray” case

8. Another DataFrame: The DataFrame’s indexes are used unless different ones are passed

9. NumPy MaskedArray: Like the “2D ndarray” case except masked values become NA/missing in the DataFrame result

### Index Objects

In [56]:
obj = Series(range(3), index = ['a,','b','c'])
index = obj.index
index

Index(['a,', 'b', 'c'], dtype='object')

In [57]:
index[1:]

Index(['b', 'c'], dtype='object')

In [58]:
#Index objects are immutable and thus can’t be modified by the user:
index[1] = 'd'
index

TypeError: Index does not support mutable operations

In [59]:
#Immutability is important so that Index objects can be safely shared among data structures:
index = pd.Index(np.arange(3))
index

Int64Index([0, 1, 2], dtype='int64')

In [60]:
obj2 = Series([1.5,-2.5,0], index = index)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [61]:
obj2.index is index

True

In [62]:
#In addition to being array-like, an Index also functions as a fixed-size set:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [63]:
'Ohio' in frame3.columns

True

In [64]:
frame3.index 

Int64Index([2000, 2001, 2002], dtype='int64', name='year')

In [65]:
'2002' in frame3.index

False

Table 5-3. Index methods and properties
Method Description

1. append: Concatenate with additional Index objects, producing a new Index

2. diff: Compute set difference as an Index

3. intersection: Compute set intersection

4. union: Compute set union

5. isin: Compute boolean array indicating whether each value is contained in the passed collection

6. delete: Compute new Index with element at index i deleted

7. drop: Compute new index by deleting passed values

8. insert: Compute new Index by inserting element at index i

9. is_monotonic: Returns True if each element is greater than or equal to the previous element

10. is_unique: Returns True if the Index has no duplicate values

11. unique: Compute the array of unique values in the Index

## Essential Functionality

### Reindexing

In [66]:
obj = Series([4.5, 7.2, -5.3, 3.6], index = ['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [67]:
#Calling reindex on this Series rearranges the data according to the new index, introducing
#missing values if any index values were not already present:

obj2 = obj.reindex(['a','b','c','d','e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [68]:
obj.reindex(['a','b','c','d','e'], fill_value = 0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

For ordered data like time series, it may be desirable to do some interpolation or filling of values when reindexing. The method option allows us to do this, using a method such as ffill which forward fills the values:

In [69]:
obj3 = Series(['blue','purple','yellow'], index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [70]:
obj3.reindex(range(7), method = 'ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
6    yellow
dtype: object

Up to P123

In [None]:
# 12-18-2016 from page 123

In [73]:
frame = DataFrame(np.arange(9).reshape((3,3)), index = ['a','c','d'],
                 columns = ['Ohio','Texas','California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [74]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [76]:
#The columns can be reindexed using the columns keyword:
states =  ['Texas','Utah','California']
frame.reindex(columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [79]:
#Both can be reindexed in one shot, though interpolation will only apply row-wise (axis 0):
frame.reindex(index=['a','b','c','d'], method = 'ffill',
                columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
b,1,,2
c,4,,5
d,7,,8


In [81]:
#As you’ll see soon, reindexing can be done more succinctly by label-indexing with ix:
frame.ix[['a','b','c','d'],states]

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


Argument Description

index         New sequence to use as index. Can be Index instance or any other sequence-like Python data structure. An
Index will be used exactly as is without any copying

method        Interpolation (fill) method, see Table 5-4 for options.
fill_value    Substitute value to use when introducing missing data by reindexing
limit         When forward- or backfilling, maximum size gap to fill
level         Match simple Index on level of MultiIndex, otherwise select subset of
copy          Do not copy underlying data if new index is equivalent to old index. True by default (i.e. always copy data).

### Dropping entries from axis

Dropping one or more entries from an axis is easy if you have an index array or list
without those entries. As that can require a bit of munging and set logic, the drop
method will return a new object with the indicated value or values deleted from an axis:

In [82]:
obj = Series(np.arange(5.), index = ['a','b','c','d','e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [83]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [84]:
obj.drop(['d','c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [85]:
data

{'Nevada': ['1.7', '2.4', '2.9'],
 'Ohio': ['1.5', '1.7', '3.6'],
 'Year': ['2000', '2001', '2002']}

In [86]:
#With DataFrame, index values can be deleted from either axis:
data = DataFrame(np.arange(16).reshape((4,4)),
                 index = ['Ohio','Colorado','Utah','New York'],
                 columns = ['one','two','three','four'])


In [87]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [88]:
data.drop(['Colorado','Ohio'])
#We can only drop rows using [] but not columns in this case.

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [93]:
# If we want to drop columns we need to use () in this case
data.drop('two', axis = 1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


### Indexing, selection, and filtering

Series indexing (obj[...]) works analogously to NumPy array indexing, except you can
use the Series’s index values instead of only integers. Here are some examples this:

In [97]:
obj = Series(np.arange(4.), index = ['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [96]:
obj['b']

1.0

In [98]:
obj[1]

1.0

In [99]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [100]:
obj[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [101]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [102]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [104]:
#Slicing with labels behaves differently than normal Python slicing in that the endpoint is INCLUSIVE:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [105]:
#Setting using these methods works just as you would expect:
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [106]:
# As you’ve seen above, indexing into a DataFrame is for retrieving one or more columns 
# either with a single value or sequence:
data = DataFrame(np.arange(16).reshape((4,4)),
                index = ['Ohio','Colorado','Utah','New York'],
                columns = ['one','two','three','four'])

In [107]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [110]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [111]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [113]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [115]:
#This might seem inconsistent to some readers, but this syntax arose out of practicality
#and nothing more. Another use case is in indexing with a boolean DataFrame, such as
#one produced by a scalar comparison:
data <5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [118]:
data[data<5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


This is intended to make DataFrame syntactically more like an ndarray in this case.
For DataFrame label-indexing on the rows, I introduce the special indexing field ix. It
enables you to select a subset of the rows and columns from a DataFrame with NumPylike
notation plus axis labels. As I mentioned earlier, this is also a less verbose way to
do reindexing:

In [119]:
data.ix['Colorado',['two','three']]

two      5
three    6
Name: Colorado, dtype: int32

In [121]:
# adjust the sequence of the columns' displaying
data.ix[['Colorado', 'Utah'],[3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [122]:
# inside[] without '' means we want the data from the second row
data.ix[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [123]:
data.ix[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [149]:
# QUESTION: Why is there a 0 even if this is choosing the data that are larger than 5???????
data.ix[data.three>5,:3]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [148]:
#Type                           Notes

#obj[val]                       Select single column or sequence of columns from the DataFrame. Special case conveniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame (set values based on some criterion).
#obj.ix[val]                    Selects single row of subset of rows from the DataFrame.
#obj.ix[:, val]                 Selects single column of subset of columns.
#obj.ix[val1, val2]             Select both rows and columns.
#reindex method                 Conform one or more axes to new indexes.
#xs method                      Select single row or column as a Series by label.
#icol, irow methods             Select single column or row, respectively, as a Series by integer location.
#get_value, set_value methods   Select single value by row and column label.

### Arithmatic and Data Alignment

One of the most important pandas features is the behavior of arithmetic between objects
with different indexes. When adding together objects, if any index pairs are not
the same, the respective index in the result will be the union of the index pairs. Let’s
look at a simple example:

In [151]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index = ['a','c','d','e'])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [153]:
s2 = Series([-2.1,3.6,-1.5,4,3.1], index = ['a','c','e','f','g'])
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [154]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [164]:
# Pay attention to how  differently the columns are assigned here, 
df1 = DataFrame(np.arange(9).reshape((3,3)), columns = list('bcd'),
               index = ['Ohio','Texas','Colorado'])
df1

Unnamed: 0,b,c,d
Ohio,0,1,2
Texas,3,4,5
Colorado,6,7,8


In [163]:
df2 = DataFrame(np.arange(12).reshape((4,3)), columns = list('bde'),
               index = ['Utah','Ohio','Texas','Oregan'])
df2

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregan,9,10,11


In [166]:
#Adding these together returns a DataFrame whose index and columns are the unions of the ones in each DataFrame:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregan,,,,
Texas,9.0,,12.0,
Utah,,,,


#### Arithmatic methods with fill values

In arithmetic operations between differently-indexed objects, you might want to fill
with a special value, like 0, when an axis label is found in one object but not the other:

In [168]:
df1 = DataFrame(np.arange(12).reshape((3,4)), columns = list('abcd'))
df2 = DataFrame(np.arange(20).reshape((4,5)), columns = list('abcde'))

In [173]:
df1

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [172]:
df2

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [175]:
#Adding these together results in NA values in the locations that don’t overlap:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [179]:
#Using the add method on df1, I pass df2 and an argument to fill_value:
df1.add(df2, fill_value=0)
# both df1 and df2 remain unchanged in display after this line of code

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [177]:
# Relatedly, when reindexing a Series or DataFrame, you can also specify a different fill value:
df1.reindex(columns = df2.columns, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,0
1,4,5,6,7,0
2,8,9,10,11,0


In [180]:
#Method    Description

#add       Method for addition (+)
#sub       Method for subtraction (-)
#div       Method for division (/)
#mul       Method for multiplication (*)

#### Operations between DataFrame and Series

In [181]:
# Actually this is already a matrix
arr = np.arange(12.).reshape((3,4))

arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [185]:
arr[0]

array([ 0.,  1.,  2.,  3.])

In [187]:
# matrix minus a matrix
arr - arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [191]:
#This is referred to as broadcasting and is explained in more detail in Chapter 12. Operations
#between a DataFrame and a Series are similar:

frame = DataFrame(np.arange(12).reshape((4,3)), columns = list('bde'),
                 index = ['Utah','Ohio','Texas','Oregan'])
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregan,9,10,11


In [199]:
# Remember frame.ix[0] is without any '' in the [], meaning that this 0 means data rows.
series = frame.ix[0]
series

b    0
d    1
e    2
Name: Utah, dtype: int32

In [195]:
#By default, arithmetic between DataFrame and Series matches the index of the Series
#on the DataFrame's columns, broadcasting down the rows:
frame - series

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregan,9,9,9


In [200]:
# If an index value is not found in either the DataFrame’s columns or the Series’s index,
# the objects will be reindexed to form the union:
series2 = Series(range(3), index = ['b','e','f'])
series2

b    0
e    1
f    2
dtype: int32

In [202]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregan,9,10,11


In [201]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregan,9.0,,12.0,


In [203]:
# If you want to instead broadcast over the columns, matching on the rows, you have to
# use one of the arithmetic methods. For example:
series3 = frame['d']

frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregan,9,10,11


In [204]:
series3

Utah       1
Ohio       4
Texas      7
Oregan    10
Name: d, dtype: int32

In [209]:
# Pay attention here, when we use the add/sub/div/mul between two matrices, 
# we are actually applying matrix's arithmetics, instead of incorporating just two sets of certain rows or columns  
frame.sub(series3, axis = 0)

Unnamed: 0,b,d,e
Utah,-1,0,1
Ohio,-1,0,1
Texas,-1,0,1
Oregan,-1,0,1


The axis number that you pass is the axis to match on. In this case we mean to match
on the DataFrame’s row index and broadcast across.

### Function application and mapping

NumPy ufuncs (element-wise array methods) work fine with pandas objects:

In [210]:
frame = DataFrame(np.random.randn(4,3), columns = list('bde'),
                 index = ['Utah','Ohio','Texas','Oregan'])
frame

Unnamed: 0,b,d,e
Utah,-0.713769,0.926654,-1.374987
Ohio,1.055022,-1.711584,0.694311
Texas,-0.807054,1.649815,-0.465422
Oregan,-2.398832,0.170156,-1.277636


In [211]:
# Take the average values of the above frame
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.713769,0.926654,1.374987
Ohio,1.055022,1.711584,0.694311
Texas,0.807054,1.649815,0.465422
Oregan,2.398832,0.170156,1.277636


Another frequent operation is applying a function on 1D arrays to each column or row.
DataFrame’s apply method does exactly this:

In [213]:
f = lambda x: x.max() - x.min()
frame.apply(f)

b    3.453854
d    3.361399
e    2.069298
dtype: float64

In [220]:
# Here I finally figured out that axis = 0 means calculate along the columns for one row, while 
# axis = 1 means calculating along the rows for one column.
frame.apply(f, axis = 0)

b    3.453854
d    3.361399
e    2.069298
dtype: float64

Many of the most common array statistics (like sum and mean) are DataFrame methods,
so using apply is not necessary.
The function passed to apply need not return a scalar value, it can also return a Series
with multiple values:

In [222]:
def f(x):
    return Series([x.min(), x.max()], index = ['min', 'max'])

frame.apply(f)

Unnamed: 0,b,d,e
min,-2.398832,-1.711584,-1.374987
max,1.055022,1.649815,0.694311


Element-wise Python functions can be used, too. Suppose you wanted to compute a
formatted string from each floating point value in frame. You can do this with applymap:

In [225]:
# Here we use lambda x: '%.nf' % x to determine decimal points number 
format = lambda x: '%.3f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.714,0.927,-1.375
Ohio,1.055,-1.712,0.694
Texas,-0.807,1.65,-0.465
Oregan,-2.399,0.17,-1.278


The reason for the name applymap is that Series has a map method for applying an element-
wise function:

In [227]:
frame['e'].map(format)

Utah      -1.375
Ohio       0.694
Texas     -0.465
Oregan    -1.278
Name: e, dtype: object

### Sorting and Ranking

Sorting a data set by some criterion is another important built-in operation. To sort
lexicographically by row or column index, use the sort_index method, which returns
a new, sorted object:

In [232]:
# Series only has one dimension so we cannot have both versions of results for axis = 0 and axis = 1 
obj = Series(range(4), index = ['d','a','b','c'])
obj.sort_index(axis = 0)

a    1
b    2
c    3
d    0
dtype: int32

In [235]:
# But with a DataFrame, we can now sort by index on either axis:
frame = DataFrame(np.arange(8).reshape((2,4)), index = ['three', 'one'],
                 columns = ['d','a','b','c'])
frame.sort_index(axis=0)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [236]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [238]:
# The data is sorted in ascending order by default, but can be sorted in descending order, too:
frame.sort_index(axis =1, ascending = False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [242]:
#To sort a Series by its values, use its order method:
obj = Series([4,7,-3,2])
obj.order()

  app.launch_new_instance()


2   -3
3    2
0    4
1    7
dtype: int64

In [244]:
# so we use sort_values
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [247]:
#Any missing values are sorted to the end of the Series by default:
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.order()

  app.launch_new_instance()


4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [250]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

On DataFrame, you may want to sort by the values in one or more columns. To do so,
pass one or more column names to the by option:

In [251]:
frame = DataFrame({'b': [4,7,-3,2], 'a':[0,1,0,1]})
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [252]:
frame.sort_index(by = ['a','b'])

  if __name__ == '__main__':


Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [253]:
# Use sort_values() as the new version of code
frame.sort_values(by = ['a','b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


Ranking is closely related to sorting, assigning ranks from one through the number of
valid data points in an array. It is similar to the indirect sort indices produced by
numpy.argsort, except that ties are broken according to a rule. The rank methods for
Series and DataFrame are the place to look; by default rank breaks ties by assigning
each group the mean rank:

In [259]:
# What does this mean?
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [258]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [261]:
#Ranks can also be assigned according to the order they’re observed in the data:
obj.rank(method = 'first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [262]:
# Naturally, you can rank in descending order, too:
obj.rank(ascending = False, method = 'max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [264]:
frame = DataFrame({'b':[4.3, 7, -3, 2], 'a':[0,1,0,1], 
                   'c':[-2, 5, 8, -2.5]})
frame.rank(axis = 1)

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


In [266]:
#Table 5-8. Tie-breaking methods with rank
#Method Description
#'average' Default: assign the average rank to each entry in the equal group.
#'min' Use the minimum rank for the whole group.
#'max' Use the maximum rank for the whole group.
#'first' Assign ranks in the order the values appear in the data.

### Axis indexes with duplicate values

Up until now all of the examples I’ve showed you have had unique axis labels (index
values). While many pandas functions (like reindex) require that the labels be unique,
it’s not mandatory. Let’s consider a small Series with duplicate indices:

In [268]:
obj = Series(range(5), index = ['a','a','b','b','c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [269]:
# The index’s is_unique property can tell you whether its values are unique or not:
obj.index.is_unique

False

Data selection is one of the main things that behaves differently with duplicates. Indexing a value with multiple entries returns a Series while single entries return a scalar value:


In [270]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [271]:
obj['a']

a    0
a    1
dtype: int32

In [272]:
obj['c']

4

In [274]:
# The same logic extends to indexing rows in a DataFrame:
df = DataFrame(np.random.randn(4,3), index = ['a','a','b','b'])
df

Unnamed: 0,0,1,2
a,0.770722,1.017407,0.418794
a,0.307911,-0.179527,-0.960354
b,0.689622,1.251309,0.098223
b,0.483784,-0.559309,0.207554


In [275]:
# ix addresses DataFrame
df.ix['b']

Unnamed: 0,0,1,2
b,0.689622,1.251309,0.098223
b,0.483784,-0.559309,0.207554


## Summarizing and Computing Descriptive Statstics

pandas objects are equipped with a set of common mathematical and statistical methods.
Most of these fall into the category of reductions or summary statistics, methods
that extract a single value (like the sum or mean) from a Series or a Series of values from
the rows or columns of a DataFrame. Compared with the equivalent methods of vanilla
NumPy arrays, they are all built from the ground up to exclude missing data. Consider
a small DataFrame:

In [277]:
df = DataFrame([[1.4, np.nan],[7.1, -4.5],
              [np.nan, np.nan], [0.75, -1.3]],
              index = ['a','b','c','d'],
              columns = ['one','two'])

In [278]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [279]:
# Calling DataFrame’s sum method returns a Series containing column sums:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [280]:
# Passing axis=1 sums over the rows instead:
df.sum(axis = 1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [281]:
# NA values are excluded unless the entire slice (row or column in this case) is NA. This can be disabled using the skipna option:
df.mean(axis = 1, skipna = False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [283]:
#Table 5-9. Options for reduction methods

#Method         Description
#axis           Axis to reduce over. 0 for DataFrame’s rows and 1 for columns.
#skipna         Exclude missing values, True by default.
#level          Reduce grouped by level if the axis is hierarchically-indexed (MultiIndex).

In [287]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [286]:
#Some methods, like idxmin and idxmax, return indirect statistics like the index value
#where the minimum or maximum values are attained:
df.idxmax()

one    b
two    d
dtype: object

In [288]:
# Other methods are accumulations:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [290]:
# Another type of method is neither a reduction nor an accumulation. describe is one
# such example, producing multiple summary statistics in one shot:

df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [292]:
# On non-numeric data, describe produces alternate summary statistics:
obj = Series(['a','a','b','c']*4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [293]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [294]:
# Table 5-10. Descriptive and summary statistics

#Method         Description

#count          Number of non-NA values
#describe       Compute set of summary statistics for Series or each DataFrame column
#min, max       Compute minimum and maximum values
#argmin, argmax Compute index locations (integers) at which minimum or maximum value obtained, respectively
#idxmin, idxmax Compute index values at which minimum or maximum value obtained, respectively
#quantile       Compute sample quantile ranging from 0 to 1
#sum            Sum of values
#mean           Mean of values
#median         Arithmetic median (50% quantile) of values
#mad            Mean absolute deviation from mean value
#var            Sample variance of values
#std            Sample standard deviation of values
#skew           Sample skewness (3rd moment) of values
#kurt           Sample kurtosis (4th moment) of values
#cumsum         Cumulative sum of values
#cummin, cummax Cumulative minimum or maximum of values, respectively
#cumprod        Cumulative product of values
#diff           Compute 1st arithmetic difference (useful for time series)
#pct_change     Compute percent changes

### Correlation and Covariance (Project 1 - Correlation Matrix)

Some summary statistics, like correlation and covariance, are computed from pairs of
arguments. Let’s consider some DataFrames of stock prices and volumes obtained from
Yahoo! Finance:

In [309]:
# Note that this is different from the texbook because of the updated issue.
import pandas_datareader.data as wb
from datetime import datetime

In [316]:
all_data = {}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
    all_data[ticker] = wb.DataReader(ticker, data_source = 'yahoo', start = datetime(2000,1,1), end = datetime(2010,1,1))
    
price = DataFrame({tic: data['Adj Close']
                  for tic, data in all_data.items()})
volume = DataFrame({tic: data['Volume']
                   for tic, data in all_data.items()})

In [317]:
returns = price.pct_change()

In [320]:
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-12-24,0.034339,0.011117,0.004385,0.002587
2009-12-28,0.012294,0.007098,0.013326,0.005484
2009-12-29,-0.011861,-0.005571,-0.003477,0.007058
2009-12-30,0.012147,0.005376,0.005461,-0.013699
2009-12-31,-0.0043,-0.004416,-0.012597,-0.015504


In [322]:
# The corr method of Series computes the correlation of the overlapping, non-NA,
# aligned-by-index values in two Series. Relatedly, cov computes the covariance:
returns.MSFT.corr(returns.IBM)

0.4959796261031541

In [324]:
returns.MSFT.cov(returns.IBM)

0.00021595760076743126

In [325]:
# DataFrame’s corr and cov methods, on the other hand, return a full correlation or
# covariance matrix as a DataFrame, respectively:

In [326]:
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.470676,0.410011,0.424305
GOOG,0.470676,1.0,0.390689,0.443587
IBM,0.410011,0.390689,1.0,0.49598
MSFT,0.424305,0.443587,0.49598,1.0


In [328]:
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.001027,0.000303,0.000252,0.000309
GOOG,0.000303,0.00058,0.000142,0.000205
IBM,0.000252,0.000142,0.000367,0.000216
MSFT,0.000309,0.000205,0.000216,0.000516


In [329]:
# Using DataFrame’s corrwith method, you can compute pairwise correlations between 
# a DataFrame’s columns or rows with another Series or DataFrame. Passing a Series
# returns a Series with the correlation value computed for each column:

In [330]:
returns.corrwith(returns.IBM)

AAPL    0.410011
GOOG    0.390689
IBM     1.000000
MSFT    0.495980
dtype: float64

In [333]:
# Passing a DataFrame computes the correlations of matching column names. Here I 
# compute correlations of percent changes with volume:
returns.corrwith(volume, axis = 0)

AAPL   -0.057549
GOOG    0.062647
IBM    -0.007892
MSFT   -0.014245
dtype: float64

In [336]:
# Passing axis=1 does things row-wise instead. In all cases, the data points are aligned by
# label before computing the correlation.

In [332]:
# QUESTION: What does this mean?
returns.corrwith(volume, axis = 1)

Date
2000-01-03         NaN
2000-01-04   -0.924163
2000-01-05   -0.601233
2000-01-06   -0.999755
2000-01-07    0.985766
2000-01-10   -0.957692
2000-01-11   -0.975631
2000-01-12   -0.934752
2000-01-13    0.998264
2000-01-14    0.933154
2000-01-18    0.978913
2000-01-19   -0.210586
2000-01-20    0.990028
2000-01-21   -0.857808
2000-01-24   -0.999716
2000-01-25    0.999940
2000-01-26    0.080826
2000-01-27    0.994140
2000-01-28   -0.761596
2000-01-31    0.684650
2000-02-01    0.252326
2000-02-02   -0.724360
2000-02-03    0.820709
2000-02-04    0.971153
2000-02-07    0.990099
2000-02-08   -0.971971
2000-02-09   -0.400781
2000-02-10   -0.421461
2000-02-11   -0.999503
2000-02-14    0.492223
                ...   
2009-11-18   -0.018993
2009-11-19   -0.991618
2009-11-20    0.798004
2009-11-23    0.695534
2009-11-24   -0.864314
2009-11-25   -0.152006
2009-11-27   -0.733395
2009-11-30   -0.871465
2009-12-01   -0.785859
2009-12-02    0.416278
2009-12-03    0.299599
2009-12-04   -0.803847
2009-1

### Unique Values, Value Counts, and Membership

In [None]:
# Another class of related methods extracts information about the values contained in a
# one-dimensional Series. To illustrate these, consider this example:

In [341]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [342]:
# The first function is unique, which gives you an array of the unique values in a Series:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [343]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [344]:
# The Series is sorted by value in descending order as a convenience. value_counts is also
# available as a top-level pandas method that can be used with any array or sequence:
pd.value_counts(obj.values, sort = False)

c    3
d    1
a    3
b    2
dtype: int64

In [346]:
# As a contrast with this sort=True
pd.value_counts(obj.values, sort = True)

a    3
c    3
b    2
d    1
dtype: int64

In [347]:
# Lastly, isin is responsible for vectorized set membership and can be very useful in
# filtering a data set down to a subset of values in a Series or column in a DataFrame:

In [349]:
mask = obj.isin(['b','c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [350]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [351]:
#Table 5-11. Unique, value counts, and binning methods

#Method       Description

#isin         Compute boolean array indicating whether each Series value is contained in the passed sequence of values.
#unique       Compute array of unique values in a Series, returned in the order observed.
#value_counts Return a Series containing unique values as its index and frequencies as its values, ordered count in
#descending order.

In [4]:
# In some cases, you may want to compute a histogram on multiple related columns in a DataFrame. Here’s an example:
data = DataFrame({'Qu1':[1,3,4,3,4],
                  'Qu2':[2,3,1,2,3],
                  'Qu3':[1,5,2,4,4]})

In [5]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [7]:
#Passing pandas.value_counts to this DataFrame's apply function gives:
# This set of code means for values of 1,2,3,4,5 above in all columns of Qu1, 2, 3, how many counts are there for 
# each category of value.
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


## Handling Missing Data

Missing data is common in most data analysis applications. One of the goals in designing
pandas was to make working with missing data as painless as possible. For
example, all of the descriptive statistics on pandas objects exclude missing data as
you’ve seen earlier in the chapter.

pandas uses the floating point value NaN (Not a Number) to represent missing data in
both floating as well as in non-floating point arrays. It is just used as a sentinel that can
be easily detected:

In [8]:
string_data = Series(['aardvark','artichoke',np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [9]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [11]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [13]:
#The built-in Python None value is also treated as NA in object arrays:
string_data[0] = None
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [14]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

I do not claim that pandas's NA representation is optimal, but it is simple and reasonably
consistent. It's the best solution, with good all-around performance characteristics
and a simple API, that I could concoct in the absence of a true NA data type or bit
pattern in NumPy's data types. Ongoing development work in NumPy may change this
in the future.

In [17]:
#Table 5-12. NA handling methods

#Argument      Description
#dropna        Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much
#              missing data to tolerate.
#fillna        Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'.
#isnull        Return like-type object containing boolean values indicating which values are missing / NA.
#notnull       Negation of isnull.

### Filtering Out Missing Data

In [18]:
#You have a number of options for filtering out missing data. While doing it by hand is
#always an option, dropna can be very helpful. On a Series, it returns the Series with only
#the non-null data and index values:

In [19]:
from numpy import nan as NA

In [21]:
data = Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [22]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [23]:
#Naturally, you could have computed this yourself by boolean indexing:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [25]:
#With DataFrame objects, these are a bit more complex. You may want to drop rows
#or columns which are all NA or just those containing any NAs. dropna by default drops
#any row containing a missing value:
data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
                [NA, NA, NA], [NA, 6.5, 3.]])

In [28]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [27]:
# Drop all rows as long as they have any NAN
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [29]:
# Passing how='all' will only drop rows that are all NA:
data.dropna(how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [31]:
# Dropping columns in the same way is only a matter of passing axis=1:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [33]:
data.dropna(how = 'all', axis = 1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [None]:
# A related way to filter out DataFrame rows tends to concern time series data. Suppose
# you want to keep only rows containing a certain number of observations. You can indicate this with the thresh argument:

In [36]:
df = DataFrame(np.random.randn(7,3))
df

Unnamed: 0,0,1,2
0,2.007711,-0.333501,-1.30543
1,1.624074,0.353801,0.9131
2,-1.4626,1.582806,0.226373
3,1.657469,1.451519,1.044497
4,0.271909,-1.26922,0.887545
5,-0.757788,-0.001633,-0.287228
6,0.732717,-1.100215,0.484933


In [39]:
df.ix[:4, 1] = NA
df.ix[:2, 2] = NA

In [40]:
df

Unnamed: 0,0,1,2
0,2.007711,,
1,1.624074,,
2,-1.4626,,
3,1.657469,,1.044497
4,0.271909,,0.887545
5,-0.757788,-0.001633,-0.287228
6,0.732717,-1.100215,0.484933


In [44]:
# It means at least thresh = n number of values in every row won't be NaN
df.dropna(thresh = 3)

Unnamed: 0,0,1,2
0,2.007711,,
1,1.624074,,
2,-1.4626,,
3,1.657469,,1.044497
4,0.271909,,0.887545
5,-0.757788,-0.001633,-0.287228
6,0.732717,-1.100215,0.484933


### Filling in Missing Data (very useful in dealing with missing financial data)

Rather than filtering out missing data (and potentially discarding other data along with
it), you may want to fill in the “holes” in any number of ways. For most purposes, the
fillna method is the workhorse function to use. Calling fillna with a constant replaces
missing values with that value:

In [46]:
df

Unnamed: 0,0,1,2
0,2.007711,,
1,1.624074,,
2,-1.4626,,
3,1.657469,,1.044497
4,0.271909,,0.887545
5,-0.757788,-0.001633,-0.287228
6,0.732717,-1.100215,0.484933


In [48]:
df.fillna(0)

Unnamed: 0,0,1,2
0,2.007711,0.0,0.0
1,1.624074,0.0,0.0
2,-1.4626,0.0,0.0
3,1.657469,0.0,1.044497
4,0.271909,0.0,0.887545
5,-0.757788,-0.001633,-0.287228
6,0.732717,-1.100215,0.484933


In [51]:
# Calling fillna with a dict you can use a different fill value for each column:
# In this case 1 and 2 represents the columns
df.fillna({1:0.5, 2:-1})

Unnamed: 0,0,1,2
0,2.007711,0.5,-1.0
1,1.624074,0.5,-1.0
2,-1.4626,0.5,-1.0
3,1.657469,0.5,1.044497
4,0.271909,0.5,0.887545
5,-0.757788,-0.001633,-0.287228
6,0.732717,-1.100215,0.484933


In [52]:
# fillna returns a new object, but you can modify the existing object in place:
# always returns a reference to the filled object
_ = df.fillna(0, inplace = True)

In [53]:
df

Unnamed: 0,0,1,2
0,2.007711,0.0,0.0
1,1.624074,0.0,0.0
2,-1.4626,0.0,0.0
3,1.657469,0.0,1.044497
4,0.271909,0.0,0.887545
5,-0.757788,-0.001633,-0.287228
6,0.732717,-1.100215,0.484933


In [55]:
# The same interpolation methods available for reindexing can be used with fillna:
df = DataFrame(np.random.randn(6,3))
df

Unnamed: 0,0,1,2
0,-1.262279,1.17955,0.298709
1,-0.661688,-2.405389,-0.006452
2,0.22383,0.594741,2.238974
3,-0.645561,0.177309,0.11103
4,-0.698968,1.043519,0.362158
5,-0.088991,-1.379364,0.072865


In [57]:
df.ix[2:,1] = NA; df.ix[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-1.262279,1.17955,0.298709
1,-0.661688,-2.405389,-0.006452
2,0.22383,,2.238974
3,-0.645561,,0.11103
4,-0.698968,,
5,-0.088991,,


In [58]:
# Can be useful for filling yesterday's price in the financial data
df.fillna(method = 'ffill')

Unnamed: 0,0,1,2
0,-1.262279,1.17955,0.298709
1,-0.661688,-2.405389,-0.006452
2,0.22383,-2.405389,2.238974
3,-0.645561,-2.405389,0.11103
4,-0.698968,-2.405389,0.11103
5,-0.088991,-2.405389,0.11103


In [59]:
# Determines how many NAN would you wanna fill after the value-existed one?
df.fillna(method = 'ffill', limit = 2)

Unnamed: 0,0,1,2
0,-1.262279,1.17955,0.298709
1,-0.661688,-2.405389,-0.006452
2,0.22383,-2.405389,2.238974
3,-0.645561,-2.405389,0.11103
4,-0.698968,,0.11103
5,-0.088991,,0.11103


In [61]:
# This is even more useful/realistic when filling missing financial data
# With fillna you can do lots of other things with a little creativity. For example, you
# might pass the mean or median value of a Series:
data = Series([1., NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [62]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [63]:
#See Table 5-13 for a reference on fillna.

#Table 5-13. fillna function arguments

#Argument        Description

#value           Scalar value or dict-like object to use to fill missing values
#method          Interpolation, by default 'ffill' if function called with no other arguments
#axis            Axis to fill on, default axis=0
#inplace         Modify the calling object without producing a copy
#limit           For forward and backward filling, maximum number of consecutive periods to fill

## Hierarchical Indexing

Hierarchical indexing is an important feature of pandas enabling you to have multiple
(two or more) index levels on an axis. Somewhat abstractly, it provides a way for you
to work with higher dimensional data in a lower dimensional form. Let’s start with a
simple example; create a Series with a list of lists or arrays as the index:

In [65]:
data = Series(np.random.randn(10),
             index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                    [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data

a  1    0.952910
   2    0.914505
   3   -0.722376
b  1   -0.574567
   2    0.758247
   3   -0.037696
c  1   -2.806209
   2    0.897143
d  2    0.092710
   3   -0.494764
dtype: float64

In [66]:
# What you’re seeing is a prettified view of a Series with a MultiIndex as its index. The
# “gaps” in the index display mean “use the label directly above”:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [68]:
# With a hierarchically-indexed object, so-called partial indexing is possible, enabling
# you to concisely select subsets of the data:
data['b']

1   -0.574567
2    0.758247
3   -0.037696
dtype: float64

In [69]:
data['b':'c']

b  1   -0.574567
   2    0.758247
   3   -0.037696
c  1   -2.806209
   2    0.897143
dtype: float64

In [70]:
data.ix[['b','d']]

b  1   -0.574567
   2    0.758247
   3   -0.037696
d  2    0.092710
   3   -0.494764
dtype: float64

In [72]:
data

a  1    0.952910
   2    0.914505
   3   -0.722376
b  1   -0.574567
   2    0.758247
   3   -0.037696
c  1   -2.806209
   2    0.897143
d  2    0.092710
   3   -0.494764
dtype: float64

In [71]:
#Selection is even possible in some cases from an “inner” level:
data[:, 2]

a    0.914505
b    0.758247
c    0.897143
d    0.092710
dtype: float64

In [73]:
#Hierarchical indexing plays a critical role in reshaping data and group-based operations
#like forming a pivot table. For example, this data could be rearranged into a DataFrame
#using its unstack method:
data.unstack()

Unnamed: 0,1,2,3
a,0.95291,0.914505,-0.722376
b,-0.574567,0.758247,-0.037696
c,-2.806209,0.897143,
d,,0.09271,-0.494764


In [74]:
# The inverse operation of unstack is stack:
data.unstack().stack()

a  1    0.952910
   2    0.914505
   3   -0.722376
b  1   -0.574567
   2    0.758247
   3   -0.037696
c  1   -2.806209
   2    0.897143
d  2    0.092710
   3   -0.494764
dtype: float64

In [3]:
# With a DataFrame, either axis can have a hierarchical index:
frame = DataFrame(np.arange(12).reshape((4,3)),
                 index = [['a','a','b','b'], [1,2,1,2]],
                 columns = [['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [4]:
# The hierarchical levels can have names (as strings or any Python objects). If so, these
# will show up in the console output (don’t confuse the index names with the axis labels!):
frame.index.names = ['key1','key2']

In [5]:
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [6]:
# With partial column indexing you can similarly select groups of columns:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


### Reordering and Sorting Levels

At times you will need to rearrange the order of the levels on an axis or sort the data
by the values in one specific level. The swaplevel takes two level numbers or names and
returns a new object with the levels interchanged (but the data is otherwise unaltered):

In [7]:
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [8]:
# sortlevel, on the other hand, sorts the data (stably) using only the values in a single
# level. When swapping levels, it’s not uncommon to also use sortlevel so that the result
# is lexicographically sorted:
frame.sortlevel(1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [9]:
frame.swaplevel(0,1).sortlevel(0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


### Summary Statistics by Level
Many descriptive and summary statistics on DataFrame and Series have a level option
in which you can specify the level you want to sum by on a particular axis. Consider
the above DataFrame; we can sum by level on either the rows or columns like so:

In [10]:
frame.sum(level = 'key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [11]:
frame.sum(level = 'color', axis = 1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


Under the hood, this utilizes pandas’s groupby machinery which will be discussed in
more detail later in the book.

### Using a DataFrame’s Columns

It’s not unusual to want to use one or more columns from a DataFrame as the row
index; alternatively, you may wish to move the row index into the DataFrame’s columns.
Here’s an example DataFrame:

In [13]:
# To put it in a simpler way to say, we would like to use some of the columns as indexes.
frame = DataFrame({'a':range(7), 'b': range(7, 0, -1), 
                  'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                  'd': [0,1,2,0,1,2,3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [15]:
#DataFrame’s set_index function will create a new DataFrame using one or more of its
#columns as the index:
frame2 = frame.set_index(['c','d'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [16]:
# By default the columns are removed from the DataFrame, though you can leave them in:
frame.set_index(['c','d'], drop = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [17]:
# reset_index, on the other hand, does the opposite of set_index; the hierarchical index
# levels are are moved into the columns:
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


## Other pandas Topics

Here are some additional topics that may be of use to you in your data travels.

### Integer Indexing

In [19]:
# Working with pandas objects indexed by integers is something that often trips up new
# users due to some differences with indexing semantics on built-in Python data structures like lists and tuples. 
# For example, you would not expect the following code to generate an error:
ser = Series(np.arange(3.))
ser[-1]

KeyError: -1

In this case, pandas could “fall back” on integer indexing, but there’s not a safe and
general way (that I know of) to do this without introducing subtle bugs. Here we have
an index containing 0, 1, 2, but inferring what the user wants (label-based indexing or
position-based) is difficult::

In [20]:
ser

0    0
1    1
2    2
dtype: int32

In [21]:
# On the other hand, with a non-integer index, there is no potential for ambiguity:
ser2 = Series(np.arange(3.), index = ['a','b','c'])
ser2[-1]

2.0

In [22]:
# To keep things consistent, if you have an axis index containing indexers, data selection
# with integers will always be label-oriented. This includes slicing with ix, too:
ser.ix[:1]

0    0
1    1
dtype: int32

In [23]:
#In cases where you need reliable position-based indexing regardless of the index type,
#you can use the iget_value method from Series and irow and icol methods from DataFrame:
ser3 = Series(range(3), index = [-5, 1, 3])
ser3

-5    0
 1    1
 3    2
dtype: int32

In [24]:
ser3.iget_value(2)

  if __name__ == '__main__':


2

In [27]:
frame = DataFrame(np.arange(6).reshape((3,2)), index = [2,0,1])
frame

Unnamed: 0,0,1
2,0,1
0,2,3
1,4,5


### Panel Data

While not a major topic of this book, pandas has a Panel data structure, which you can
think of as a three-dimensional analogue of DataFrame. Much of the development focus
of pandas has been in tabular data manipulations as these are easier to reason about,and hierarchical indexing makes using truly N-dimensional arrays unnecessary in a lot
of cases.

For further details, see page 153