In [1]:
import pandas as pd

# Pandas Series Object

In [2]:
# Pandas Series wraps a sequence of values and indices into a 1d array of indexed data
# creating series from a list:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:
# values are a numpy array
data.values

array([ 0.25,  0.5 ,  0.75,  1.  ])

In [4]:
# index is an array-like object of type pd.Index
data.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
print(data[1])
print(data[1:3])

0.5
1    0.50
2    0.75
dtype: float64


In [6]:
# index doesn't need to be int, we can use strings
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [7]:
data['b']

0.5

In [8]:
# using nonsequential indices
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=[1, 3, 5, 7])
data[5]

0.75

In [9]:
# series is like a dictionary, but with typed indices and values for efficiency
population_dict = {'CA': 38332521,
                   'TX': 26448193,
                   'NY': 19651127,
                   'FL': 19552860,
                   'IL': 12882135}
population = pd.Series(population_dict)
population

CA    38332521
FL    19552860
IL    12882135
NY    19651127
TX    26448193
dtype: int64

In [10]:
population['CA']

38332521

In [11]:
# unlike dict, Series supports array-style operations like slicing
population['CA':'IL']

CA    38332521
FL    19552860
IL    12882135
dtype: int64

In [12]:
# constructing Series objects

In [13]:
# from a list or NP array, where it defaults to int
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [14]:
## from a scalar, which repeats to fill index
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [15]:
#from a dict, where index defaults to sorted dict keys
pd.Series({2:'a', 1:'b', 3:'c'})

1    b
2    a
3    c
dtype: object

In [16]:
# index can be explicitly set
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

3    c
2    a
dtype: object

# Pandas DataFrame Object

In [17]:
# DataFrame as generalized NumPy array

# dataframe like a 2-d array with flexible row indices and column names
# df is like a sequence of aligned Series objects in that they share the same index

area_dict = {'CA': 423967,
             'TX': 170312,
             'NY': 149995,
             'FL': 141297,
             'IL': 695662}
area = pd.Series(area_dict)

states = pd.DataFrame({'population': population,
                        'area': area})
states

Unnamed: 0,area,population
CA,423967,38332521
FL,141297,19552860
IL,695662,12882135
NY,149995,19651127
TX,170312,26448193


In [18]:
states.index

Index(['CA', 'FL', 'IL', 'NY', 'TX'], dtype='object')

In [19]:
states.columns

Index(['area', 'population'], dtype='object')

In [20]:
# DataFrame as a specialized dictionary

# dict maps a key to a value, df maps a column name to a Series of column data

states['area']

CA    423967
FL    141297
IL    695662
NY    149995
TX    170312
Name: area, dtype: int64

In [21]:
# note:
# in a NumPy array, data[0] returns the first row
# in a Pandas DF, data['col0'] would return the first column
# so better to think of a DF as a generalized dict than an array

In [22]:
# constructing DataFrame objects

In [23]:
# from a single Series object
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
CA,38332521
FL,19552860
IL,12882135
NY,19651127
TX,26448193


In [24]:
# from a list of dicts
data = [{'a': i, 'b': 2 * i}
        for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [25]:
# missing keys will be filled with NaN
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 1, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,1,4.0


In [26]:
# from a dictionary of Series objects
pd.DataFrame({'population': population,
              'area': area})

Unnamed: 0,area,population
CA,423967,38332521
FL,141297,19552860
IL,695662,12882135
NY,149995,19651127
TX,170312,26448193


In [27]:
# from a 2 dimensional NumPy array
import numpy as np
pd.DataFrame(np.random.rand(3,2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.875591,0.025442
b,0.124555,0.376687
c,0.601718,0.661202


In [28]:
# from a numpy structured array
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
print(A)
pd.DataFrame(A)

[(0,  0.) (0,  0.) (0,  0.)]


Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


# The Pandas Index Object

In [29]:
# index object can be thought of as either
# an immutable array or as an ordered set

ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [30]:
# index as immutable array
# we can use indexing notation to retrieve values/slices

print(ind[1])
print(ind[::2])

3
Int64Index([2, 5, 11], dtype='int64')


In [31]:
# index attributes
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [32]:
# index is immutable, can't be modified normally
# this will return error:
# ind[1] = 0

In [33]:
# index as ordered set

# Index object follows many conventions used by "set" data structure
# including unions, intersections, differences, and other computations

In [34]:
# intersection
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
indA & indB

Int64Index([3, 5, 7], dtype='int64')

In [35]:
# union
indA | indB

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [36]:
# symmetric difference
indA ^ indB

Int64Index([1, 2, 9, 11], dtype='int64')

# Data Indexing and Selection

In [37]:
# Data Selection in Series

In [38]:
# Series as dictionary

In [39]:
# like dict, series provides mapping from keys to values
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(data)
data['b']

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64


0.5

In [40]:
# we can use dictionary-like expressions
print('a' in data)
print(data.keys())
print(list(data.items()))

True
Index(['a', 'b', 'c', 'd'], dtype='object')
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]


In [41]:
# series can be modified with dict-like syntax
# extending series by assigning to new index value
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [42]:
# Series as a 1-dimensional array

In [43]:
# Series provides array-style item selection

In [44]:
# slicing by explicit index
data['a':'c']

# note that "c" is included when slicing by explicit index

a    0.25
b    0.50
c    0.75
dtype: float64

In [45]:
# slicing by implicit integer index
data[0:2]

# note that "c" is excluded when slicing by implicit index

a    0.25
b    0.50
dtype: float64

In [46]:
# masking
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [47]:
# fancy indexing
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

In [48]:
# Indexers: loc, iloc, and ix

# these can be confusing
# if Series has explicit integer index, data[1] will use explicit indices
# while a slicing operation like data[1:3] will use implicit Python-style index

# special attributes are provided for consistent indexing

In [49]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [50]:
# explicit index # is used when indexing
data[1]

'a'

In [51]:
# but implicit index (row #) is used when slicing
data[1:3]

3    b
5    c
dtype: object

In [52]:
# loc attribute
# allows indexing and slicing that always references the EXPLICIT index

print( data.loc[1] )
print( data.loc[1:3] )

a
1    a
3    b
dtype: object


In [53]:
# iloc attribute
# allows indexing and slicing that always references the IMPLICIT Python-style index

print( data.iloc[1] )
print( data.iloc[1:3] )

b
3    b
5    c
dtype: object


In [54]:
# ix is a hybrid of the two, will explain below

In [55]:
# in Python, "explicit is better than implicit"
# book recommends loc and iloc for clearer code and to prevent bugs

# Data Selection in DataFrame

In [56]:
# DF acts in ways like a 2d or structured array
# and in other ways like a dictionary of Series structures sharing the same index

In [57]:
# DataFrame as a Dictionary

In [58]:
area = pd.Series({'California': 423967, 'Texas': 695662,'New York': 141297, 'Florida': 170312, 'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,'New York': 19651127, 'Florida': 19552860, 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [59]:
# access a Series via dictionary-style indexing of column name:
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [60]:
# or use attribute-style access with column names (if the names are strings)
data.area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [61]:
# both methods are the same if column name is a string
print(data.area is data['area'])
# this doesn't always work, for example, "pop" is a method of dataframe
print(data.pop is data['pop'])
# use:
# data['pop'] = z
# not:
# data.pop = z

True
False


In [62]:
# adding column using dictionary-like syntax
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [63]:
# DataFrame as a 2-dimensional array

In [64]:
# we can examine underlying array using the values attribute
data.values

array([[  4.23967000e+05,   3.83325210e+07,   9.04139261e+01],
       [  1.70312000e+05,   1.95528600e+07,   1.14806121e+02],
       [  1.49995000e+05,   1.28821350e+07,   8.58837628e+01],
       [  1.41297000e+05,   1.96511270e+07,   1.39076746e+02],
       [  6.95662000e+05,   2.64481930e+07,   3.80187404e+01]])

In [65]:
# we can do array-like operations...
# Transposing rows and columns
data.T

Unnamed: 0,California,Florida,Illinois,New York,Texas
area,423967.0,170312.0,149995.0,141297.0,695662.0
pop,38332520.0,19552860.0,12882140.0,19651130.0,26448190.0
density,90.41393,114.8061,85.88376,139.0767,38.01874


In [66]:
# but we can't index it like a NumPy array
# passing a single index returns a row
data.values[0]

array([  4.23967000e+05,   3.83325210e+07,   9.04139261e+01])

In [67]:
# and passing an index accesses a column
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [68]:
# so we can instead use the Pandas indexers

In [69]:
# iloc
print(data.iloc[:3, :2])

              area       pop
California  423967  38332521
Florida     170312  19552860
Illinois    149995  12882135


In [70]:
# loc
print(data.loc[:'IL', :'pop'])

              area       pop
California  423967  38332521
Florida     170312  19552860


In [71]:
# ix allows a hybrid of loc and iloc
data.ix[:3, :'pop']

# but this is deprecated, so I guess not...

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  


Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [72]:
# we can combine masking and fancy indexing
data.loc[data.density > 100, ['pop', 'density']]

Unnamed: 0,pop,density
Florida,19552860,114.806121
New York,19651127,139.076746


In [73]:
# we can set or modify values
data.iloc[0, 2] = 90
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [74]:
# additional indexing conventions

In [75]:
# indexing refers to columns
# slicing refers to rows
data['Florida':'Illlinois']

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [76]:
# slices can also refer to rows by number
data[1:3]

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [77]:
# direct masking operations are interpreted row-wise instead of column-wise
data[data.density > 100]

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
New York,141297,19651127,139.076746


# Operating on Data in Pandas

In [87]:
# Ufuncs: index preservation
# any NumPy ufun will work on Pandas objects
# if we apply a ufunc, the result is another Pandas object with indices preserved

In [88]:
# ufunc operation on Series
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
print(ser)
np.exp(ser)

0    6
1    3
2    7
3    4
dtype: int32


0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [89]:
# ufunc operation on DataFrame
df = pd.DataFrame(rng.randint(0, 10, (3, 4)),
                  columns=['A', 'B', 'C', 'D'])
print(df)
np.sin(df * np.pi / 4)

   A  B  C  D
0  6  9  2  6
1  7  4  3  7
2  7  2  5  4


Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [90]:
# UFuncs: Index Alignment


In [95]:
# index alignment in Series
area = pd.Series({'Alaska': 1723337, 'Texas': 695662, 'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193, 'New York': 19651127}, name='population')

population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [100]:
# NaN added when both indeces are not present
# we can instead use a "fill value"
population.divide(area, fill_value=0)

# though this causes a divide by zerio situation for NY in this case

Alaska         0.000000
California    90.413926
New York            inf
Texas         38.018740
dtype: float64

In [102]:
# index alignment in DataFrame
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                 columns=list('AB'))
A

Unnamed: 0,A,B
0,0,11
1,11,16


In [107]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                 columns=list('CBA'))
B

Unnamed: 0,C,B,A
0,9,4,1
1,3,6,7
2,2,0,3


In [108]:
A + B

Unnamed: 0,A,B,C
0,1.0,15.0,
1,18.0,22.0,
2,,,


In [111]:
# indices are aligned and sorted regardless of the order of the two objects
# we can use fill values to avoid the NaN

# in this case, we'll substute the mean (of all values in A) for any missing values in A
fill = A.stack().mean()
print(fill)
A.add(B, fill_value=fill)

9.5


Unnamed: 0,A,B,C
0,1.0,15.0,18.5
1,18.0,22.0,12.5
2,12.5,9.5,11.5


In [112]:
# Python Operator Pandas Method(s)
# +               add()
# -               sub(), subtract()
# *               mul(), multiply()
# /               truediv(), div(), divide()
# //              floordiv()
# %               mod()
# **              pow()

In [113]:
# Ufuncs: Operations Between DataFrame and Series
# similar to operations between a 2d and 1d NumPy array

In [114]:
A = rng.randint(10, size=(3,4))
A

array([[1, 7, 3, 1],
       [5, 5, 9, 3],
       [5, 1, 9, 1]])

In [117]:
# subtract first row from all rows
A - A[0]
# in NumPy broadcasting, the subtraction is applied row-wise

array([[ 0,  0,  0,  0],
       [ 4, -2,  6,  2],
       [ 4, -6,  6,  0]])

In [116]:
# in Pandas, is also applied row-wise by default
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,4,-2,6,2
2,4,-6,6,0


In [119]:
# we can specify the axis to operate on columns instead
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,-6,0,-4,-6
1,0,0,4,-2
2,4,0,8,0


In [121]:
# like with Series, these operations will align indices
halfrow = df.iloc[0, ::2]
print(halfrow)
df - halfrow

Q    1
S    3
Name: 0, dtype: int32


Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,4.0,,6.0,
2,4.0,,6.0,


# Handling Missing Data